In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
%matplotlib inline

In [None]:
# Load the training CSV, discard its "id" column, and preview the first rows
train_ds = pd.read_csv("data/Tabular Playground Series - Feb 2021/train.csv").drop(
    columns=["id"]
)
train_ds.head()

In [None]:
# Get table info: print the DataFrame.info() summary of train_ds
train_ds.info()

In [None]:
# Get count of data by column (DataFrame.count reports the non-null entries per column)
train_ds.count()

In [None]:
# Check for null values: total number of missing entries in each column
train_ds.isnull().sum()

In [None]:
# View target histogram (30 bins)
target_hist_ax = train_ds["target"].plot.hist(bins=30)
target_hist_ax.set_title("Target - Histogram")
plt.show()

In [None]:
# Boxplot of the target column for data visualization
target_box_ax = train_ds["target"].plot.box()
target_box_ax.set_title("Target - Boxplot")
plt.show()

In [None]:
# Check for outliers: count the rows whose target is below 5 or above 10
target_col = train_ds["target"]
outliers = train_ds[target_col.lt(5) | target_col.gt(10)]
print(len(outliers))

In [None]:
# Remove outliers: keep exactly the rows the outlier check does not flag,
# i.e. 5 <= target <= 10. The bounds are inclusive so that a row sitting
# exactly on 5 or 10 (never counted as an outlier) is not silently dropped.
train_ds = train_ds[train_ds["target"].between(5, 10)]
len(train_ds)

In [None]:
# Recheck outliers: redraw the target boxplot on the filtered table
target_box_ax = train_ds["target"].plot.box()
target_box_ax.set_title("Target - Boxplot")
plt.show()

In [None]:
# Preview the first rows of train_ds
train_ds.head()

In [None]:
# Names of the float-valued columns, cont0 through cont13. The string-valued
# columns are left out so that these can be plotted as histograms.
feature_list_num = [f"cont{idx}" for idx in range(14)]
feature_list_num

In [None]:
# Histogram for each numeric feature, laid out on a 5x3 grid

fig, axs = plt.subplots(5, 3)
fig.set_size_inches(8, 8)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# axs.flat walks the grid row by row, so feature k lands on the k-th panel
# (0x0, 0x1, 0x2, 1x0, ...); zip stops once the feature list is exhausted.
for ax, feature in zip(axs.flat, feature_list_num):
    ax.hist(train_ds[feature], bins=100)
    ax.set_title(f"{feature}")

# Hide x labels and tick labels for top plots and y ticks for right plots.
for ax in axs.flat:
    ax.label_outer()

plt.suptitle("Histograms of Numerical Features", size=14)

In [None]:
# Correlation Matrix
# Only the numeric columns are correlated: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (presumably the cat* columns hold
# strings — verify). select_dtypes works on older pandas versions as well.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(
    train_ds.select_dtypes(include="number").corr(),
    annot=True,
    cmap="RdYlGn",
    linewidths=0.2,
    ax=ax,
)
# plt.show() rather than fig.show(): the latter only emits a warning when a
# non-GUI backend (such as the inline one) is active.
plt.show()

In [None]:
# Names of the categorical columns, cat0 through cat9
feature_list_cat = [f"cat{idx}" for idx in range(10)]
feature_list_cat

In [None]:
# Count plot of the first categorical feature
sns.countplot(data=train_ds, x=feature_list_cat[0])
plt.show()

In [None]:
# Count plot for every categorical feature on a 4x3 grid
fig, axs = plt.subplots(4, 3)
fig.set_size_inches(12, 9)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.subplots_adjust(wspace=0.4, hspace=0.5)

# One panel per feature, filled row by row (axs.flat iterates in row-major order).
# Each panel is titled with the feature name and has its axis labels removed.
for ax, feature in zip(axs.flat, feature_list_cat):
    sns.countplot(data=train_ds, x=feature, ax=ax)
    ax.set_title(feature)
    ax.set(xlabel=None, ylabel=None)

# The grid has more panels than features; hide the leftover empty axes.
for ax in axs.ravel()[len(feature_list_cat):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Label Encoding to Categorical Columns
# encoder = OrdinalEncoder()
# train_ds["cat0"] = encoder.fit_transform(train_ds["cat0"].to_numpy().reshape(-1, 1))
# train_ds["cat1"] = encoder.fit_transform(train_ds["cat1"].to_numpy().reshape(-1, 1))
# train_ds["cat2"] = encoder.fit_transform(train_ds["cat2"].to_numpy().reshape(-1, 1))
# train_ds["cat3"] = encoder.fit_transform(train_ds["cat3"].to_numpy().reshape(-1, 1))
# train_ds["cat4"] = encoder.fit_transform(train_ds["cat4"].to_numpy().reshape(-1, 1))
# train_ds["cat5"] = encoder.fit_transform(train_ds["cat5"].to_numpy().reshape(-1, 1))
# train_ds["cat6"] = encoder.fit_transform(train_ds["cat6"].to_numpy().reshape(-1, 1))
# train_ds["cat7"] = encoder.fit_transform(train_ds["cat7"].to_numpy().reshape(-1, 1))
# train_ds["cat8"] = encoder.fit_transform(train_ds["cat8"].to_numpy().reshape(-1, 1))
# train_ds["cat9"] = encoder.fit_transform(train_ds["cat9"].to_numpy().reshape(-1, 1))

In [None]:
# Inspect the first 100 rows of train_ds
train_ds.head(100)

In [None]:
# Scatterplot Feature vs Target, for the first numeric feature only
sns.scatterplot(data=train_ds, x=feature_list_num[0], y="target")

In [None]:
# Scatter plot of every numeric feature against the target on a 5x3 grid
fig, axs = plt.subplots(5, 3)
fig.set_size_inches(12, 9)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.subplots_adjust(wspace=0.4, hspace=0.5)

# One panel per feature, filled row by row (axs.flat iterates in row-major order).
# A very low alpha keeps dense regions readable; the x label is dropped because
# the panel title already names the feature.
for ax, feature in zip(axs.flat, feature_list_num):
    sns.scatterplot(data=train_ds, x=feature, y="target", ax=ax, alpha=0.01)
    ax.set_title(feature)
    ax.set(xlabel=None)

# The grid has more panels than features; hide the leftover empty axes.
for ax in axs.ravel()[len(feature_list_num):]:
    ax.set_visible(False)

plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the predictors from the target; the target stays a one-column DataFrame
target_feature = train_ds.loc[:, ["target"]]
all_feature = train_ds.drop(columns="target")

In [None]:
# Preview the first rows of the feature table
all_feature.head()

In [None]:
# Show the container type of the feature table
type(all_feature)

In [None]:
# Preview the first rows of the target table
target_feature.head()

In [None]:
# Show the container type of the target table
type(target_feature)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    all_feature, target_feature, test_size=0.3, random_state=42
)

In [None]:
# Inspect the first 50 rows of the training features
x_train.head(50)

In [None]:
# Number of rows in the training features
print(len(x_train))

In [None]:
# Preview the first rows of the test features
x_test.head()

In [None]:
# Number of rows in the test features
print(len(x_test))

In [None]:
# Preview the first rows of the training targets
y_train.head()

In [None]:
# Number of rows in the training targets
print(len(y_train))

In [None]:
# Number of rows in the test targets
print(len(y_test))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [None]:
# Pipeline
#   1. standardize the numeric columns and ordinal-encode the categorical ones
#   2. run a randomized hyper-parameter search over a RandomForestRegressor

# Candidate values sampled by the randomized search
grid_param = dict(
    n_estimators=[200, 400, 600, 1000, 1500],
    max_depth=[3, 5, 10, 50, 100, None],
    min_samples_leaf=[1, 2, 4, 6, 10],
    min_samples_split=[2, 5, 10, 20],
)

# Column-wise preprocessing: scaler on the numeric names, encoder on the categorical names
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = OrdinalEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, feature_list_num),
        ("cat", categorical_transformer, feature_list_cat),
    ]
)

# Base estimator: bootstrap sampling with max_samples=0.01 (a fraction of the rows per tree)
regressor = RandomForestRegressor(bootstrap=True, max_samples=0.01)

# 100 sampled parameter combinations, 3-fold CV each, refit on the best one
regressor_cv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=grid_param,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=10,
    refit=True,
)

# Full prediction pipeline: preprocessing followed by the search object
model = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", regressor_cv)])

# Fit on the training split; the target is flattened to a 1-D array
model.fit(x_train, np.ravel(y_train))

In [None]:
y_train_pred = model.predict(x_train)

# Model validation on the training split
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases and no
# longer accepted by the newest ones, while this form works everywhere.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

print(
    f"explained_variance_score: {round(explained_variance_score(y_train, y_train_pred), 4)}"
)
print(f"r2_score: {round(r2_score(y_train, y_train_pred), 4)}")
print(f"mean_absolute_error: {round(mean_absolute_error(y_train, y_train_pred), 4)}")
print(f"root_mean_squared_error: {round(train_rmse, 4)}")

In [None]:
# Find best parameters: read best_params_ from the "regressor" step of the fitted pipeline
model["regressor"].best_params_

In [None]:
# Fit one forest with fixed hyper-parameters (no search) behind the same preprocessor
regressor_opt = RandomForestRegressor(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=1000,
    verbose=10,
)

model = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor_opt", regressor_opt)]
)

# The target is flattened to a 1-D array before fitting
model.fit(x_train, np.ravel(y_train))

In [None]:
y_train_pred = model.predict(x_train)

# Model validation on the training split
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases and no
# longer accepted by the newest ones, while this form works everywhere.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

print(
    f"explained_variance_score: {round(explained_variance_score(y_train, y_train_pred), 4)}"
)
print(f"r2_score: {round(r2_score(y_train, y_train_pred), 4)}")
print(f"mean_absolute_error: {round(mean_absolute_error(y_train, y_train_pred), 4)}")
print(f"root_mean_squared_error: {round(train_rmse, 4)}")

In [None]:
# Predict on the held-out test features and display the predictions
y_test_pred = model.predict(x_test)
y_test_pred

In [None]:
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Report the held-out metrics, each rounded to four decimals
test_mae = round(mean_absolute_error(y_test, y_test_pred), 4)
print(f"mean_absolute_error :{test_mae}")
test_evs = round(explained_variance_score(y_test, y_test_pred), 4)
print(f"explained_variance_score :{test_evs}")
test_r2 = round(r2_score(y_test, y_test_pred), 4)
print(f"r2_score :{test_r2}")
test_mse = round(mean_squared_error(y_test, y_test_pred), 4)
print(f"mean_squared_error :{test_mse}")

In [None]:
# Load the submission features once instead of parsing the same CSV twice.
#   x_submit  - the file exactly as read (still contains "id")
#   output_id - the "id" column
#   x_output  - a separate frame without "id" (drop returns a new DataFrame,
#               so later changes to x_output do not touch x_submit)
x_submit = pd.read_csv("data/Tabular Playground Series - Feb 2021/test.csv")
output_id = x_submit["id"]
x_output = x_submit.drop("id", axis=1)

In [None]:
# Preview the first rows of the submission features
x_output.head()

In [None]:
# Display the ids of the submission rows
output_id

In [None]:
import datetime

now = datetime.datetime.now()

# Predict for the submission rows, then append "id" and "target" to x_output.
# NOTE(review): x_output is extended in place, so the exported CSV holds every
# feature column plus "id" and "target" — confirm that is the intended layout.
y_output = model.predict(x_output)
x_output["id"] = output_id
x_output["target"] = y_output

# Timestamp used in the file name (fields are not zero-padded)
date_time = f"date_{now.year}-{now.month}-{now.day}_time_{now.hour}-{now.minute}"

x_output.to_csv(
    f"data/Tabular Playground Series - Feb 2021/{date_time}_test.csv", index=False
)

### 3. Extremely Randomized Decision Trees

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [None]:
# Extra-trees pipeline: standardize the numeric columns, ordinal-encode the
# categorical ones, then run a randomized hyper-parameter search.
scaler = StandardScaler()

ordinal = OrdinalEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("scaler", scaler, feature_list_num),
        ("ordinal", ordinal, feature_list_cat),
    ]
)

regressor = ExtraTreesRegressor()

# Candidate values sampled by the randomized search.
# min_samples_leaf must be a positive int or a fraction; None is not an
# accepted value and made every candidate that drew it fail, so the
# estimator's default of 1 is listed instead.
param_grid = {
    "n_estimators": [200, 400, 600, 1000],
    "max_features": [2, 5, 10, 15, 20, None],
    "max_depth": [5, 10, 50, 100, None],
    "min_samples_leaf": [0.01, 0.05, 0.1, 1],
    "min_samples_split": [2, 5, 10, 20],
}

# 100 sampled parameter combinations, 3-fold CV each, refit on the best one
random_cv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=-1,
    cv=3,
    verbose=10,
    refit=True,
)

model = Pipeline(steps=[("preprocessor", preprocessor), ("random_cv", random_cv)])

# The target is flattened to a 1-D array before fitting
model.fit(x_train, np.array(y_train).ravel())

In [None]:
# Best parameter combination found by the "random_cv" step of the fitted pipeline
model["random_cv"].best_params_

In [None]:
# Score on Training Data (the pipeline's score() evaluated on x_train / y_train)
model.score(x_train, y_train)

In [None]:
y_test_pred = model.predict(x_test)

from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Report the held-out metrics, each rounded to four decimals
test_evs = round(explained_variance_score(y_test, y_test_pred), 4)
print(f"explained_variance_score : {test_evs}")
test_r2 = round(r2_score(y_test, y_test_pred), 4)
print(f"r2_score : {test_r2}")
test_mae = round(mean_absolute_error(y_test, y_test_pred), 4)
print(f"mean_absolute_error : {test_mae}")
test_mse = round(mean_squared_error(y_test, y_test_pred), 4)
print(f"mean_squared_error : {test_mse}")

In [None]:
import datetime

now = datetime.datetime.now()

# Predict for the submission rows, then write "id" and "target" into x_output.
# NOTE(review): x_output is modified in place, so the exported CSV holds every
# column of x_output, not just "id" and "target" — confirm that is intended.
y_output = model.predict(x_output)
x_output["id"] = output_id
x_output["target"] = y_output

# Timestamp used in the file name (fields are not zero-padded)
date_time = f"date_{now.year}-{now.month}-{now.day}_time_{now.hour}-{now.minute}"

x_output.to_csv(
    f"data/Tabular Playground Series - Feb 2021/extra-tree-regressor_{date_time}_test.csv",
    index=False,
)

In [None]:
# Inspect the first 50 rows of x_output
x_output.head(50)

### XGBoost (eXtreme Gradient Boosting)

In [None]:
# Preprocessing for the XGBoost section: standardize the numeric columns and
# ordinal-encode the categorical ones.
scalar = StandardScaler()

ordinal = OrdinalEncoder()

preprocessing = ColumnTransformer(
    transformers=[
        ("scalar", scalar, feature_list_num),
        ("ordinal", ordinal, feature_list_cat),
    ]
)

# NOTE(review): the transformer is fitted on every row of all_feature, so any
# hold-out split taken from x_preprocessed has already contributed to the
# scaling statistics — confirm this is acceptable.
x_preprocessed = preprocessing.fit_transform(all_feature)

In [None]:
# Display the preprocessed feature matrix
x_preprocessed

In [None]:
from sklearn.model_selection import train_test_split

# Split the preprocessed matrix 70/30 with random_state=0. This rebinds
# x_train, x_test, y_train and y_test to the results of this new split.
x_train, x_test, y_train, y_test = train_test_split(
    x_preprocessed, target_feature, test_size=0.3, random_state=0
)

In [None]:
# Display the four split objects as a tuple
x_train, x_test, y_train, y_test

In [None]:
import xgboost as xgb

# convert to DMatrix format, attaching the targets as labels
d_train_xgb = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
d_test_xgb = xgb.DMatrix(x_test, label=y_test, enable_categorical=True)

In [None]:
# Display the two DMatrix objects
d_train_xgb, d_test_xgb

In [None]:
# baseline model

params = dict(
    # Parameters that we are going to tune.
    max_depth=6,
    min_child_weight=1,
    eta=0.3,
    subsample=1,
    colsample_bytree=1,
    eval_metric="rmse",
    # Other parameters
    objective="reg:squarederror",
)

num_boost_round = 300

# Train with early stopping: stop once the "Test" RMSE has not improved for 10 rounds
model = xgb.train(
    params=params,
    dtrain=d_train_xgb,
    num_boost_round=num_boost_round,
    evals=[(d_test_xgb, "Test")],
    early_stopping_rounds=10,
)

In [None]:
# 5-fold cross-validation of the current params on the training matrix,
# tracking RMSE with early stopping after 10 rounds without improvement
cv_results = xgb.cv(
    params=params,
    dtrain=d_train_xgb,
    num_boost_round=num_boost_round,
    nfold=5,
    metrics={"rmse"},
    early_stopping_rounds=10,
    seed=42,
)
cv_results

In [None]:
# Plot Curve
import seaborn as sns

# Keep only the rounds whose mean training RMSE is below 1, then draw the
# train and test curves against the round index
temp = cv_results.loc[cv_results["train-rmse-mean"] < 1]
rounds = temp.index
sns.lineplot(data=temp, x=rounds, y="train-rmse-mean", label="train error")
sns.lineplot(data=temp, x=rounds, y="test-rmse-mean", label="test error")

### Find best parameters for XGBoost

In [None]:
# Candidate (max_depth, min_child_weight) pairs: every combination of 2..8 with 2..8
grid_search_params = [
    (max_depth, min_child_weight)
    for max_depth in range(2, 9)
    for min_child_weight in range(2, 9)
]

# Track the lowest cross-validated RMSE seen so far and the pair that produced it
best_params = None
min_rmse = float("Inf")
for max_depth, min_child_weight in grid_search_params:
    print(f"CV with max_depth={max_depth}, min_child_weight={min_child_weight}")

    # Write the candidate pair into the shared params dict
    params.update(max_depth=max_depth, min_child_weight=min_child_weight)

    # 3-fold CV with early stopping
    cv_results = xgb.cv(
        params=params,
        dtrain=d_train_xgb,
        num_boost_round=num_boost_round,
        nfold=3,
        metrics={"rmse"},
        early_stopping_rounds=10,
        seed=42,
    )

    # Lowest mean test RMSE of this run and the row position where it occurs
    test_rmse_curve = cv_results["test-rmse-mean"]
    mean_rmse = test_rmse_curve.min()
    boost_rounds = test_rmse_curve.argmin()
    print(f"\tRMSE {mean_rmse} for {boost_rounds} rounds")

    # Update best RMSE
    if mean_rmse < min_rmse:
        min_rmse, best_params = mean_rmse, (max_depth, min_child_weight)

print(f"Best params: {best_params[0]}, {best_params[1]}, RMSE: {min_rmse}")

In [None]:
# Add best parameters (fixed values written into the shared params dict)
params.update(max_depth=2, min_child_weight=8)

In [None]:
# Find best subsample and colsample
# Candidate pairs: every combination of 0.7, 0.8, 0.9, 1.0 for both settings
sampling_levels = [i / 10.0 for i in range(7, 11)]
grid_search_params = [
    (subsample, colsample)
    for subsample in sampling_levels
    for colsample in sampling_levels
]

best_params = None
min_rmse = float("Inf")

# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(grid_search_params):
    print(f"CV with subsample={subsample}, colsample={colsample}")

    # Write the candidate pair into the shared params dict
    params.update(subsample=subsample, colsample_bytree=colsample)

    # 3-fold CV with early stopping
    cv_results = xgb.cv(
        params=params,
        dtrain=d_train_xgb,
        num_boost_round=num_boost_round,
        nfold=3,
        metrics={"rmse"},
        early_stopping_rounds=10,
        seed=42,
    )

    # Lowest mean test RMSE of this run and the row position where it occurs
    test_rmse_curve = cv_results["test-rmse-mean"]
    mean_rmse = test_rmse_curve.min()
    boost_rounds = test_rmse_curve.argmin()

    print(f"\tRMSE {mean_rmse} for {boost_rounds} rounds")

    # Update best score
    if mean_rmse < min_rmse:
        min_rmse, best_params = mean_rmse, (subsample, colsample)

print(f"Best params: {best_params[0]}, {best_params[1]}, RMSE: {min_rmse}")

In [None]:
# update parameter (fixed sampling values written into the shared params dict)
params.update(subsample=1.0, colsample_bytree=0.7)

In [None]:
# Find best learning rate eta

%time

min_rmse= float("Inf")
best_params = None
for eta in [1, .5, .3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    
    # We update our parameters
    params['eta'] = eta
    
    # Run and time CV
    %time cv_results = xgb.cv(params, d_train_xgb, num_boost_round=num_boost_round, seed=42, nfold=3, metrics=['rmse'],early_stopping_rounds=10)
    
    # Update best score
    mean_rsme = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
        
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))

In [None]:
# Use the learning rate selected by the eta search (held in best_params)
# instead of a hard-coded value, so the final model follows the search result.
# NOTE(review): relies on the eta search cell being the last one to set best_params.
params["eta"] = best_params

In [None]:
# final parameter: display the params dict as it stands after tuning
params

In [None]:
# Train the final booster with the current params; early stopping watches the
# "Test" set and halts after 10 rounds without improvement
final_model = xgb.train(
    params=params,
    dtrain=d_train_xgb,
    num_boost_round=num_boost_round,
    evals=[(d_test_xgb, "Test")],
    early_stopping_rounds=10,
)

# best_iteration is zero-based, hence the +1 when reporting a round count
best_round_count = final_model.best_iteration + 1
print(f"Best RMSE: {final_model.best_score:.2f} in {best_round_count} rounds")

In [None]:
# prediction with final_model
# Encode the submission rows with `preprocessing`, the transformer that
# produced x_preprocessed for the XGBoost training matrices. `preprocessor`
# belongs to the scikit-learn pipelines and was fitted separately, so using it
# here would feed the booster features scaled differently from its training data.
x_output_pre = preprocessing.transform(x_submit)

y_submit = final_model.predict(xgb.DMatrix(x_output_pre))
y_submit

In [None]:
# export prediction
import datetime

# Two-column frame: ids re-read from the test CSV, predictions from y_submit
export_df = pd.DataFrame(
    {
        "id": pd.read_csv("data/Tabular Playground Series - Feb 2021/test.csv")["id"],
        "target": y_submit,
    }
)

# Timestamp used in the file name (fields are not zero-padded)
now = datetime.datetime.now()
name_add = f"date_{now.year}-{now.month}-{now.day}_time_{now.hour}-{now.minute}"

export_df.to_csv(
    f"data/Tabular Playground Series - Feb 2021/xgboost_tuned_{name_add}.csv",
    index=False,
)

### Bagging and other bootstrapping models (simple)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Standardize the numeric columns
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Ordinal encoding of the categorical columns (OrdinalEncoder, not one-hot)
categorical_transformer = OrdinalEncoder()

# Combine as Pre-processor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, feature_list_num),
        ("cat", categorical_transformer, feature_list_cat),
    ]
)

# Fit on the full feature table, then apply the same fitted transform to the
# submission rows.
# NOTE(review): fitting on all of all_feature means rows later held out for
# testing also shape the scaling statistics — confirm this is acceptable.
x_pre = preprocessor.fit_transform(all_feature)
x_output_pre = preprocessor.transform(x_submit)

In [None]:
# Train-test split: 60/40 on the preprocessed matrix, shuffled with a fixed seed
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_pre, target_feature, test_size=0.4, shuffle=True, random_state=0
)

# Flatten both target sets to 1-D arrays
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

In [None]:
# Shape (rows, columns) of the training matrix
print(x_train.shape)

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Bagging: 500 decision trees, each fitted on a sample drawn with replacement
# (bootstrap=True) sized at max_samples=0.3 of the training rows

bagging_reg = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=500,
    bootstrap=True,
    max_samples=0.3,
    # bootstrap_features = True, max_features=1.0,
    n_jobs=-1,
    verbose=3,
)

bagging_reg.fit(x_train, y_train)
y_pred = bagging_reg.predict(x_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases and no
# longer accepted by the newest ones, while this form works everywhere.
print("RMSE: " + str(np.sqrt(mean_squared_error(y_test, y_pred))))

In [None]:
import datetime

# Predict for the preprocessed submission rows
y_submit = bagging_reg.predict(x_output_pre)

# Two-column frame: ids re-read from the test CSV, predictions from y_submit
export_df = pd.DataFrame(
    {
        "id": pd.read_csv("data/Tabular Playground Series - Feb 2021/test.csv")["id"],
        "target": y_submit,
    }
)

# Timestamp used in the file name (fields are not zero-padded)
now = datetime.datetime.now()
name_add = f"date_{now.year}-{now.month}-{now.day}_time_{now.hour}-{now.minute}"

export_df.to_csv(
    f"data/Tabular Playground Series - Feb 2021/bagging_simple_{name_add}.csv",
    index=False,
)

In [None]:
# Pasting: same ensemble as bagging, but each tree's sample is drawn without
# replacement (bootstrap=False), sized at max_samples=0.3 of the training rows

pasting_reg = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=500,
    bootstrap=False,
    max_samples=0.3,
    # bootstrap_features = True, max_features=1.0,
    n_jobs=-1,
    verbose=3,
)

pasting_reg.fit(x_train, y_train)
y_pred = pasting_reg.predict(x_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases and no
# longer accepted by the newest ones, while this form works everywhere.
print("RMSE: " + str(np.sqrt(mean_squared_error(y_test, y_pred))))