In [None]:
import numpy as np
import seaborn as sns
import pandas as pd 
import optuna
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor, Ridge, TweedieRegressor, PoissonRegressor, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVR

In [None]:
# Directory that holds the competition CSV files; change it here when running outside Kaggle.
DATA_DIR = '../input/tabular-playground-series-aug-2021'

# Labelled training rows, unlabelled test rows and the submission template.
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')
sample_submission = pd.read_csv(DATA_DIR + '/sample_submission.csv')

# Exploratory data analysis

In [None]:
# Horizontal bars comparing the number of rows in train and test; each bar is
# labelled with its share of the combined row count.
n_train, n_test = train.shape[0], test.shape[0]

f, ax = plt.subplots(figsize=(12, 2))
plt.xticks(size=14)
plt.yticks(size=14)
bar1 = ax.barh('train', n_train, color="indianred")
bar2 = ax.barh('test', n_test, color="green")
ax.set_title("Train and test datasets size comparison", fontsize=20, pad=5)
for bars, count in ((bar1, n_train), (bar2, n_test)):
    ax.bar_label(
        bars,
        ["{0:.2f}%".format((count/(n_train+n_test))*100)],
        label_type="center",
        fontsize=20,
        color="white",
        weight="bold",
    )
plt.show()

In [None]:
# Rich display of the first five training rows.
train.head(n=5)

In [None]:
# Plain-text overview of the training frame: size, first rows, dtypes,
# missing values (overall percentage and per column) and duplicated rows.
n_rows, n_cols = train.shape
na_per_column = train.isna().sum()

print(" Shape ".center(100,'*'))
print('Rows: {}'.format(n_rows))
print('Columns: {}'.format(n_cols))

print(" Head ".center(100,'*'))
print(train.head())

print(" Types ".center(100,'*'))
print(train.dtypes)

print(" Missing values ".center(100,'*'))
print("Missing values %:   {}%".format(na_per_column.sum()/(n_rows*n_cols)*100))
print(na_per_column)

print(' Duplicated'.center(100,'*'))
print(train.duplicated().sum())

In [None]:
# --- Descriptive statistics heatmap ---------------------------------------
# Summary statistics for every column except `id`, one row per column.
to_desc = train.drop(columns = ['id'])
desc = to_desc.describe().T.drop(columns = ['count'])

# Align the statistics with the column order of `to_desc`.  (The previous
# construction passed `list.remove(...)` as `columns=`, which is always None,
# so it only worked by accident; reindexing states the intent directly.)
desc_df = desc.reindex(index=to_desc.columns)

f,ax = plt.subplots(figsize=(10,50))
sns.heatmap(desc_df, annot=True,cmap = "coolwarm", fmt= '.0f',
            ax=ax,linewidths = 4, cbar = True,
            annot_kws={"size": 8})
ax.xaxis.tick_top()
plt.xticks(size = 14)
plt.yticks(size = 14, rotation = 0)
plt.title("Descriptive Statistics", size = 16)
plt.show()

# --- Train vs. test density per column ------------------------------------
# author Dmitry Uarov https://www.kaggle.com/dmitryuarov/tps-aug-2021-eda-cb-vs-xgb-vs-lgbm 
# Columns at positions 1..100 of each frame (presumably the feature columns
# that follow `id` - confirm against the CSV layout).  Sliced once instead of
# on every loop iteration.
train_kde_cols = train.columns.tolist()[1:101]
test_kde_cols = test.columns.tolist()[1:101]

# Set the style once, before any axes exist, so all panels share it.
sns.set_style("white")
fig = plt.figure(figsize = (20, 80))
fig.suptitle('Train & Test', fontsize=16 , y =1)
for i, (train_col, test_col) in enumerate(zip(train_kde_cols, test_kde_cols)):
    ax = plt.subplot(20,5,i+1)
    plt.title(train_col, size = 12, fontname = 'monospace')
    # `fill=True` is the current spelling of the deprecated `shade=True`.
    a = sns.kdeplot(train[train_col], color = '#16a5b8', fill = True, alpha = 0.5, linewidth = 0.3, edgecolor = 'black',ax=ax,label = 'train')
    sns.kdeplot(test[test_col], color = '#14a314', fill = True, alpha = 0.5, linewidth = 0.3, edgecolor = 'black',label = 'test',  ax = ax)
    plt.ylabel('')
    plt.xlabel('')
    if i == 0:
        # One shared legend for the whole figure, built from the first panel.
        fig.legend(prop={'size': 10})
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    for j in ['right', 'left', 'top']:
        a.spines[j].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad = 3)

plt.show()

In [None]:
# Correlation heatmaps are disabled: the annotated plots are too large to be readable.


# df_train = train.drop(columns = ['id','loss' ])
# corr_target_pearson = df_train.corr(method='pearson')
# corr_target_spearman = df_train.corr(method='spearman')

# fig = plt.figure(figsize = (25,21))
# sns.heatmap(corr_target_pearson, annot=True, cmap='YlGn',linewidth = 0.2, vmin=-1, vmax=+1, fmt = ".1f")
# plt.xticks(rotation=45)
# plt.yticks(rotation=0)
# plt.title('Pearson Correlation')
# plt.show()

# fig = plt.figure(figsize = (25,21))
# sns.heatmap(corr_target_spearman, annot=True, cmap='YlGn',linewidth = 0.2, vmin=-1, vmax=+1, fmt = ".1f")
# plt.xticks(rotation=45)
# plt.title('Spearman Correlation')
# plt.show()

## Target

In [None]:
# Density histogram of the target column `loss` with a KDE curve on top.
g = sns.displot(
    data=train,
    x='loss',
    stat='density',
    kde=True,
    color='forestgreen',
    aspect=3,
)
plt.show()

# Preprocessing

In [None]:
# Independent copy of the target column, so later edits to `train` do not affect it.
target = train['loss'].copy()
target

In [None]:
# Feature matrix for training: everything except the row id and the target.
df_train = train.drop(['id', 'loss'], axis=1)
df_train

In [None]:
# Feature matrix for the test set: everything except the row id.
df_test = test.drop(['id'], axis=1)
df_test

**Scalers on data**

In [None]:
# Names of the model input columns, in the order they appear in `df_train`.
features = list(df_train.columns)

In [None]:
# Standardise every feature; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

# `df_train` / `df_test` were created with `drop(...)` before this cell, so
# they are separate frames that still hold the unscaled values.  Refresh them
# from the scaled columns; otherwise the PCA / LDA / model cells, which all
# read `df_train`, would never see the scaling.
df_train = train[features].copy()
df_test = test[features].copy()

In [None]:
# Fit a PCA that keeps every component and plot how much variance the first
# k components explain together.
pca = PCA()
pca.fit(df_train.to_numpy())

pca_cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

plt.figure(figsize=(12, 8))
plt.plot(pca_cumulative_variance)
plt.title('Segmentation Dataset Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')  # running total over components
plt.show(block=True)

In [None]:
# Project the training features onto the first 42 principal components.
# Fitting and transforming use the same ndarray: fitting on an array and then
# transforming the DataFrame makes scikit-learn warn about mismatched feature
# names and needs a second pass over the data.  The seed keeps the result
# reproducible if scikit-learn picks its randomized SVD solver.
pca = PCA(n_components=42, random_state=18)
df_train_pca = pca.fit_transform(df_train.to_numpy())

**PCA is not working well here.**

In [None]:
# Fit an LDA on all training rows and plot the cumulative explained variance
# of its discriminant components.
# NOTE(review): LinearDiscriminantAnalysis is a classifier, so every distinct
# value of `target` is treated as a class label - confirm this is intended for
# the `loss` regression target.
lda = LinearDiscriminantAnalysis()
lda.fit(df_train.to_numpy(), target.to_numpy())

lda_cumulative_variance = np.cumsum(lda.explained_variance_ratio_)

plt.figure(figsize=(12, 8))
plt.plot(lda_cumulative_variance)
plt.title('Segmentation Dataset Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')  # running total over components
plt.show(block=True)

In [None]:
# Supervised projection of the training features onto 42 discriminant axes.
# Fitting and transforming use the same ndarray: fitting on an array and then
# transforming the DataFrame makes scikit-learn warn about mismatched feature
# names.
# NOTE(review): the projection is fitted with the targets of ALL training
# rows, including the rows the objective functions later hold out for
# validation, so the reported validation RMSE is likely optimistic.
lda = LinearDiscriminantAnalysis(n_components=42)
df_train_lda = lda.fit_transform(df_train.to_numpy(), target.to_numpy())

**RandomForest**

In [None]:
def objective(trial):
    """Optuna objective: hold-out RMSE of a random forest on the LDA features.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``rf_max_depth`` (int, 10..20) and ``n_estimators``
        (int, 40..150).

    Returns
    -------
    float
        Root-mean-squared error on the held-out part of a fixed split
        (``random_state=18``) of ``df_train_lda`` / ``target``.
    """
    rf_max_depth = trial.suggest_int('rf_max_depth', 10, 20)
    rf_n_estimators = trial.suggest_int('n_estimators', 40, 150)
    # Seeded like the split (and like Ridge / ElasticNet in this notebook) so
    # that score differences between trials come from the hyper-parameters,
    # not from the forest's own randomness.
    regressor_obj = RandomForestRegressor(max_depth=rf_max_depth, n_estimators = rf_n_estimators,
                                          n_jobs=-1, random_state=18)

    X_train, X_val, y_train, y_val = train_test_split(df_train_lda, target, random_state=18)

    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_val)

    error = np.sqrt(mean_squared_error(y_val, y_pred))

    return error  # An objective value linked with the Trial object.


In [None]:
# Optuna log excerpt, apparently from an earlier run of this cell:
# Trial 0 finished with value: 7.925259211426888 and parameters: {'rf_max_depth': 13, 'n_estimators': 97}. Best is trial 0 with value: 7.925259211426888.
OPTUNA_OPTIMIZATION = True

# Search for the random-forest settings with the lowest validation RMSE.
study = optuna.create_study(direction="minimize", study_name='RFR')
study.optimize(objective, n_trials=40)

best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))

**Random forest is very slow, even after dimensionality reduction.\
PCA < LDA**

**ExtraTrees (extremely randomized trees)**

In [None]:
def objective2(trial):
    """Optuna objective: hold-out RMSE of an extra-trees regressor on the LDA features.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``rf_max_depth`` (int, 2..20) and ``n_estimators``
        (int, 40..200).

    Returns
    -------
    float
        Root-mean-squared error on the held-out part of a fixed split
        (``random_state=18``) of ``df_train_lda`` / ``target``.
    """
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 20)
    rf_n_estimators = trial.suggest_int('n_estimators', 40, 200)
    # Seeded like the split (and like Ridge / ElasticNet in this notebook) so
    # that score differences between trials come from the hyper-parameters,
    # not from the ensemble's own randomness.
    regressor_obj = ExtraTreesRegressor(max_depth=rf_max_depth, n_estimators = rf_n_estimators,
                                        n_jobs=-1, random_state=18)

    X_train, X_val, y_train, y_val = train_test_split(df_train_lda, target, random_state=18)

    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_val)

    error = np.sqrt(mean_squared_error(y_val, y_pred))

    return error  # An objective value linked with the Trial object.

In [None]:
# Optuna log excerpts, apparently from earlier runs of this cell:
# Trial 0 finished with value: 7.924378637627466 and parameters: {'rf_max_depth': 17, 'n_estimators': 96}. Best is trial 0 with value: 7.924378637627466.
# Trial 12 finished with value: 7.92502305558478 and parameters: {'rf_max_depth': 15, 'n_estimators': 99}. Best is trial 11 with value: 7.923335363078516.
# Number of finished trials: 50
# Best trial: score 7.915917139032413, params {'rf_max_depth': 10, 'n_estimators': 149}
OPTUNA_OPTIMIZATION = True

# Search for the extra-trees settings with the lowest validation RMSE.
study = optuna.create_study(direction="minimize", study_name='ERFR')
study.optimize(objective2, n_trials=50)

best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))

**SGDRegressor**

In [None]:
def objective3(trial):
    """Optuna objective: hold-out RMSE of an SGD regressor on the LDA features.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``loss``, ``penalty``, ``alpha`` (log-uniform in
        [1e-4, 1e3]), ``learning_rate`` and ``eta0`` (int, 1..100).

    Returns
    -------
    float
        Root-mean-squared error on the held-out part of a fixed split
        (``random_state=18``) of ``df_train_lda`` / ``target``.

    Raises
    ------
    optuna.TrialPruned
        When the optimiser diverges (overflow during fitting or non-finite
        predictions), so one bad configuration does not abort the study.
    """
    # NOTE(review): "squared_loss" was renamed "squared_error" in
    # scikit-learn 1.0 and the old name was later removed - update this list
    # to match the installed version.
    sgd_loss = trial.suggest_categorical("loss", ["squared_loss", "huber","epsilon_insensitive","squared_epsilon_insensitive"])
    # Parameter name fixed from the misspelled "penatly".
    sgd_penalty = trial.suggest_categorical("penalty", ['l1', 'l2', 'elasticnet'] )
    # The range spans seven orders of magnitude; sampling on a log scale
    # keeps small regularisation strengths from being practically never tried.
    sgd_alpha = trial.suggest_float('alpha', 0.0001, 1000, log=True)
    sgd_learning_rate = trial.suggest_categorical("learning_rate", ['constant', 'optimal', 'invscaling', 'adaptive'] )
    # NOTE(review): initial step sizes of 1..100 are very large for SGD and
    # are likely to diverge - consider a much smaller, log-scaled range.
    sgd_eta0 = trial.suggest_int('eta0', 1, 100)
    regressor_obj = SGDRegressor(loss=sgd_loss, penalty = sgd_penalty,alpha=sgd_alpha,
                                 learning_rate=sgd_learning_rate,eta0=sgd_eta0, random_state=18)

    X_train, X_val, y_train, y_val = train_test_split(df_train_lda, target, random_state=18)

    try:
        regressor_obj.fit(X_train, y_train)
    except ValueError as exc:
        # Only divergence is treated as a prunable outcome; any other
        # ValueError (e.g. an invalid parameter) must stay visible.
        if "overflow" not in str(exc):
            raise
        raise optuna.TrialPruned() from exc

    y_pred = regressor_obj.predict(X_val)
    if not np.all(np.isfinite(y_pred)):
        # mean_squared_error would raise on NaN / inf predictions.
        raise optuna.TrialPruned()

    error = np.sqrt(mean_squared_error(y_val, y_pred))

    return error  # An objective value linked with the Trial object.

In [None]:
OPTUNA_OPTIMIZATION = True

# Search for the SGD settings with the lowest validation RMSE.
study = optuna.create_study(direction="minimize", study_name='SGD')
study.optimize(objective3, n_trials=100)

best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))

**Ridge**

In [None]:
def objective4(trial):
    """Optuna objective: hold-out RMSE of a Ridge regressor on the LDA features.

    Draws ``alpha`` from [1, 10], fits on the training part of a fixed split
    (``random_state=18``) of ``df_train_lda`` / ``target`` and returns the
    root-mean-squared error on the held-out part.
    """
    model = Ridge(alpha=trial.suggest_float('alpha', 1, 10), random_state=18)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        df_train_lda, target, random_state=18
    )

    model.fit(X_fit, y_fit)
    predictions = model.predict(X_holdout)

    # RMSE is the value the study minimises.
    return np.sqrt(mean_squared_error(y_holdout, predictions))

In [None]:
OPTUNA_OPTIMIZATION = True

# Search for the Ridge regularisation strength with the lowest validation RMSE.
study = optuna.create_study(direction="minimize", study_name='ridge')
study.optimize(objective4, n_trials=50)

best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))

**TweedieRegressor**

In [None]:
def objective5(trial):
    """Optuna objective: hold-out RMSE of a TweedieRegressor on the LDA features.

    Draws ``alpha`` from [1, 10], ``power`` from [1, 2] and ``max_iter`` from
    100..1000, fits on the training part of a fixed split
    (``random_state=18``) of ``df_train_lda`` / ``target`` and returns the
    root-mean-squared error on the held-out part.
    """
    # Suggested in this order so the trial records the parameters exactly as before.
    params = {
        'alpha': trial.suggest_float('alpha', 1, 10),
        'power': trial.suggest_float('power', 1, 2),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
    }
    model = TweedieRegressor(**params)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        df_train_lda, target, random_state=18
    )

    model.fit(X_fit, y_fit)
    predictions = model.predict(X_holdout)

    # RMSE is the value the study minimises.
    return np.sqrt(mean_squared_error(y_holdout, predictions))

In [None]:
OPTUNA_OPTIMIZATION = True

# Search for the Tweedie settings with the lowest validation RMSE.
study = optuna.create_study(direction="minimize", study_name='TweedieR')
study.optimize(objective5, n_trials=50)

best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))

**PoissonRegressor**

In [None]:
def objective6(trial):
    """Optuna objective: hold-out RMSE of a PoissonRegressor on the LDA features.

    Draws ``alpha`` from [0.01, 10] and ``max_iter`` from 100..1000, fits on
    the training part of a fixed split (``random_state=18``) of
    ``df_train_lda`` / ``target`` and returns the root-mean-squared error on
    the held-out part.
    """
    # Suggested in this order so the trial records the parameters exactly as before.
    params = {
        'alpha': trial.suggest_float('alpha', 0.01, 10),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
    }
    model = PoissonRegressor(**params)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        df_train_lda, target, random_state=18
    )

    model.fit(X_fit, y_fit)
    predictions = model.predict(X_holdout)

    # RMSE is the value the study minimises.
    return np.sqrt(mean_squared_error(y_holdout, predictions))

In [None]:
OPTUNA_OPTIMIZATION = True

# Search for the Poisson-regression settings with the lowest validation RMSE.
study = optuna.create_study(direction="minimize", study_name='poiss/gamma')
study.optimize(objective6, n_trials=50)

best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))

**ElasticNet**

In [None]:
def objective7(trial):
    """Optuna objective: hold-out RMSE of an ElasticNet on the LDA features.

    Draws ``alpha`` from [1, 100], ``l1_ratio`` from [0, 1] and ``max_iter``
    from 100..1000, fits on the training part of a fixed split
    (``random_state=18``) of ``df_train_lda`` / ``target`` and returns the
    root-mean-squared error on the held-out part.
    """
    # Suggested in this order so the trial records the parameters exactly as before.
    params = {
        'alpha': trial.suggest_float('alpha', 1, 100),
        'l1_ratio': trial.suggest_float('l1_ratio', 0, 1),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
    }
    model = ElasticNet(random_state=18, **params)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        df_train_lda, target, random_state=18
    )

    model.fit(X_fit, y_fit)
    predictions = model.predict(X_holdout)

    # RMSE is the value the study minimises.
    return np.sqrt(mean_squared_error(y_holdout, predictions))

In [None]:
OPTUNA_OPTIMIZATION = True

# Search for the ElasticNet settings with the lowest validation RMSE.
study = optuna.create_study(direction="minimize", study_name='Net')
study.optimize(objective7, n_trials=50)

best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))