# Tabular playground - February

# Optimization of hyperparameters

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, entry) for entry in files):
        print(file_path)
        
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
        
# Directory the competition CSV files (train, test, sample submission) are read from.
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

## Read in the data files


In [None]:
def _load_and_preview(filename, title):
    """Read one competition CSV indexed by `id`, print its title and show its head."""
    frame = pd.read_csv(input_path / filename, index_col='id')
    print(title)
    display(frame.head())
    return frame


train = _load_and_preview('train.csv', "Train dataset")
test = _load_and_preview('test.csv', "Test dataset")
submission = _load_and_preview('sample_submission.csv', "Sample submission")

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

## Visualizations

In [None]:

# One 100-bin histogram per numeric column of the training frame.
train.hist(bins=100,figsize=(20,15))


In [None]:
# Partition the training columns by dtype: object columns are treated as
# categorical features, float64 columns as numerical ones.
# NOTE(review): `target` is still a column of `train` here, so if it is
# float64 it ends up in `numerical` too -- confirm that is intended.
column_dtypes = train.dtypes

categorical = [name for name, dtype in column_dtypes.items() if dtype == 'object']
print("Categorical features: ", categorical)

numerical = [name for name, dtype in column_dtypes.items() if dtype == 'float64']
print("Numerical features: ", numerical)

In [None]:

# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
# The categorical columns still hold strings at this point (they are only
# label-encoded further down), and since pandas 2.0 DataFrame.corr() raises on
# non-numeric columns instead of silently dropping them, so restrict the frame
# to numeric dtypes explicitly.
corr_matrix = train.select_dtypes(include='number').corr()
corr = corr_matrix
plt.figure(figsize=(16, 10))
# Fix the colour scale to the full [-1, 1] correlation range so colours are
# comparable between runs.
sns.heatmap(corr, annot=True, vmin=-1, vmax=1, cmap='coolwarm')

There is very little correlation with the target.

## Handling outliers

In [None]:
# Plot the training columns against the row index to eyeball extreme values
# before any outlier handling.
train.plot(figsize = (12,8))
plt.show()

In [None]:
# Box plot of the training columns before the outliers are clipped.
train.boxplot(figsize = (12,8))
plt.show()

In [None]:

# Clip the numerical columns of the training frame: anything below the 0.01%
# quantile or above the 99.99% quantile of its own column is replaced by that
# quantile.
numeric_values = train[numerical]
lower_bounds = numeric_values.quantile(0.0001)
upper_bounds = numeric_values.quantile(0.9999)
train[numerical] = numeric_values.clip(lower=lower_bounds, upper=upper_bounds, axis=1)


In [None]:
# Box plot of the training columns after clipping, for comparison with the
# box plot drawn before it.
train.boxplot(figsize = (12,8))
plt.show()

In [None]:
# Same plot as before the clipping step, to check its effect.
train.plot(figsize = (12,8))
plt.show()


## Dealing with categorical features


In [None]:
# Visualization of categorical features: one subplot row per feature, with
# the category counts (log-scaled y axis) on the left and the distribution of
# the target per category on the right.
n_features = len(categorical)
if n_features:
    # Size the grid from the data instead of hard-coding 10 rows;
    # squeeze=False keeps `ax` two-dimensional even for a single feature.
    fig, ax = plt.subplots(n_features, 2, figsize=(15, 2.5 * n_features), squeeze=False)
    # Iterate over the rows of the axes grid so each feature is paired with
    # the two axes it is actually drawn on. (Zipping with ax.flatten() paired
    # feature i with the i-th axis in row-major order, so the tick rotation
    # below landed on the wrong subplot for every feature but the first.)
    for variable, (count_ax, box_ax) in zip(categorical, ax):
        sns.countplot(x=train[variable], ax=count_ax)
        count_ax.set_yscale("log")
        for label in count_ax.get_xticklabels():
            label.set_rotation(90)
        sns.boxplot(x=variable, y='target', data=train, ax=box_ax)
        box_ax.set_ylim([4, 9])

In [None]:
# Label-encode every object-typed column. Each encoder is fitted on the train
# and test values together so both frames share one integer mapping.
object_columns = [col for col in train.columns if train[col].dtype == 'object']
for col in object_columns:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train[col], test[col]]))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

display(train.head())

## Pull out the target, and make a validation split

In [None]:
# Detach the label column (pop removes it from `train` in place), then keep
# 60% of the rows for fitting and hold the rest out for validation.
target = train.pop('target')
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    train_size=0.60,
    random_state=42,
)


# Choosing the model

In [None]:

def check_models(X_train, y_train, X_test, y_test):
    """Fit the candidate regressors and print each one's validation RMSE.

    A random forest (50 trees) and a default XGBoost regressor, both seeded
    with 42, are fitted on the training split and scored on the held-out
    split. One line per model is printed: its name followed by the RMSE.

    Args:
        X_train: Feature matrix used for fitting.
        y_train: Target values aligned with ``X_train``.
        X_test: Feature matrix used for scoring.
        y_test: Target values aligned with ``X_test``.

    Returns:
        None. The scores are only printed.
    """
    models = {
        "Random Forest": RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42),
        "XGB": XGBRegressor(random_state=42),
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed
        # in 1.6, while this form works on every version.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print(name, rmse)


In [None]:

#check_models(X_train, y_train, X_test, y_test)

'''Results:
Random Forest 0.8647820794446256
XGB 0.8514728317890382'''


# It looks like XGBRegressor did the best.

# Hyperparameter tuning with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Unconfigured estimator handed to the grid search below.
xgbreg = XGBRegressor()

# Candidate values per XGBoost parameter. Entries with a single value are
# fixed settings rather than searched ones.
# NOTE(review): n_jobs looks like a parallelism setting rather than a model
# hyperparameter -- confirm whether searching over it is intended, since it
# triples the number of fits.
xgb_search_space = {
    'eta': [0.3, 0.4],
    'n_estimators': [500, 1000, 1500],
    'n_jobs': [3, 4, 5],
    'max_depth': [3, 4],
    'alpha': [2, 3, 4],
    'tree_method': ['gpu_hist'],
    'gpu_id': [0],
    'predictor': ['gpu_predictor'],
    'seed': [42],
}
param_grid = [xgb_search_space]

# The search itself is left disabled; its recorded outcome is in the results
# string that follows this cell's code.
#grid_search = GridSearchCV(xgbreg, param_grid, cv=3, scoring='neg_mean_squared_error')
#grid_search.fit(X_train, y_train)

#grid_search.best_params_


"""Results:

{'alpha': 4,
 'eta': 0.3,
 'gpu_id': 0,
 'max_depth': 3,
 'n_estimators': 500,
 'n_jobs': 3,
 'predictor': 'gpu_predictor',
 'seed': 42,
 'tree_method': 'gpu_hist'}"""

Let's use the best parameters and train the model on the entire training dataset.

In [None]:

# Hyperparameters reported as best by the grid search.
best_params = dict(
    n_estimators=500,
    n_jobs=3,
    alpha=4,
    eta=0.3,
    gpu_id=0,
    max_depth=3,
    predictor='gpu_predictor',
    seed=42,
    tree_method='gpu_hist',
)
model = XGBRegressor(**best_params)
'''model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_test, y_test)],
             verbose=False)'''
# Fit on every training row (not just the X_train split), then write the
# predictions for the competition test set into the submission file.
model.fit(train, target)
submission['target'] = model.predict(test)
submission.to_csv('XGB.csv')

In [None]:
# Hold-out RMSE for the chosen hyperparameters.
# `model` was fitted on the whole of `train`, which contains the X_test rows,
# so scoring it on X_test would give an in-sample, overly optimistic number.
# Refit an unfitted copy with the same hyperparameters on X_train only and
# score that copy instead; the submission model itself is left untouched.
from sklearn.base import clone

holdout_model = clone(model)
holdout_model.fit(X_train, y_train)
y_pred = holdout_model.predict(X_test)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(y_test, y_pred)))