In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Performance Measure
from sklearn.metrics import mean_squared_error
def evaluate(pred, y_test):
    """Return the root-mean-squared error between ``pred`` and ``y_test``.

    Both arguments are handed to ``mean_squared_error`` in the order they
    are received, and the square root of that value is returned.
    """
    return np.sqrt(mean_squared_error(pred, y_test))

In [None]:
data = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/train.csv')
data.head()

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
%matplotlib inline
data.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# looking for correlations
corr_matrix = data.corr()

corr_matrix['target'].sort_values(ascending=False)

In [None]:
# Separate the label from the features and drop the row identifier.
# `data` is overwritten in place, so the guard keeps this cell idempotent:
# on a re-run the 'target' column is already gone and the split is skipped
# instead of failing on the missing column.
if 'target' in data.columns:
    target = data['target']
    data = data.drop(['id', 'target'], axis=1)

In [None]:
print(data.shape)
print(target.shape)

# **Training and Evaluating on the Training set**

In [None]:
from sklearn.model_selection import cross_val_score
def train_evaluate(model, data=data, target=target, cv=10):
    """Cross-validate ``model`` and print a summary of its per-fold RMSE.

    Parameters
    ----------
    model
        Estimator handed to ``cross_val_score``.
    data, target
        Features and labels. The defaults are the notebook-level ``data``
        and ``target`` objects as they existed when this cell was run
        (default values are bound at definition time), so re-run this cell
        after redefining either of them.
    cv
        Forwarded unchanged to ``cross_val_score``; defaults to 10, the
        value that was previously hard-coded.

    Returns
    -------
    None
        The per-fold scores, their mean and their standard deviation are
        printed rather than returned.
    """
    neg_mse = cross_val_score(model, data, target,
                              scoring='neg_mean_squared_error', cv=cv)
    # The scorer yields negated MSE, so flip the sign before taking the root.
    scores = np.sqrt(-neg_mse)
    print("Scores:\t", scores)
    print("Mean:\t", scores.mean())
    print("Standard Deviation:", scores.std())

## LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
train_evaluate(lin_reg, data)

## XGBoost

In [None]:
from xgboost import XGBRegressor

xgb_reg = XGBRegressor(tree_method='gpu_hist')
train_evaluate(xgb_reg)

In [None]:
# hyperparameter tuning using optuna
import optuna
from sklearn.model_selection import train_test_split

def objective(trial, data=data, target=target):
    """Optuna objective: RMSE of an XGBRegressor on a fixed hold-out split.

    Parameters
    ----------
    trial
        Optuna trial used to draw the hyperparameters below.
    data, target
        Features and labels. The defaults are the notebook-level ``data``
        and ``target`` objects as they existed when this cell was run.

    Returns
    -------
    The RMSE (via ``evaluate``) of the fitted model's predictions on the
    20% hold-out part of the split.
    """
    # Fixed random_state: every trial is fitted and scored on the same split.
    x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
    param = {
        'tree_method': 'gpu_hist', # uses GPU for training
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': trial.suggest_categorical("n_estimators", [150, 200, 300, 3000]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        # NOTE(review): the model seed is searched like a hyperparameter, so
        # the winning seed also ends up in study.best_params.
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300)
    }

    model = XGBRegressor(**param)
    # NOTE(review): the hold-out set is used both as the early-stopping
    # eval_set and for the score returned below, so the reported RMSE is
    # presumably somewhat optimistic.
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=100, verbose=False)
    preds = model.predict(x_test)
    # Reuse the notebook's RMSE helper instead of recomputing it inline.
    return evaluate(preds, y_test)

In [None]:
# Seed the sampler so repeated runs propose the same sequence of trials.
# TPESampler is what create_study falls back to when no sampler is given,
# so only the seeding is new. The objective values may still vary slightly
# if the GPU training itself is not deterministic.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=15)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
study.trials_dataframe()

In [None]:
#plot_optimization_history: shows the scores from all trials as well as the best score so far at each point.
optuna.visualization.plot_optimization_history(study)

In [None]:
#plot_parallel_coordinate: interactively visualizes the hyperparameters and scores
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
'''plot_slice: shows the evolution of the search. You can see where in the hyperparameter space your search
went and which parts of the space were explored more.'''
optuna.visualization.plot_slice(study)

In [None]:
#plot_contour: plots parameter interactions on an interactive chart. You can choose which hyperparameters you would like to explore.
# Each hyperparameter name is listed once; add or uncomment entries to explore more.
optuna.visualization.plot_contour(study, params=[
                            #'max_depth',
                            'lambda',
                            'subsample',
                            'learning_rate'])

In [None]:
#Visualize parameter importances.
optuna.visualization.plot_param_importances(study)

In [None]:
#Visualize empirical distribution function
optuna.visualization.plot_edf(study)

### Create an XGBoost regressor model with the best parameters

In [None]:
# Hyperparameters of the best trial, plus the GPU tree method, which the
# objective sets as a constant rather than searching over it.
best_trial = {**study.best_params, 'tree_method': 'gpu_hist'}
best_trial

In [None]:
xgb_reg = XGBRegressor(**best_trial)
train_evaluate(xgb_reg)

In [None]:
# train on whole dataset
xgb_reg = XGBRegressor(**best_trial)
xgb_reg.fit(data, target)

In [None]:
test = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/test.csv')
sample = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv')

In [None]:
test.head()

In [None]:
sample.head()

In [None]:
# Predict on the test features (every column except the row identifier).
test_data = test.drop('id', axis=1)
preds = xgb_reg.predict(test_data)
submission = pd.DataFrame({'id':test.id, 'target':preds})
# index=False: otherwise pandas writes the row index as an extra unnamed
# first column and the file is no longer just the id/target pair.
submission.to_csv('submission.csv', index=False)

In [None]:
submission.head()