# What are some simple commands for notebooks?
CTRL+ENTER - run the current cell  
LEFT CLICK ON CELL/ENTER - start text editing cell  
LEFT CLICK OFF CELL/ESC - stop text editing cell  
b - create new code cell

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib as plt
import optuna
import pickle
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from pathlib import Path

In [None]:
import os

# Print the full path of every file found under /kaggle/input.
kaggle_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in kaggle_files:
    print(file_path)

# Directory that the CSV-loading cell reads train/test/sample files from.
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

In [None]:
# Read the train and test tables, both indexed by their "id" column.
df_train, df_test = (
    pd.read_csv(input_path / csv_name, index_col="id")
    for csv_name in ("train.csv", "test.csv")
)
# The sample submission is read without an index column; later cells overwrite
# its "target" column and write it back out as the submission file.
df_preds_example = pd.read_csv(input_path / "sample_submission.csv")
df_train

In [None]:
# Integer-encode every object-dtype column. Each encoder is fitted on the train
# and test values together so both frames share the same mapping.
object_columns = [col for col in df_train.columns if df_train[col].dtype == 'object']
for col in object_columns:
    encoder = LabelEncoder()
    encoder.fit(list(df_train[col].values) + list(df_test[col].values))
    for frame in (df_train, df_test):
        frame[col] = encoder.transform(frame[col].values).astype("int32")

In [None]:
# Preview the first rows of the training frame after label encoding.
df_train.head()

In [None]:
# Separate the label from the features WITHOUT mutating df_train, so this cell
# can be re-run safely (df_train.pop('target') raises KeyError on a second run
# because the column has already been removed).
df_target = df_train['target']
df_features = df_train.drop(columns='target')

# Hold out half of the rows for validation; the seed is fixed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(df_features, df_target, test_size=0.5, random_state=43)
X_test = df_test
X_train

In [None]:
# Inspect the training labels produced by the split.
y_train

In [None]:
def get_model_xgb(X_train, X_val, y_train, y_val):
    """Train an XGBoost model on the training split, monitored on the validation split.

    Both splits are wrapped in ``xgb.DMatrix``. Training asks for 100 boosting
    rounds with ``tree_method='gpu_hist'`` and ``early_stopping_rounds=10``,
    using the validation matrix (reported under the name "eval") as the only
    evaluation set.

    Returns whatever ``xgb.train`` returns for these arguments.
    """
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    eval_matrix = xgb.DMatrix(X_val, label=y_val)
    return xgb.train(
        {'tree_method': 'gpu_hist'},
        train_matrix,
        100,
        early_stopping_rounds=10,
        evals=[(eval_matrix, "eval")],
    )

In [None]:
# Fit the XGBoost model on the training split, validating on the held-out split.
xgb_model = get_model_xgb(X_train, X_val, y_train, y_val)

In [None]:
# Comparison

#Median: 0.889
#Linear regression: 0.870
#Boosting trees: 0.851

In [None]:
# Wrap the test features in a DMatrix so the trained XGBoost model can score them.
dtest = xgb.DMatrix(X_test)

In [None]:
# Predict with the XGBoost model alone.
preds_xgb = xgb_model.predict(dtest)
# Reuse the sample-submission frame: overwrite its "target" column and save it.
df_preds_example["target"] = preds_xgb
df_preds_example.to_csv("preds_xgb_solo.csv", index=False)

# If you are already familiar with boosting trees:
Have a look at the hyperparameter list for XGBoost here: https://xgboost.readthedocs.io/en/latest/parameter.html  
We will be doing a hyperparameter search later, so make a shortlist of the parameters you think would be good to experiment with!  
# If you are not familiar with boosting trees:
I will run through a high-level overview.

In [None]:
# Toy insurance dataset (five drivers) used to illustrate tree models.
# The last column, "Chance of insurance claim", is the value the hand-written
# trees in the following cells try to predict.
dummy_df = pd.DataFrame([
    ["Yes", 0, 0.1, 0, 0, 0.9],
    ["Yes", 0, 0.15, 0, 5, 0.96],
    ["Yes", 50, 0.5, 3, 0, 0.5],
    ["No", 10, 0.4, 5, 5, 0.2],    
    ["No", 11, 0.6, 25, 0, 0.3],
], columns=["Is ferrari", "Years on license", "Crime rate", "Car age", "Penalty points", "Chance of insurance claim"])
dummy_df

In [None]:
# Decision Tree
def GetChance(row):
    """Hand-written decision tree predicting the chance of an insurance claim.

    ``row`` is any mapping-like object indexable by column name.

    Returns 0.25 when "Is ferrari" is not "Yes"; otherwise 0.93 when
    "Years on license" is below 25 and 0.5 when it is 25 or more.
    """
    if row["Is ferrari"] != "Yes":
        return 0.25
    if row["Years on license"] < 25:
        return 0.93
    return 0.5

In [None]:
# Boosting tree
def GetFerrariPenalty(row):
    """First tree of the boosting example: base prediction from car type and licence age.

    ``row`` is any mapping-like object indexable by column name.

    Returns 0.25 for rows whose "Is ferrari" is not "Yes". For Ferrari rows it
    returns 0.93 when "Years on license" is below 25, else 0.5.
    """
    drives_ferrari = row["Is ferrari"] == "Yes"
    if not drives_ferrari:
        return 0.25
    return 0.93 if row["Years on license"] < 25 else 0.5

# Residuals left after the first tree: the last column is the original
# "Chance of insurance claim" from dummy_df minus GetFerrariPenalty's output
# for that row (e.g. 0.9 - 0.93 = -0.03, 0.2 - 0.25 = -0.05).
dummy_df_residualised_1 = pd.DataFrame([
    ["Yes", 0, 0.1, 0, 0, -0.03],
    ["Yes", 0, 0.15, 0, 5, 0.03],
    ["Yes", 50, 0.5, 3, 0, 0.0],
    ["No", 10, 0.4, 5, 5, -0.05],    
    ["No", 11, 0.6, 25, 0, 0.05],
], columns=["Is ferrari", "Years on license", "Crime rate", "Car age", "Penalty points", "Chance of insurance claim"])
dummy_df_residualised_1

In [None]:
def GetGoodDrinvingBoost(row):
    """Second tree of the boosting example: a correction based on penalty points.

    ``row`` is any mapping-like object indexable by column name.

    Returns 0.04 when "Penalty points" is greater than 3, otherwise -0.04.
    """
    return 0.04 if row["Penalty points"] > 3 else -0.04
    
# Residuals left after the second tree: each value is the first-stage residual
# minus GetGoodDrinvingBoost's output for that row (+0.04 when "Penalty points"
# is above 3, otherwise -0.04). The per-row arithmetic is shown alongside.
# This single split only shrinks the residual for the first two rows; later
# trees would have to correct the remaining ones.
dummy_df_residualised_2 = pd.DataFrame([
    ["Yes", 0, 0.1, 0, 0, 0.01],     # -0.03 - (-0.04)
    ["Yes", 0, 0.15, 0, 5, -0.01],   #  0.03 - 0.04
    ["Yes", 50, 0.5, 3, 0, 0.04],    #  0.00 - (-0.04)
    ["No", 10, 0.4, 5, 5, -0.09],    # -0.05 - 0.04
    ["No", 11, 0.6, 25, 0, 0.09],    #  0.05 - (-0.04)
], columns=["Is ferrari", "Years on license", "Crime rate", "Car age", "Penalty points", "Chance of insurance claim"])
dummy_df_residualised_2

In [None]:
def get_model_lgb(X_train, X_val, y_train, y_val):
    """Train a LightGBM regressor with early stopping on the validation split.

    Columns ``cat0`` .. ``cat9`` are declared as categorical features on both
    datasets. Training runs for up to 1000 boosting rounds with the
    ``regression`` objective and ``rmse`` metric, stopping once the validation
    metric has not improved for 10 consecutive rounds.

    Parameters
    ----------
    X_train, y_train : features and labels used for fitting.
    X_val, y_val : features and labels used to monitor early stopping.

    Returns
    -------
    The trained model returned by ``lgb.train``.
    """
    cat_columns = [f'cat{cat_index}' for cat_index in range(10)]

    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_columns)
    validation_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_columns)
    param = {'objective': 'regression', 'metric': 'rmse'}

    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds=` keyword of lgb.train was removed in LightGBM 4.0,
    # while the callback works on both older and newer releases.
    # `categorical_feature` is already set on both Datasets, so it is not
    # repeated on the train() call.
    return lgb.train(
        param,
        train_data,
        1000,
        valid_sets=[validation_data],
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

In [None]:
# Fit the LightGBM model on the same train/validation split used for XGBoost.
lgb_model = get_model_lgb(X_train, X_val, y_train, y_val)

In [None]:
def get_model_cat(X_train, X_val, y_train, y_val):
    """Fit a CatBoost regressor on the training split, evaluated on the validation split.

    Columns ``cat0`` .. ``cat9`` are passed as ``cat_features``. The regressor
    is configured with the keyword arguments collected in ``settings`` below
    and fitted with ``(X_val, y_val)`` as its ``eval_set``.

    Returns the fitted ``CatBoostRegressor``.
    """
    categorical_names = [f'cat{idx}' for idx in range(10)]

    settings = {
        'eval_metric': 'RMSE',
        'task_type': 'GPU',
        'iterations': 1000,
        'od_type': "Iter",
        'od_wait': 10,
        'learning_rate': 0.3,
        'metric_period': 25,
    }
    regressor = CatBoostRegressor(**settings)

    regressor.fit(X_train, y=y_train, cat_features=categorical_names, eval_set=(X_val, y_val))
    return regressor

In [None]:
# Fit the CatBoost model on the same train/validation split as the other two models.
cat_model = get_model_cat(X_train, X_val, y_train, y_val)

In [None]:
# Score the test set with each of the three fitted models.
preds_xgb = xgb_model.predict(dtest)
preds_lgb = lgb_model.predict(X_test)
preds_cat = cat_model.predict(X_test)

# Stack the three prediction arrays as rows so that axis=0 combines across models.
stacked_preds = np.vstack([preds_xgb, preds_lgb, preds_cat])

# Write one submission per combination rule (mean first, then median).
for combine, out_name in (
    (np.mean, "preds_combined_mean.csv"),
    (np.median, "preds_combined_median.csv"),
):
    df_preds_example["target"] = combine(stacked_preds, axis=0)
    df_preds_example.to_csv(out_name, index=False)

In [None]:
def objective(trial, X_train, X_val, y_train, y_val):
    """Optuna objective: train XGBoost with trial-suggested hyperparameters.

    Parameters
    ----------
    trial : Optuna trial object used to sample the hyperparameters.
    X_train, y_train : features and labels used for fitting.
    X_val, y_val : features and labels used for early stopping and scoring.

    Returns
    -------
    float
        The best value of the validation metric (evaluation set "eval")
        recorded during training with early stopping.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # suggest_float replaces the deprecated suggest_uniform; the ranges and
    # parameter names (including 'lambd') are unchanged, so study.best_params
    # keeps the same keys.
    max_depth = trial.suggest_int('max_depth', 3, 6)
    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.5)
    subsample = trial.suggest_float('subsample', 0.1, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 1)
    num_parallel_tree = trial.suggest_int('num_parallel_tree', 1, 2)
    min_child_weight = trial.suggest_float('min_child_weight', 1, 250)
    lambd = trial.suggest_float('lambd', 1, 1.1)
    alpha = trial.suggest_float('alpha', 0, 0.2)
    num_round = 1000

    param = {'max_depth': max_depth,
             'learning_rate': learning_rate,
             'objective': 'reg:squarederror',
             'subsample': subsample,
             'colsample_bytree': colsample_bytree,
             'num_parallel_tree': num_parallel_tree,
             # Previously sampled but never passed to XGBoost, so the tuned
             # value had no effect on training.
             'min_child_weight': min_child_weight,
             'lambda': lambd,
             'alpha': alpha,
             'tree_method': 'gpu_hist'}

    bst = xgb.train(param, dtrain, num_round, early_stopping_rounds=2, evals=[(dval, "eval")])
    # Use the best score tracked by early stopping instead of parsing the text
    # returned by bst.eval(), which reflects the final (post-best) round.
    return float(bst.best_score)

In [None]:
# Create an Optuna study with its default settings.
study = optuna.create_study()

# Run 5 trials. The lambda binds the fixed train/validation split so that
# Optuna only has to supply the `trial` argument.
study.optimize(lambda trial: objective(trial, X_train, X_val, y_train, y_val), n_trials=5)

In [None]:
# Show the hyperparameter values recorded for the study's best trial.
study.best_params

Possible future steps
# Feature engineering
# Cross validation
# Adding a neural network to the ensemble (PyTorch/TensorFlow)
# Model stacking