# Basic Starter

This notebook is intended as a first-pass model for the problem. You will find:
1. A basic EDA, with no in-depth exploration, mainly using seaborn (`sns`) and matplotlib (`plt`)
2. Building an XGBoost (XGB) model
3. Tuning its parameters with Hyperopt (HOPT)
4. Evaluating the regression results

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgbm
import matplotlib.pyplot as plt
import shap
import datatable as dt
from sklearn.metrics import mean_squared_error
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Prepare a from-source LightGBM build: remove the installed package directory,
# clone the sources together with their submodules, and install the Boost
# development packages through apt.
# NOTE(review): the site-packages path is pinned to python3.6 - confirm it
# matches the Python version of the running kernel.
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM
!apt-get install -y -qq libboost-all-dev

In [None]:
%%bash
# Configure and compile LightGBM with USE_GPU=1 inside a fresh build directory.
cd LightGBM
# Drop a build directory left over from a previous run, if there is one.
rm -r build
mkdir build
cd build
# Point CMake at the OpenCL library and headers located under /usr/local/cuda.
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# Compile with one job per available processing unit.
make -j$(nproc)

In [None]:
# Install the Python package from the cloned tree.
# NOTE(review): --precompile presumably reuses the library compiled in the
# build step instead of rebuilding it - confirm against the LightGBM install docs.
!cd LightGBM/python-package/;python3 setup.py install --precompile
# Write the NVIDIA OpenCL vendor (ICD) file, creating its directory if needed.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# The cloned source tree is no longer needed once the package is installed.
!rm -r LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Load the competition training data into a DataFrame.
df = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/train.csv')

In [None]:
# (rows, columns) of the loaded training frame.
df.shape

## Checking for nulls

In [None]:
# Number of missing values in each column.
df.isnull().sum()

We won't need to think in a strategy to fill null values

## Now let's explore target variable and our features

In [None]:
# Histogram of the regression target.
sns.histplot(df['target'])

In [None]:
# Column groups used throughout the notebook, selected by substring of the column name.
continuous = [x for x in df.columns if 'cont' in x]
cat = [x for x in df.columns if 'cat' in x]
# Model inputs: every column except the target and the 'id' column.
# NOTE(review): 'id' is assumed to be a pure row identifier with no predictive
# meaning; it is excluded so the models do not learn from row numbering.
# When the frame has no 'id' column this filter changes nothing.
features = [x for x in df.columns if 'target' not in x and x != 'id']

In [None]:
# Histogram of every categorical column on a 3x4 grid of axes,
# filled top-to-bottom within a column before moving to the next column.
n_rows, n_cols = 3, 4
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 10))
for pos, c in enumerate(cat):
    col_idx, row_idx = divmod(pos, n_rows)
    sns.histplot(df[c], ax=axs[row_idx, col_idx])

In [None]:
# Violin plot of the target against every categorical column on a 3x4 grid,
# filled top-to-bottom within a column before moving to the next column.
n_rows, n_cols = 3, 4
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 10))
for pos, c in enumerate(cat):
    col_idx, row_idx = divmod(pos, n_rows)
    sns.violinplot(x=c, y='target', data=df, ax=axs[row_idx, col_idx])

In [None]:
# Histogram of every continuous column on a 4x4 grid of axes,
# filled top-to-bottom within a column before moving to the next column.
n_rows, n_cols = 4, 4
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 12))
for pos, c in enumerate(continuous):
    col_idx, row_idx = divmod(pos, n_rows)
    sns.histplot(df[c], ax=axs[row_idx, col_idx])

In [None]:
# Scatter plot of the target against every continuous column on a 4x4 grid,
# filled top-to-bottom within a column before moving to the next column.
n_rows, n_cols = 4, 4
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 10))
for pos, c in enumerate(continuous):
    col_idx, row_idx = divmod(pos, n_rows)
    sns.scatterplot(x=c, y='target', data=df, ax=axs[row_idx, col_idx])

## Summarizing EDA:
* Looking at the target distribution, we can guess that it might be bimodal, with modes around 6 and 8. This information might be useful for optimizing our solution later.
* Some categorical features are heavily imbalanced; however, the violin plots suggest that this imbalance has little influence on the target. A few categories have targets concentrated almost exclusively around 6 or around 8, but the majority seem to have no relation to the target.
* The continuous features present multi-modal distributions, and they don't seem to correlate with our target.

# Feature engineering

Let's try to create some relations between our continuous variables

In [None]:
from itertools import combinations

In [None]:
# All pairwise (size-2) combinations of the continuous columns, as a list of tuples.
cont_comb = list(combinations(continuous, 2))

In [None]:
# For each pair of continuous columns, add two derived columns to df and
# record their names in gen_feat:
#   '<a> mean <b>' - arithmetic mean of the two columns
#   '<a> by <b>'   - ratio of the two columns, each shifted by 0.001
gen_feat = []
for first, second in cont_comb:
    mean_name = '{} mean {}'.format(first, second)
    ratio_name = '{} by {}'.format(first, second)

    df[mean_name] = (df[first] + df[second]) / 2
    df[ratio_name] = (df[first] + 0.001) / (df[second] + 0.001)

    gen_feat.extend([mean_name, ratio_name])

In [None]:
df.shape

# Let's Model
First we need to split our dataframe into train and test sets. Since there is no information about time, we can simply sample at random.

## Sampling
Instead of a single train/test split, we'll use cross-validation (CV)

In [None]:
def OHE(train, test, cat=None):
    """One-hot encode ``train`` and ``test`` so both end up with the train columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to encode.
    cat : list of str, optional
        Names of the columns to encode. With ``None``, ``pd.get_dummies``
        chooses the columns itself.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The encoded train frame and the encoded test frame. The test frame
        has exactly the train columns: categories seen only in test are
        dropped, and categories missing from test become all-zero columns.
    """
    # Pass the column list by keyword: the second positional argument of
    # get_dummies is `prefix`, not `columns`.
    train = pd.get_dummies(train, columns=cat)
    test = pd.get_dummies(test, columns=cat)

    # join='left' keeps the train column set; fill_value=0 makes the columns
    # added to test all zeros rather than NaN.
    return train.align(test, join='left', axis=1, fill_value=0)

In [None]:
# Extend the model inputs with the generated pairwise features.
# Only names not already listed are appended, so re-running this cell
# cannot introduce duplicate column names into `features`.
features = features + [name for name in gen_feat if name not in features]

In [None]:
# Design matrix and target used for cross-validation and for the final fits.
X = df[features]
y = df['target']
# One-hot encode the categorical columns. The list is passed by keyword
# because the second positional argument of get_dummies is `prefix`,
# not `columns`.
X = pd.get_dummies(X, columns=cat)

In [None]:
# 5-fold cross-validation splitter used by the tuning objectives.
# NOTE(review): shuffle is left at its default, so the folds presumably follow
# the row order of the frame - confirm that the row order carries no structure.
kf = KFold(n_splits=5)

### Converting to a matrix

### Optimizing with hyperOPT
   It's pretty straightforward: 
   1. Define the objective function, which receives the parameters to be optimized and returns the metric that we want to minimize (in this example, the mean squared error averaged over the CV folds)
   2. Define the "search space" (the parameters to tune and the range to search for each one; I would advise reading the Hyperopt documentation for these functions)
   3. Create a Trials object
   4. Call fmin

In [None]:
def optimise(params):
    """Hyperopt objective: cross-validated error of an XGBoost regressor.

    Parameters
    ----------
    params : dict
        One sample from the search space. Must contain 'learning_rate',
        'max_depth', 'gamma', 'min_child_weight', 'subsample',
        'colsample_bytree' and 'n_round'. 'lambda' is forwarded to the
        booster when present.

    Returns
    -------
    float
        Mean, over the folds produced by ``kf``, of the mean squared error
        on the held-out part of ``X`` / ``y``.
    """
    print(params)
    p = {'learning_rate': params['learning_rate'],
         'max_depth': params['max_depth'],
         'gamma': params['gamma'],
         'min_child_weight': params['min_child_weight'],
         'subsample': params['subsample'],
         'colsample_bytree': params['colsample_bytree'],
         'verbosity': 0,
         'objective': 'reg:squarederror',
         'eval_metric': 'rmse',
         'tree_method': 'gpu_hist', #MAKE SURE TO HAVE YOUR GPU ON, otherwise just remove this line
         'random_state': 42,
        }
    # The search space samples 'lambda'; forward it so the sampled value
    # actually influences the score that is being minimised.
    if 'lambda' in params:
        p['lambda'] = params['lambda']

    score_te = []

    for tr, te in kf.split(X):
        # kf.split yields positional indices, so rows are selected with .iloc
        # (label-based .loc only matches when the index is 0..n-1).
        X_tr, X_te = X.iloc[tr].values, X.iloc[te].values
        y_tr, y_te = y.iloc[tr].values, y.iloc[te].values

        d_tr = xgb.DMatrix(X_tr, y_tr)
        d_val = xgb.DMatrix(X_te, y_te)

        # 'n_round' is the number of boosting rounds for this sample.
        clf = xgb.train(p, d_tr, params['n_round'], verbose_eval = False)
        val_pred = clf.predict(d_val)

        score_te.append(mean_squared_error(y_te, val_pred))

    return np.mean(score_te)

# Search space for the XGBoost objective: each key is a parameter name and
# each value a Hyperopt sampling expression. The quniform draws are wrapped in
# scope.int so that those parameters reach the objective as integers.
param_space = {'learning_rate': hp.uniform('learning_rate', 0.01, 0.3), 
               'max_depth': scope.int(hp.quniform('max_depth', 3, 8, 1)), 
               'gamma': hp.uniform('gamma', 0, 10), 
               'min_child_weight': hp.uniform('min_child_weight', 0, 10),
               'lambda': hp.uniform('lambda', 0, 10),
               'subsample': hp.uniform('subsample', 0.1, 1), 
               'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 0.8), 
               'n_round': scope.int(hp.quniform('n_round', 50, 150, 25))
              }

# Trials object handed to fmin to record the evaluations.
trials = Trials()

# Minimise `optimise` over the search space with the TPE algorithm,
# for at most 60 evaluations.
hopt = fmin(fn = optimise, 
            space = param_space, 
            algo = tpe.suggest, 
            max_evals = 60, 
            trials = trials, 
           )
print(hopt)

In [None]:
print(hopt)

In [None]:
#{'colsample_bytree': 0.5873216071695012, 'gamma': 5.177112055270119, 'lambda': 7.735558992779168, 'learning_rate': 0.24208100159213855, 'max_depth': 3.0, 'min_child_weight': 2.1053503825354976, 'n_round': 150.0, 'subsample': 0.8655686441807873}

In [None]:
# Train the final XGBoost model on all rows with the parameters found by fmin.
results = {}
# Work on a copy so the dict returned by fmin is left untouched.
params = dict(hopt)
# Cast to int before handing the value to XGBoost.
params['max_depth'] = int(params['max_depth'])
# 'n_round' is the number of boosting rounds, not a booster parameter:
# take it out of the params dict and pass it to xgb.train separately.
n_round = int(params.pop('n_round'))
params['objective'] = 'reg:squarederror'
params['eval_metric'] = 'rmse'
params['tree_method'] = 'gpu_hist' #MAKE SURE TO HAVE YOUR GPU ON, otherwise just remove this line

xgtr = xgb.DMatrix(X, y)

# Training error is logged every 50 rounds and collected in `results`.
clf = xgb.train(params, xgtr, n_round, evals=[(xgtr, 'train')], evals_result=results, verbose_eval = 50)

In [None]:
# Predictions of the trained model on the training matrix, and the
# residuals (target minus prediction), stored as new columns of df.
df['predicted'] = clf.predict(xgtr)
df['residuals'] = df['target'] - df['predicted']

In [None]:
# Residuals plotted against the true target.
plt.figure(figsize=(12,6))
sns.scatterplot(x='target',y='residuals',data=df)

In [None]:
# Predictions plotted against the true target.
plt.figure(figsize=(12,6))
sns.scatterplot(x='target',y='predicted',data=df)

In [None]:
def optimise(params):
    """Hyperopt objective: cross-validated error of a LightGBM regressor.

    This definition reuses the name of the XGBoost objective defined earlier
    in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    params : dict
        One sample from the search space. Must contain 'learning_rate',
        'min_data_in_leaf', 'num_leaves', 'max_depth', 'bagging_freq',
        'feature_fraction', 'lambda_l2' and 'num_iterations'.

    Returns
    -------
    float
        Mean, over the folds produced by ``kf``, of the mean squared error
        on the held-out part of ``X`` / ``y``.
    """
    print(params)
    # NOTE(review): no 'bagging_fraction' is set, so 'bagging_freq' may have
    # no effect - confirm against the LightGBM parameter documentation.
    p = {'learning_rate': params['learning_rate'],
         'min_data_in_leaf': params['min_data_in_leaf'],
         'num_leaves': params['num_leaves'],
         'max_depth': params['max_depth'],

         'bagging_freq': params['bagging_freq'],
         'feature_fraction': params['feature_fraction'],
         'lambda_l2': params['lambda_l2'],
         'verbosity': 0,
         'objective': 'regression',
         'metric': 'l2',
         # MAKE SURE TO HAVE YOUR GPU ON, otherwise remove the three GPU entries below
         'device': 'gpu',
         'gpu_platform_id': 0,
         'gpu_device_id': 0,
         'random_state': 42
        }

    score_te = []
    feature_names = list(X.columns)

    for tr, te in kf.split(X):
        # kf.split yields positional indices, so rows are selected with .iloc
        # (label-based .loc only matches when the index is 0..n-1).
        X_tr, X_te = X.iloc[tr].values, X.iloc[te].values
        y_tr, y_te = y.iloc[tr].values, y.iloc[te].values
        d_tr = lgb.Dataset(X_tr,
                           y_tr,
                           feature_name=feature_names,
                           free_raw_data = False
                           )

        # Validation predictions are made on the raw array, so no separate
        # validation Dataset is built.
        clf = lgb.train(p, d_tr, params['num_iterations'], verbose_eval = False)
        val_pred = clf.predict(X_te)

        score_te.append(mean_squared_error(y_te, val_pred))

    return np.mean(score_te)

# Search space for the LightGBM objective: each key is a parameter name and
# each value a Hyperopt sampling expression, wrapped in scope.int where the
# objective should receive an integer.
param_space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
         'min_data_in_leaf': scope.int(hp.quniform('min_data_in_leaf', 10, 200, 20)),
         'num_leaves': scope.int(hp.quniform('num_leaves', 10, 50, 2)),
         'max_depth': scope.int(hp.quniform('max_depth', 3, 8, 1)), 

         # NOTE(review): this range scales with the number of features, while
         # LightGBM describes bagging_freq as an iteration frequency - confirm
         # that this range is intended.
         'bagging_freq':scope.int(hp.uniform('bagging_freq', 0.3*len(features), 1*len(features))), 
         'feature_fraction': hp.uniform('feature_fraction', 0.3, 1),
         'lambda_l2': hp.uniform('lambda_l2', 0.1, 10),
         'num_iterations': scope.int(hp.quniform('num_iterations', 50, 200, 25))
        }

# Fresh Trials object for this search.
trials = Trials()

# Minimise `optimise` over the search space with the TPE algorithm,
# for at most 60 evaluations.
hopt = fmin(fn = optimise, 
            space = param_space, 
            algo = tpe.suggest, 
            max_evals = 60, 
            trials = trials, 
           )
print(hopt)

In [None]:
print(hopt)
#{'bagging_freq': 95.11853984706136, 'feature_fraction': 0.34260532690897055, 'lambda_l2': 4.15465950834321, 'learning_rate': 0.10558892297096945, 'max_depth': 6.0, 'min_data_in_leaf': 140.0, 'num_iterations': 175.0, 'num_leaves': 28.0, 'verbosity': (0,), 'objective': ('regression',), 'device': ('gpu',), 'gpu_platform_id': (0,), 'gpu_device_id': (0,), 'random_state': 42}

In [None]:
# Train the final LightGBM model on all rows with the parameters found by fmin.
results = {}
# Work on a copy so the dict returned by fmin is left untouched.
params = dict(hopt)
# Plain scalars: a trailing comma here would store a one-element tuple
# such as (0,) or ('gpu',) instead of the value itself.
params['verbosity'] = 0
params['objective'] = 'regression'
# Cast the integer-valued parameters to int before handing them to LightGBM.
params['num_iterations'] = int(params['num_iterations'])
params['num_leaves'] = int(params['num_leaves'])
params['max_depth'] = int(params['max_depth'])
params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
params['device'] = 'gpu'
params['bagging_freq'] = int(params['bagging_freq'])

params['gpu_platform_id'] = 0
params['gpu_device_id'] = 0
params['random_state'] = 42

d_tr = lgb.Dataset(X,
                   y,
                   feature_name=list(X.columns),
                   free_raw_data = False
                   )
clf = lgb.train(params, d_tr, int(params['num_iterations']), verbose_eval = False)

# Mean squared error on the training data itself.
print(mean_squared_error(y, clf.predict(X)))

In [None]:
# Overwrite the prediction and residual columns of df with the output of the
# model currently bound to `clf`, evaluated on X.
df['predicted'] = clf.predict(X)
df['residuals'] = df['target'] - df['predicted']

In [None]:
# Residuals plotted against the true target.
plt.figure(figsize=(12,6))
sns.scatterplot(x='target',y='residuals',data=df)

In [None]:
# Predictions plotted against the true target.
plt.figure(figsize=(12,6))
sns.scatterplot(x='target',y='predicted',data=df)

In [None]:
# Load the sample submission and the test set from the same absolute input
# directory that the training data is read from.
submission = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv")

# Recreate on the test frame the pairwise features built on the training frame.
for c in cont_comb:
    test['{} mean {}'.format(c[0], c[1])] = (test[c[0]] + test[c[1]])/2 # mean
    test['{} by {}'.format(c[0], c[1])] = (test[c[0]] +0.001) / (test[c[1]]+0.001 ) # ratio

# One-hot encode the test set against the training columns so that X_sub has
# the column layout produced from the training features.
X_tr, X_sub = OHE(df[features], test[features], cat)

# Predict with the model currently bound to `clf` and write the submission file.
pred = clf.predict(X_sub)
submission["target"] = pred
submission.to_csv("submission.csv", index=False)

In [None]:
# Histogram of the predicted targets written to the submission.
sns.histplot(submission["target"]);