![](https://www.getdigitalinfluence.com/wp-content/uploads/2016/12/Boosting-vs-Ads-Manager-vs-Power-Editor-776x415.png)

## What is Boosting?

> Boosting (originally called hypothesis boosting) refers to any Ensemble method that
> can combine several weak learners into a strong learner. The general idea of most
> boosting methods is to train predictors sequentially, each trying to correct its predecessor. There are many boosting methods available, but by far the most popular are AdaBoost (short for Adaptive Boosting) and Gradient Boosting. We will talk about both here, but only after reading in the data and pre-processing it.

![](https://miro.medium.com/max/694/1*QJZ6W-Pck_W7RlIDwUIN9Q.jpeg)

### First, Importing the required libraries

In [None]:
import numpy as np 
import pandas as pd
import os
import xgboost
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier 

import lightgbm as lgb
from numba import jit 

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

### Reading the dataset

In [None]:
# Load the five competition CSV files into DataFrames.
train_df = pd.read_csv('../input/data-science-bowl-2019/train.csv')
test_df = pd.read_csv('../input/data-science-bowl-2019/test.csv')
train_labels_df = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
specs_df = pd.read_csv('../input/data-science-bowl-2019/specs.csv')
sample_submission_df = pd.read_csv('../input/data-science-bowl-2019/sample_submission.csv')

In [None]:
# Preview the first 10 rows of the training events.
train_df.head(10)

In [None]:
# Preview the first 10 rows of the training labels.
train_labels_df.head(10)

In [None]:
# Preview the first 10 rows of the specs table.
specs_df.head(10)

##### All preprocessing functions below are from [this](https://www.kaggle.com/gpreda/data-science-bowl-fast-compact-solution) wonderful kernel by Gabriel.



Our focus here is on the different boosting models and the baseline quadratic weighted kappa scores they give.

In [None]:
def extract_time_features(df):
    """Add calendar features derived from the ``timestamp`` column.

    ``timestamp`` is parsed with ``pd.to_datetime`` and the frame is extended
    with ``date``, ``month``, ``hour``, ``year``, ``dayofweek``,
    ``weekofyear``, ``dayofyear``, ``quarter`` and ``is_month_start``.

    Args:
        df: DataFrame with a ``timestamp`` column that ``pd.to_datetime`` can
            parse.

    Returns:
        The same DataFrame object; it is modified in place.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    stamps = df['timestamp'].dt
    df['date'] = stamps.date
    df['month'] = stamps.month
    df['hour'] = stamps.hour
    df['year'] = stamps.year
    df['dayofweek'] = stamps.dayofweek
    # `Series.dt.weekofyear` was deprecated and later removed from pandas;
    # the ISO calendar week is the documented replacement. The resulting
    # column uses pandas' nullable unsigned integer dtype.
    df['weekofyear'] = stamps.isocalendar().week
    df['dayofyear'] = stamps.dayofyear
    df['quarter'] = stamps.quarter
    df['is_month_start'] = stamps.is_month_start
    return df

In [None]:
def get_object_columns(df, columns):
    """Count events per installation for every value of a categorical column.

    Args:
        df: Event-level DataFrame with ``installation_id``, ``event_id`` and
            the column named by ``columns``.
        columns: Name of the categorical column whose values become the
            output columns.

    Returns:
        DataFrame indexed by ``installation_id`` with one column per value of
        ``columns`` holding the event count (0 where the value never occurs).
    """
    group_keys = ['installation_id', columns]
    event_counts = df.groupby(group_keys)['event_id'].count().reset_index()
    wide = event_counts.pivot_table(
        index='installation_id',
        columns=[columns],
        values='event_id',
    )
    # Replace the column index by a plain list of its labels.
    wide.columns = list(wide.columns)
    # Combinations that never occurred are absent from the pivot; count them as 0.
    return wide.fillna(0)

def get_numeric_columns(df, column):
    """Summarise a numeric column per installation.

    Computes mean, sum, min, max, std and skew of ``column`` for every
    ``installation_id``. Statistics that are undefined for an installation
    (for example the std of a single event) are replaced by the mean of that
    statistic over all installations.

    Args:
        df: Event-level DataFrame with ``installation_id`` and ``column``.
        column: Name of the numeric column to aggregate.

    Returns:
        DataFrame indexed by ``installation_id`` with columns
        ``{column}_mean``, ``{column}_sum``, ``{column}_min``,
        ``{column}_max``, ``{column}_std`` and ``{column}_skew``. A statistic
        that is missing for every installation stays NaN.
    """
    stats = ['mean', 'sum', 'min', 'max', 'std', 'skew']
    summary = df.groupby('installation_id').agg({f'{column}': stats})
    summary.columns = [f'{column}_{stat}' for stat in stats]
    # The original filled `df[column]` in place, i.e. a temporary selection,
    # so the imputation never reached the returned frame. Fill the frame
    # itself so the intended mean imputation actually happens.
    return summary.fillna(summary.mean())

def get_numeric_columns_add(df, agg_column, column):
    """Summarise ``column`` per installation and per value of ``agg_column``.

    Groups by ``installation_id`` and ``agg_column``, computes mean, sum, min,
    max, std and skew of ``column``, then pivots so that ``installation_id``
    is the index and the values of ``agg_column`` are spread over the columns.

    Args:
        df: Event-level DataFrame with ``installation_id``, ``agg_column``
            and ``column``.
        agg_column: Name of the categorical column used as second group key.
        column: Name of the numeric column to aggregate.

    Returns:
        The pivoted DataFrame; its column index is replaced by a plain list
        of the labels produced by the pivot.
    """
    df = df.groupby(['installation_id', agg_column]).agg({f'{column}': ['mean', 'sum', 'min', 'max', 'std', 'skew']}).reset_index()
    # NOTE(review): after the multi-statistic agg the column labels are presumably
    # tuples, so comparing them with the strings 'installation_id'/'type' likely
    # filters nothing out - confirm what ends up in `values`.
    df = df.pivot_table(index = 'installation_id', columns = [agg_column], values = [col for col in df.columns if col not in ['installation_id', 'type']])
    # NOTE(review): this fills a selection of df in place; it may act on a temporary
    # copy and leave df itself unchanged - verify that NaNs are really imputed.
    df[column].fillna(df[column].mean(), inplace = True)
    # Replace the column index by a plain list of its labels.
    df.columns = list(df.columns)
    return df

In [None]:
def perform_features_engineering(train_df, test_df, train_labels_df):
    """Build one feature row per installation for the train and test sets.

    Time features are added to both event frames (in place), then per
    installation aggregates are merged together: statistics of the numeric
    columns, event counts per categorical value, and numeric statistics per
    categorical value. ``title`` is encoded as the most frequent
    ``accuracy_group`` observed for that title in ``train_labels_df``.

    Args:
        train_df: Training events; must contain ``installation_id``,
            ``timestamp``, ``event_id``, ``game_time``, ``type`` and ``world``.
        test_df: Test events with the same columns plus ``title``.
        train_labels_df: Labels with ``installation_id``, ``title`` and
            ``accuracy_group``.

    Returns:
        Tuple ``(comp_train_df, comp_test_df)``. The training frame has one
        row per label row (labels left-joined with the installation
        features); the test frame has one row per test installation.
    """
    print('Perform features engineering')
    numerical_columns = ['game_time']
    categorical_columns = ['type', 'world']

    comp_train_df = pd.DataFrame({'installation_id': train_df['installation_id'].unique()})
    comp_train_df.set_index('installation_id', inplace = True)
    comp_test_df = pd.DataFrame({'installation_id': test_df['installation_id'].unique()})
    comp_test_df.set_index('installation_id', inplace = True)

    test_df = extract_time_features(test_df)
    train_df = extract_time_features(train_df)

    for i in numerical_columns:
        comp_train_df = comp_train_df.merge(get_numeric_columns(train_df, i), left_index = True, right_index = True)
        comp_test_df = comp_test_df.merge(get_numeric_columns(test_df, i), left_index = True, right_index = True)

    for i in categorical_columns:
        comp_train_df = comp_train_df.merge(get_object_columns(train_df, i), left_index = True, right_index = True)
        comp_test_df = comp_test_df.merge(get_object_columns(test_df, i), left_index = True, right_index = True)

    for i in categorical_columns:
        for j in numerical_columns:
            comp_train_df = comp_train_df.merge(get_numeric_columns_add(train_df, i, j), left_index = True, right_index = True)
            comp_test_df = comp_test_df.merge(get_numeric_columns_add(test_df, i, j), left_index = True, right_index = True)

    comp_train_df.reset_index(inplace = True)
    comp_test_df.reset_index(inplace = True)

    # Most frequent accuracy_group per title, used as a numeric title encoding.
    labels_map = dict(train_labels_df.groupby('title')['accuracy_group'].agg(lambda x:x.value_counts().index[0]))

    # Copy so that the assignment below does not write into a slice of the
    # caller's train_labels_df.
    labels = train_labels_df[['installation_id', 'title', 'accuracy_group']].copy()
    labels['title'] = labels['title'].map(labels_map)

    # Attach the encoded last title by installation_id. Aligning by position
    # would silently mismatch rows whenever comp_test_df is not ordered the
    # same way as the (sorted) groupby result.
    last_title = test_df.groupby('installation_id')['title'].last().map(labels_map)
    comp_test_df['title'] = comp_test_df['installation_id'].map(last_title)

    comp_train_df = labels.merge(comp_train_df, on = 'installation_id', how = 'left')
    print('We have {} training rows'.format(comp_train_df.shape[0]))

    return comp_train_df, comp_test_df

In [None]:
def qwk3(a1, a2, max_rat=3):
    """Return the quadratic weighted kappa between two integer rating vectors.

    Args:
        a1: Sequence of integer ratings in ``[0, max_rat]``.
        a2: Sequence of integer ratings in ``[0, max_rat]``, same length as
            ``a1``.
        max_rat: Largest possible rating; ratings range over
            ``0..max_rat`` inclusive.

    Returns:
        ``1 - o / e`` where ``o`` is the sum of squared rating differences
        and ``e`` is the squared difference expected from the two rating
        histograms. The result is NaN when ``e`` is zero (empty input, or
        both vectors constant and equal).

    Raises:
        ValueError: If the inputs differ in length or contain a rating
            outside ``[0, max_rat]``.
    """
    if len(a1) != len(a2):
        raise ValueError('a1 and a2 must have the same length')
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    for ratings in (a1, a2):
        # Negative ratings would otherwise wrap around the histogram and
        # silently corrupt the score.
        if ratings.size and (ratings.min() < 0 or ratings.max() > max_rat):
            raise ValueError(f'ratings must lie in [0, {max_rat}]')

    n_levels = max_rat + 1
    hist1 = np.bincount(a1, minlength=n_levels).astype(float)
    hist2 = np.bincount(a2, minlength=n_levels).astype(float)

    # Observed disagreement: sum of squared differences between the ratings.
    o = np.sum((a1 - a2) ** 2)

    # Expected disagreement: squared distance between every pair of levels,
    # weighted by how often each level occurs in each vector.
    levels = np.arange(n_levels)
    weights = (levels[:, None] - levels[None, :]) ** 2
    e = (hist1 @ weights @ hist2) / a1.shape[0]
    return 1 - o / e

## Adaboost

> One way for a new predictor to correct its predecessor is to pay a bit more attention
> to the training instances that the predecessor underfitted. This results in new predictors focusing more and more on the hard cases. This is the technique used by AdaBoost.
For example, to build an AdaBoost classifier, a first base classifier (such as a Decision
Tree) is trained and used to make predictions on the training set. The relative weight
of misclassified training instances is then increased. A second classifier is trained
using the updated weights and again it makes predictions on the training set, weights
are updated, and so on ...

![](http://www.github.com/rakash/images1/blob/master/adaboost.jpg?raw=true)

#### Let us see how decision boundaries are drawn for all the models for adaboost.

![](http://www.github.com/rakash/images1/blob/master/adaboost_db.jpg?raw=true)

> The first classifier (denoted by the line) gets many instances wrong, so their weights get boosted. The second classifier therefore does a better job on these instances, and
> so on. The plot on the right represents the same sequence of predictors except that the learning rate is halved (i.e., the misclassified instance weights are boosted half as
> much at every iteration). As you can see, this sequential learning technique has some similarities with Gradient Descent, except that instead of tweaking a single predictor’s
> parameters to minimize a cost function, AdaBoost adds predictors to the ensemble, gradually making it better.
> 
> Once all predictors are trained, the ensemble makes predictions very much like bagging or pasting, except that predictors have different weights depending on their overall accuracy on the weighted training set.

In [None]:
# Build the per-installation train/test feature frames used by AdaBoost.
ada_train_df, ada_test_df = perform_features_engineering(train_df, test_df, train_labels_df)

In [None]:
# Find the test feature columns that contain at least one null and show the null count per column.
null_columns = ada_test_df.columns[ada_test_df.isnull().any()]
ada_test_df[null_columns].isnull().sum()

In [None]:
# Replace missing std/skew values in the test features with 0
# (presumably installations with too few events for these statistics - confirm).
ada_test_df['game_time_std'] = ada_test_df['game_time_std'].fillna(0)
ada_test_df['game_time_skew'] = ada_test_df['game_time_skew'].fillna(0)

#### Below is the model function. As you can see, for ada boost we will be using a simple decision tree as the base estimator.


#### Like Random Forest, AdaBoost makes predictions by applying multiple decision trees to every sample and combining the predictions made by individual trees. However, rather than taking the average of the predictions made by each decision tree in the forest (or majority in the case of classification), in the AdaBoost algorithm, every decision tree contributes a varying amount to the final prediction.

In [None]:
def adaboost_it(ada_train_df, ada_test_df):
    """Cross-validate an AdaBoost classifier and predict the test set.

    Runs 5-fold cross-validation with depth-1 decision trees as base
    estimators. Each fold's model scores its validation rows (out-of-fold
    predictions) and the test set; test probabilities are averaged over the
    folds. The quadratic weighted kappa is printed per fold and overall.

    Args:
        ada_train_df: Training features including ``accuracy_group`` and
            ``installation_id``; every other column is used as a feature.
        ada_test_df: Test features containing the same feature columns.

    Returns:
        Array of shape ``(len(ada_test_df), 4)`` with the fold-averaged class
        probabilities for the test set.
    """
    print("Ada-Boosting...")
    t_splits = 5
    kf = KFold(n_splits = t_splits)
    features = [i for i in ada_train_df.columns if i not in ['accuracy_group', 'installation_id']]
    target = 'accuracy_group'
    oof_pred = np.zeros((len(ada_train_df), 4))
    y_pred = np.zeros((len(ada_test_df), 4))
    for fold, (tr_ind, val_ind) in enumerate(kf.split(ada_train_df)):
        print(f'Fold: {fold+1}')
        x_train, x_val = ada_train_df[features].iloc[tr_ind], ada_train_df[features].iloc[val_ind]
        # KFold yields positions, so select the target positionally as well
        # instead of relying on the frame having a default integer index.
        y_train, y_val = ada_train_df[target].iloc[tr_ind], ada_train_df[target].iloc[val_ind]

        # `algorithm` is left at the library default: "SAMME.R" used to be that
        # default and is rejected by recent scikit-learn releases.
        ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, learning_rate=0.5)
        ada_clf.fit(x_train, y_train)
        oof_pred[val_ind] = ada_clf.predict_proba(x_val)

        y_pred += ada_clf.predict_proba(ada_test_df[features]) / t_splits

        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')

    res = qwk3(ada_train_df['accuracy_group'], oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')

    return y_pred

In [None]:
# Run the AdaBoost cross-validation and keep the averaged test probabilities.
y_pred = adaboost_it(ada_train_df, ada_test_df)

In [None]:
# Build the AdaBoost submission: one predicted accuracy_group per installation_id.
ada_test_df = ada_test_df.reset_index()
# Copy so that adding the prediction column does not write into a slice.
ada_test_df = ada_test_df[['installation_id']].copy()
ada_test_df['accuracy_group'] = y_pred.argmax(axis = 1)
# Merge on the id column only: merging with the full sample submission would
# produce suffixed accuracy_group_x / accuracy_group_y columns in the file.
ada_sample_submission_df = sample_submission_df[['installation_id']].merge(ada_test_df, on = 'installation_id')
ada_sample_submission_df.to_csv('ada_boost_submission.csv', index = False)

## XGBoost

![](https://miro.medium.com/max/583/1*FLshv-wVDfu-i54OqvZdHg.png)

> Another very popular Boosting algorithm is Gradient Boosting. Just like AdaBoost, Gradient Boosting works by sequentially adding predictors to an ensemble, each one correcting its predecessor. However, instead of tweaking the instance weights at every iteration like AdaBoost does, this method tries to fit the new predictor to the residual errors made by the previous predictor.
> 
> 
> Think of XGBoost as gradient boosting on ‘steroids’ (well it is called ‘Extreme Gradient Boosting’ for a reason!). It is a perfect combination of software and hardware optimization techniques to yield superior results using less computing resources in the shortest amount of time.

In [None]:
# Build the per-installation train/test feature frames used by XGBoost.
xgb_train_df, xgb_test_df = perform_features_engineering(train_df, test_df, train_labels_df)

In [None]:
# Feature columns are everything except the target and the installation id.
features = [i for i in xgb_train_df.columns if i not in ['accuracy_group', 'installation_id']]
target = 'accuracy_group'

In [None]:
# Full training matrix and target (used by the commented-out grid search below).
x_train  = xgb_train_df[features]
y_train = xgb_train_df[target]

Grid search is very time-consuming, so I have commented it out for now.

In [None]:
#from sklearn.model_selection import GridSearchCV
#model = xgboost.XGBClassifier()

#param_dist = {"max_depth": [10,30,50],"min_child_weight" : [1,3,6],
 #             "n_estimators": [200],
  #            "learning_rate": [0.05, 0.1,0.16],}

#grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, verbose=10, n_jobs=-1)
#grid_search.fit(x_train, y_train)
#grid_search.best_estimator_

In [None]:
def xgb(xgb_train_df, xgb_test_df):
    """Cross-validate a default XGBoost classifier and predict the test set.

    Runs 5-fold cross-validation. Each fold's model scores its validation
    rows (out-of-fold predictions) and the test set; test probabilities are
    averaged over the folds. The quadratic weighted kappa is printed per fold
    and overall.

    Args:
        xgb_train_df: Training features including ``accuracy_group`` and
            ``installation_id``; every other column is used as a feature.
        xgb_test_df: Test features containing the same feature columns.

    Returns:
        Array of shape ``(len(xgb_test_df), 4)`` with the fold-averaged class
        probabilities for the test set.
    """
    print("XG-Boosting...")
    t_splits = 5
    kf = KFold(n_splits = t_splits)
    features = [i for i in xgb_train_df.columns if i not in ['accuracy_group', 'installation_id']]
    target = 'accuracy_group'
    oof_pred = np.zeros((len(xgb_train_df), 4))
    y_pred = np.zeros((len(xgb_test_df), 4))
    for fold, (tr_ind, val_ind) in enumerate(kf.split(xgb_train_df)):
        print(f'Fold: {fold+1}')
        x_train, x_val = xgb_train_df[features].iloc[tr_ind], xgb_train_df[features].iloc[val_ind]
        # KFold yields positions, so select the target positionally as well
        # instead of relying on the frame having a default integer index.
        y_train, y_val = xgb_train_df[target].iloc[tr_ind], xgb_train_df[target].iloc[val_ind]

        xgb_clf = xgboost.XGBClassifier()
        xgb_clf.fit(x_train, y_train)
        oof_pred[val_ind] = xgb_clf.predict_proba(x_val)

        y_pred += xgb_clf.predict_proba(xgb_test_df[features]) / t_splits

        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')

    res = qwk3(xgb_train_df['accuracy_group'], oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')

    return y_pred

In [None]:
# Run the XGBoost cross-validation and keep the averaged test probabilities.
y_pred = xgb(xgb_train_df, xgb_test_df)

In [None]:
# Keep only the installation ids and attach the most probable class as the prediction.
xgb_test_df = xgb_test_df.reset_index()
xgb_test_df = xgb_test_df[['installation_id']]
xgb_test_df['accuracy_group'] = y_pred.argmax(axis = 1)
# Join the predictions onto the sample submission and save the result.
xgb_sample_submission_df = sample_submission_df.merge(xgb_test_df, on = 'installation_id')
xgb_sample_submission_df.to_csv('xgb_submission.csv', index = False)

In [None]:
# Drop the 'accuracy_group_x' column left by the merge and restore the expected column names.
xgb_sample_submission_df = xgb_sample_submission_df.drop('accuracy_group_x', axis=1)
xgb_sample_submission_df.columns = ['installation_id', 'accuracy_group']

In [None]:
# Save the XGBoost submission again, now with the cleaned-up columns.
xgb_sample_submission_df.to_csv('xgb_submission.csv', index = False)

## Catboost

> CatBoost yields state-of-the-art results without the extensive data training typically required by other machine learning methods, and it provides powerful out-of-the-box support for the more descriptive data formats that accompany many business problems.

> A major advantage is that it handles categorical variables automatically, which is why it is named 'CAT-boost'.


You can learn more about it [here](https://www.youtube.com/watch?time_continue=2&v=s8Q_orF4tcI)

In [None]:
# Build the per-installation train/test feature frames used by CatBoost.
cat_train_df, cat_test_df = perform_features_engineering(train_df, test_df, train_labels_df)

In [None]:
# Full training matrix and target; `features` and `target` are the globals defined in the XGBoost section.
xc_train  = cat_train_df[features]
yc_train = cat_train_df[target]

In [None]:
import catboost as cb
def cat(cat_train_df, cat_test_df):
    """Cross-validate a CatBoost classifier and predict the test set.

    Runs 3-fold cross-validation. Each fold's model is fitted on that fold's
    training rows only, scores its validation rows (out-of-fold predictions)
    and the test set; test probabilities are averaged over the folds. The
    quadratic weighted kappa is printed per fold and overall.

    Args:
        cat_train_df: Training features including ``accuracy_group`` and
            ``installation_id``; every other column is used as a feature.
        cat_test_df: Test features containing the same feature columns.

    Returns:
        Array of shape ``(len(cat_test_df), 4)`` with the fold-averaged class
        probabilities for the test set.
    """
    print("Meeowwww...")
    t_splits = 3
    kf = KFold(n_splits = t_splits)
    features = [i for i in cat_train_df.columns if i not in ['accuracy_group', 'installation_id']]
    target = 'accuracy_group'
    oof_pred = np.zeros((len(cat_train_df), 4))
    y_pred = np.zeros((len(cat_test_df), 4))
    for fold, (tr_ind, val_ind) in enumerate(kf.split(cat_train_df)):
        print(f'Fold: {fold+1}')
        x_train, x_val = cat_train_df[features].iloc[tr_ind], cat_train_df[features].iloc[val_ind]
        # KFold yields positions, so select the target positionally as well.
        y_train, y_val = cat_train_df[target].iloc[tr_ind], cat_train_df[target].iloc[val_ind]

        cat_clf = cb.CatBoostClassifier(depth=10, iterations= 200, l2_leaf_reg= 9, learning_rate= 0.15)
        # Fit on this fold's training rows only. Fitting on the full training
        # set would let the model see the validation rows and inflate the
        # out-of-fold kappa.
        cat_clf.fit(x_train, y_train)
        oof_pred[val_ind] = cat_clf.predict_proba(x_val)

        y_pred += cat_clf.predict_proba(cat_test_df[features]) / t_splits

        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')

    res = qwk3(cat_train_df['accuracy_group'], oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')

    return y_pred

In [None]:
# Run the CatBoost cross-validation and keep the averaged test probabilities.
y_pred_cat = cat(cat_train_df, cat_test_df)

In [None]:
# Keep only the installation ids and attach the most probable class as the prediction.
cat_test_df = cat_test_df.reset_index()
cat_test_df = cat_test_df[['installation_id']]
cat_test_df['accuracy_group'] = y_pred_cat.argmax(axis = 1)
# Join the predictions onto the sample submission and save the result.
cat_sample_submission_df = sample_submission_df.merge(cat_test_df, on = 'installation_id')
cat_sample_submission_df.to_csv('submission.csv', index = False)

In [None]:
# Drop the 'accuracy_group_x' column left by the merge and restore the expected column names.
cat_sample_submission_df = cat_sample_submission_df.drop('accuracy_group_x', axis=1)
cat_sample_submission_df.columns = ['installation_id', 'accuracy_group']

In [None]:
# Save the CatBoost submission again, now with the cleaned-up columns.
cat_sample_submission_df.to_csv('submission.csv', index = False)

## LightGBM

> It is based on decision tree algorithms, it splits the tree leaf wise with the best fit whereas other boosting algorithms split the tree depth wise or level wise rather than leaf-wise. So when growing on the same leaf in Light GBM, the leaf-wise algorithm can reduce more loss than the level-wise algorithm and hence results in much better accuracy which can rarely be achieved by any of the existing boosting algorithms. Also, it is surprisingly very fast, hence the word ‘Light’. 

### THIS IS HOW IT WORKS IN XGBOOST

![](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2017/06/11194110/leaf.png)

### HOW IT WORKS IN LIGHTGBM

![](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2017/06/11194227/depth.png)

In [None]:
# Build the per-installation train/test feature frames used by LightGBM.
lgb_train_df, lgb_test_df = perform_features_engineering(train_df, test_df, train_labels_df)

In [None]:
# Full training matrix and target; `features` and `target` are the globals defined in the XGBoost section.
xl_train  = lgb_train_df[features]
yl_train = lgb_train_df[target]

In [None]:
import lightgbm as lgb

def lgbc(lgb_train_df, lgb_test_df):
    """Cross-validate a default LightGBM classifier and predict the test set.

    Runs 3-fold cross-validation. Each fold's model is fitted on that fold's
    training rows only, scores its validation rows (out-of-fold predictions)
    and the test set; test probabilities are averaged over the folds. The
    quadratic weighted kappa is printed per fold and overall.

    Args:
        lgb_train_df: Training features including ``accuracy_group`` and
            ``installation_id``; every other column is used as a feature.
        lgb_test_df: Test features containing the same feature columns.

    Returns:
        Array of shape ``(len(lgb_test_df), 4)`` with the fold-averaged class
        probabilities for the test set.
    """
    # The banner previously repeated the CatBoost message.
    print("LightGBM-Boosting...")
    t_splits = 3
    kf = KFold(n_splits = t_splits)
    features = [i for i in lgb_train_df.columns if i not in ['accuracy_group', 'installation_id']]
    target = 'accuracy_group'
    oof_pred = np.zeros((len(lgb_train_df), 4))
    y_pred = np.zeros((len(lgb_test_df), 4))
    for fold, (tr_ind, val_ind) in enumerate(kf.split(lgb_train_df)):
        print(f'Fold: {fold+1}')
        x_train, x_val = lgb_train_df[features].iloc[tr_ind], lgb_train_df[features].iloc[val_ind]
        # KFold yields positions, so select the target positionally as well.
        y_train, y_val = lgb_train_df[target].iloc[tr_ind], lgb_train_df[target].iloc[val_ind]

        # NOTE(review): `silent` is kept as in the original; newer LightGBM
        # releases may expect `verbose` instead - confirm against the
        # installed version.
        lg = lgb.LGBMClassifier(silent=False)
        # Fit on this fold's training rows only. Fitting on the full training
        # set would let the model see the validation rows and inflate the
        # out-of-fold kappa.
        lg.fit(x_train, y_train)
        oof_pred[val_ind] = lg.predict_proba(x_val)

        y_pred += lg.predict_proba(lgb_test_df[features]) / t_splits

        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')

    res = qwk3(lgb_train_df['accuracy_group'], oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')

    return y_pred

In [None]:
# Run the LightGBM cross-validation and keep the averaged test probabilities.
y_pred_lgb = lgbc(lgb_train_df, lgb_test_df)

In [None]:
# Keep only the installation ids and attach the most probable class as the prediction.
lgb_test_df = lgb_test_df.reset_index()
lgb_test_df = lgb_test_df[['installation_id']]
lgb_test_df['accuracy_group'] = y_pred_lgb.argmax(axis = 1)
# Join the predictions onto the sample submission and save the result.
lgb_sample_submission_df = sample_submission_df.merge(lgb_test_df, on = 'installation_id')
lgb_sample_submission_df.to_csv('lgb_submission.csv', index = False)

In [None]:
# Drop the 'accuracy_group_x' column left by the merge and restore the expected column names.
lgb_sample_submission_df = lgb_sample_submission_df.drop('accuracy_group_x', axis=1)
lgb_sample_submission_df.columns = ['installation_id', 'accuracy_group']
# Save the cleaned-up frame: the file written right after the merge still
# carried both suffixed accuracy_group columns.
lgb_sample_submission_df.to_csv('lgb_submission.csv', index = False)

In [None]:
# Validation kappa score per model, entered by hand rather than computed in this cell.
# NOTE(review): presumably copied from an earlier run - refresh after re-running the models.
data = [['ada', 0.42], ['xgb', 0.44], ['cat', 0.65], ['lgb', 0.62]]

df = pd.DataFrame(data, columns = ['Model', 'Validation Kappa Score']) 

In [None]:
import plotly.graph_objects as go

# Bar chart comparing the validation kappa score of each model.
fig = go.Figure()
fig.add_trace(go.Bar(x=df['Model'], y=df['Validation Kappa Score'], marker_color='#FFD700'))
fig.show()