In [None]:
# Import kaggle.json

# This code is only needed on Google Colab
#from google.colab import files
#files.upload()

In [None]:
#!pip install kaggle

# This code is only needed on Google Colab
#!mkdir ~/.kaggle
#!cp /content/kaggle.json ~/.kaggle/kaggle.json
#!chmod 600 ~/.kaggle/kaggle.json
#!kaggle competitions download tabular-playground-series-feb-2021
#!unzip \*.zip

# February 2021 - Tabular Playground Series
Jaime Avendano  
Twitter: @JaimeAAvendano

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from plotnine import ggplot, geom_point, geom_density, aes

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# This code is only needed on Google Colab
#test = pd.read_csv('test.csv')
#train = pd.read_csv('train.csv')
#submission = pd.read_csv('sample_submission.csv')

# Kaggle kernel paths: the three competition CSVs are read from the attached
# input directory, in the order train, test, sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
        '../input/tabular-playground-series-feb-2021/sample_submission.csv',
    )
)

# Quick size check of the two modelling frames, one shape per line.
print(train.shape, test.shape, sep='\n')

# Introduction
This is my first month going through one of the tabular playground data sets. My two main goals are:

*   Handling categorical columns via OneHotEncoding and LabelEncoding
*   Comparing the speed and results of various tree models


In [None]:
# Column-name arrays split by dtype: object columns are treated as categorical,
# anything numeric goes into num_columns.
cat_columns, num_columns = (
    train.select_dtypes(include=dtype_filter).columns.values
    for dtype_filter in (['object'], [np.number])
)
#for col in cat_columns:
#  train[col] = train[col].astype('category')

In [None]:
# Subsample used by the exploratory plots below. The seed is fixed so the plots are
# reproducible across kernel restarts, and the size is capped at the number of
# available rows so a smaller input frame does not raise.
train_small = train.sample(n=min(10000, len(train)), random_state=0)

In [None]:
# Pairwise scatter/distribution grid on the train_small subsample, restricted to the
# entries of num_columns from position 10 onward.
sns.pairplot(train_small, vars = num_columns[10:])

In [None]:
# Correlation matrix over the numeric columns only. Selecting them explicitly keeps
# this working on pandas versions where DataFrame.corr() raises on string columns
# instead of silently dropping them.
corr = train_small.select_dtypes(include=[np.number]).corr()
# Boolean mask that hides the upper triangle (diagonal included). The builtin `bool`
# is used because the `np.bool` alias is no longer available in recent NumPy releases.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Diverging palette for the heatmap drawn in the next cell.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

In [None]:
# Large square canvas so the annotated cells stay legible.
fig, ax = plt.subplots(figsize=(15, 15))
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    center=0,
    annot=True,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
sns.heatmap(corr, ax=ax, **heatmap_options)

In [None]:
# Scatter of target against cont1 on the subsample.
cont1_scatter = ggplot(train_small, aes(x='cont1', y='target')) + geom_point()
cont1_scatter

In [None]:
# Density of the target on the subsample, one curve per cat1 level.
target_density_by_cat1 = ggplot(train_small, aes(x='target', color='cat1')) + geom_density()
target_density_by_cat1

In [None]:
# Per-column value counts for the columns at positions 1 through 10 of train
# (presumably cat0-cat9 - confirm against the CSV column order).
train.iloc[:, 1:11].apply(pd.Series.value_counts)

## Submissions
0. Baseline [mean value] (Kaggle = 0.88498)  

1. OneHotEncoder for all cat columns (Kaggle = 0.86476)   
  DecisionTreeRegressor(max_leaf_nodes=150)


2. LabelEncoder for cat0-cat2 (Kaggle = 0.86476)  
  Reduce values in cat6-cat9  
  OneHotEncoder for cat3-cat9  
  DecisionTreeRegressor(max_leaf_nodes=150)

3. V1 + RandomForest(max_leaf_nodes=600) (Kaggle = 0.85604)
  
4. GradientBoosting (Kaggle = 0.84856)

5. GradientBoosting v2 (Kaggle = 0.84863)
  + Reduced values 
  + Drop a OH column for each.


In [None]:
# Mean of the training target: the constant prediction behind the baseline entry
# (submission 0) in the list above.
train.target.mean()

# Submission 1
- OneHotEncoder for all cat columns
- DecisionTreeRegressor(max_leaf_nodes=150, random_state=0)


In [None]:
# One-hot encode every categorical column. Columns not listed are dropped from the
# transformer output, and sparse_threshold=0 forces a dense result.
v1_onehot = OneHotEncoder(handle_unknown='ignore')
v1_cf = ColumnTransformer(
    transformers=[("cat", v1_onehot, cat_columns)],
    remainder="drop",
    sparse_threshold=0,
)

v1_cf.fit(train.drop(columns='target'))

In [None]:
# One-hot matrix wrapped in a frame aligned to train's index, then joined
# column-wise onto the non-categorical columns of train.
v1_onehot_train = pd.DataFrame(
    v1_cf.transform(train),
    index=train.index,
    columns=v1_cf.get_feature_names(),
)
v1_train = pd.concat([train.drop(columns=cat_columns), v1_onehot_train], axis=1)

In [None]:
# Preview the first rows of the encoded training frame.
v1_train.head()

In [None]:
# Hold-out split for submission 1: features are everything except id and target.
v1_features = v1_train.drop(columns=['id', 'target'])
v1_train_X, v1_val_X, v1_train_y, v1_val_y = train_test_split(
    v1_features, v1_train.target, random_state=0
)

In [None]:
def get_score(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree capped at ``max_leaf_nodes`` leaves and score it on the hold-out split.

    The tree is trained on ``train_X``/``train_y`` with a fixed ``random_state`` of 0.
    Returns the mean squared error of its predictions for ``val_X`` against ``val_y``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_squared_error(val_y, tree.predict(val_X))

In [None]:
# Sweep the leaf cap for the submission 1 features and print the validation MSE of each.
for max_leaf_nodes in (50, 100, 150, 200, 250):
    my_score = get_score(
        max_leaf_nodes,
        train_X=v1_train_X,
        val_X=v1_val_X,
        train_y=v1_train_y,
        val_y=v1_val_y,
    )
    print(f"Max leaf nodes: {max_leaf_nodes}  \t Score:  {my_score}")

In [None]:
# Apply the already-fitted one-hot transformer to the test set and join the result
# onto the non-categorical test columns, mirroring how v1_train was built.
v1_onehot_test = pd.DataFrame(
    v1_cf.transform(test),
    index=test.index,
    columns=v1_cf.get_feature_names(),
)
v1_test = pd.concat([test.drop(columns=cat_columns), v1_onehot_test], axis=1)

In [None]:
# Final submission 1 model: refit on the full training frame with 150 leaves.
v1_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=150)
v1_model.fit(v1_train.drop(columns=['id', 'target']), v1_train.target)

In [None]:
# Test-set predictions for submission 1; id is not a feature.
v1_preds = v1_model.predict(v1_test.drop(columns=['id']))

# Submission 2
- LabelEncoder for cat0-cat2 (just this alone was the same as OH)  
- OneHotEncoder for all other cat columns  
- Reduce values in cat6-cat9
- DecisionTreeRegressor(max_leaf_nodes=150, random_state=0)

In [None]:
# Wanted something like R fct_lump, but didn't find it.
def v2_reduce_cats(v2_df, cat_column, cat_values, other_label='other'):
    """Lump every level of a column that is not explicitly kept into one catch-all level.

    The column is rewritten in place as a pandas Categorical. Values that are not
    listed in ``cat_values`` (missing values included) become ``other_label``.

    Parameters
    ----------
    v2_df : pandas.DataFrame
        Frame to modify in place.
    cat_column : str
        Name of the column to reduce.
    cat_values : list
        Levels to keep. ``other_label`` is appended when it is absent, so a caller
        that forgets to list it no longer gets an error from ``fillna``.
    other_label : str, default 'other'
        Level assigned to every value outside ``cat_values``.

    Returns
    -------
    None
    """
    kept_levels = list(cat_values)
    # fillna() can only fill with an existing category, so make sure the catch-all is one.
    if other_label not in kept_levels:
        kept_levels.append(other_label)
    lumped = pd.Categorical(v2_df[cat_column], categories=kept_levels)
    v2_df[cat_column] = lumped.fillna(other_label)


In [None]:
# Work on a copy of train and collapse the levels of cat6-cat9 that are not listed
# below into 'other'.
v2_train = train.copy()
v2_kept_levels = {
    'cat6': ['A', 'B', 'other'],
    'cat7': ['B', 'D', 'E', 'G', 'other'],
    'cat8': ['A', 'C', 'D', 'E', 'G', 'other'],
    'cat9': ['A', 'B', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'other'],
}
for reduced_column, kept_levels in v2_kept_levels.items():
    v2_reduce_cats(v2_train, reduced_column, kept_levels)

In [None]:
# LabelEncoder wasn't working with the get_features_names() function, so I'll do it manually
# NOTE(review): the label encoder is fitted on cat0 only, yet later cells reuse it to
# transform cat1 and cat2; that only works while those columns contain no label that
# is missing from cat0 - confirm against the data.
v2_cf = ColumnTransformer(
    transformers=[
        ("catOneHot", OneHotEncoder(handle_unknown='ignore'), cat_columns[3:]),
    ],
    remainder="drop",
    sparse_threshold=0,
)

v2_label_enc = LabelEncoder().fit(train.cat0)
v2_cf.fit(v2_train.drop(columns='target'))

In [None]:
# One-hot part first (from the reduced frame), then the three label-encoded columns
# appended after it, and finally everything joined onto the non-categorical columns.
v2_onehot_train = pd.DataFrame(
    v2_cf.transform(v2_train),
    index=train.index,
    columns=v2_cf.get_feature_names(),
)
for label_col in ('cat0', 'cat1', 'cat2'):
    v2_onehot_train[label_col] = v2_label_enc.transform(train[label_col])
v2_train = pd.concat([train.drop(columns=cat_columns), v2_onehot_train], axis=1)

In [None]:
# Hold-out split for submission 2: features are everything except id and target.
v2_features = v2_train.drop(columns=['id', 'target'])
v2_train_X, v2_val_X, v2_train_y, v2_val_y = train_test_split(
    v2_features, v2_train.target, random_state=0
)

In [None]:
# Same leaf-cap sweep as for submission 1, now on the submission 2 features.
for max_leaf_nodes in (50, 100, 150, 200, 250):
    my_score = get_score(
        max_leaf_nodes,
        train_X=v2_train_X,
        val_X=v2_val_X,
        train_y=v2_train_y,
        val_y=v2_val_y,
    )
    print(f"Max leaf nodes: {max_leaf_nodes}  \t Score:  {my_score}")

In [None]:
# Final submission 2 model: refit on the full encoded training frame with 150 leaves.
v2_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=150)
v2_model.fit(v2_train.drop(columns=['id', 'target']), v2_train.target)

In [None]:
# Repeat the submission 2 preprocessing on the test set: reduce cat6-cat9, one-hot
# encode with the fitted transformer, label-encode cat0-cat2, then join onto the
# non-categorical test columns.
v2_test = test.copy()
for reduced_column, kept_levels in (
    ('cat6', ['A', 'B', 'other']),
    ('cat7', ['B', 'D', 'E', 'G', 'other']),
    ('cat8', ['A', 'C', 'D', 'E', 'G', 'other']),
    ('cat9', ['A', 'B', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'other']),
):
    v2_reduce_cats(v2_test, reduced_column, kept_levels)

v2_onehot_test = pd.DataFrame(
    v2_cf.transform(v2_test),
    index=test.index,
    columns=v2_cf.get_feature_names(),
)
for label_col in ('cat0', 'cat1', 'cat2'):
    v2_onehot_test[label_col] = v2_label_enc.transform(test[label_col])
v2_test = pd.concat([test.drop(columns=cat_columns), v2_onehot_test], axis=1)

In [None]:
# Test-set predictions for submission 2; id is not a feature.
v2_preds = v2_model.predict(v2_test.drop(columns=['id']))

# Submission 3
- OneHotEncoder for all cat columns
- RandomForestRegressor(max_leaf_nodes=600, random_state=0)

In [None]:
def v3_get_score(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a 10-tree random forest capped at ``max_leaf_nodes`` leaves per tree and score it.

    The forest uses all available cores (``n_jobs=-1``), logs progress
    (``verbose=True``) and a fixed ``random_state`` of 0. Returns the mean squared
    error of its predictions for ``val_X`` against ``val_y``.
    """
    forest_params = dict(
        n_estimators=10,
        n_jobs=-1,
        max_leaf_nodes=max_leaf_nodes,
        verbose=True,
        random_state=0,
    )
    forest = RandomForestRegressor(**forest_params)
    forest.fit(train_X, train_y)
    return mean_squared_error(val_y, forest.predict(val_X))

In [None]:
# Leaf-cap sweep for the random forest, reusing the submission 1 split.
for max_leaf_nodes in (500, 600, 700, 800, 900):
    my_score = v3_get_score(
        max_leaf_nodes,
        train_X=v1_train_X,
        val_X=v1_val_X,
        train_y=v1_train_y,
        val_y=v1_val_y,
    )
    print(f"Max leaf nodes: {max_leaf_nodes}  \t Score:  {my_score}")

In [None]:
# Final submission 3 model: 100 trees with 600 leaves each, fitted on the full
# one-hot training frame.
v3_model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    max_leaf_nodes=600,
    random_state=0,
)
v3_model.fit(v1_train.drop(columns=['id', 'target']), v1_train.target)

In [None]:
# Test-set predictions for submission 3 on the submission 1 test features.
v3_preds = v3_model.predict(v1_test.drop(columns=['id']))

# Submission 4
- OneHotEncoder for all cat columns
- GradientBoostingRegressor(max_depth = 5, random_state=0)

In [None]:
def v4_get_score(max_depth, train_X, val_X, train_y, val_y):
    """Fit a gradient-boosting regressor with the given tree depth and score it on the hold-out split.

    The booster is built with ``verbose=1``, ``n_iter_no_change=5`` and a fixed
    ``random_state`` of 0. Returns the mean squared error of its predictions for
    ``val_X`` against ``val_y``.
    """
    booster = GradientBoostingRegressor(
        random_state=0,
        n_iter_no_change=5,
        max_depth=max_depth,
        verbose=1,
    )
    booster.fit(train_X, train_y)
    val_predictions = booster.predict(val_X)
    return mean_squared_error(val_y, val_predictions)

In [None]:
# Depth sweep for gradient boosting on the submission 1 split.
# (An earlier variant of this sweep also included depth 2.)
for max_depth in (3, 4, 5):
    my_score = v4_get_score(
        max_depth,
        train_X=v1_train_X,
        val_X=v1_val_X,
        train_y=v1_train_y,
        val_y=v1_val_y,
    )
    print(f"Max depth: {max_depth}  \t Score:  {my_score}")

In [None]:
# Final submission 4 model: depth-5 gradient boosting fitted on the full one-hot
# training frame.
v4_params = dict(max_depth=5, n_iter_no_change=5, random_state=0)
v4_model = GradientBoostingRegressor(**v4_params)
v4_model.fit(v1_train.drop(columns=['id', 'target']), v1_train.target)

In [None]:
# Test-set predictions for submission 4 on the submission 1 test features.
v4_preds = v4_model.predict(v1_test.drop(columns=['id']))

# Submission 5
- LabelEncoder for cat0-cat2
- OneHotEncoder for all other cat columns
- Reduce values in cat6-cat9
- Drop a column for OneHot columns
- GradientBoostingRegressor(max_depth = 5, random_state=0)

In [None]:
def v5_get_score(max_depth, train_X, val_X, train_y, val_y):
    """Validation MSE of a gradient-boosting regressor with trees of depth ``max_depth``.

    The settings are ``verbose=1``, ``n_iter_no_change=5`` and ``random_state=0``.
    The model is fitted on ``train_X``/``train_y`` and evaluated with mean squared
    error on ``val_X``/``val_y``.
    """
    gbr_settings = {
        'verbose': 1,
        'max_depth': max_depth,
        'n_iter_no_change': 5,
        'random_state': 0,
    }
    gbr = GradientBoostingRegressor(**gbr_settings)
    gbr.fit(train_X, train_y)
    return mean_squared_error(val_y, gbr.predict(val_X))

In [None]:
# Submission 5 starts from a copy of the submission 2 training frame.
v5_train = v2_train.copy()
# Preview the first rows.
v5_train.head()

In [None]:
# Drop:
# cat3 == B
# cat4 == C
# cat5 == A
# cat6, cat7, cat8, cat9 == other
# The result is derived from v2_train rather than from v5_train itself, so this cell
# can be re-run without a KeyError for columns that were already dropped.
v5_dropped_onehot = [
    'catOneHot__x0_B',
    'catOneHot__x1_C',
    'catOneHot__x2_A',
    'catOneHot__x3_other',
    'catOneHot__x4_other',
    'catOneHot__x5_other',
    'catOneHot__x6_other',
]
v5_train = v2_train.drop(columns=v5_dropped_onehot)

In [None]:
# Hold-out split for submission 5: features are everything except id and target.
v5_features = v5_train.drop(columns=['id', 'target'])
v5_train_X, v5_val_X, v5_train_y, v5_val_y = train_test_split(
    v5_features, v5_train.target, random_state=0
)

In [None]:
# Depth sweep for gradient boosting on the submission 5 split.
# (A single-depth variant, [3], was used for quick checks.)
for max_depth in (2, 3, 4, 5):
    my_score = v5_get_score(
        max_depth,
        train_X=v5_train_X,
        val_X=v5_val_X,
        train_y=v5_train_y,
        val_y=v5_val_y,
    )
    print(f"Max depth: {max_depth}  \t Score:  {my_score}")

In [None]:
# Final submission 5 model: depth-5 gradient boosting fitted on the full reduced
# training frame.
v5_params = dict(max_depth=5, n_iter_no_change=5, random_state=0)
v5_model = GradientBoostingRegressor(**v5_params)
v5_model.fit(v5_train.drop(columns=['id', 'target']), v5_train.target)

In [None]:
# Repeat the submission 5 preprocessing on the test set: reduce cat6-cat9, one-hot
# encode with the submission 2 transformer, label-encode cat0-cat2, join onto the
# non-categorical test columns, and drop the same one-hot columns as in training.
v5_test = test.copy()
for reduced_column, kept_levels in (
    ('cat6', ['A', 'B', 'other']),
    ('cat7', ['B', 'D', 'E', 'G', 'other']),
    ('cat8', ['A', 'C', 'D', 'E', 'G', 'other']),
    ('cat9', ['A', 'B', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'other']),
):
    v2_reduce_cats(v5_test, reduced_column, kept_levels)

v5_onehot_test = pd.DataFrame(
    v2_cf.transform(v5_test),
    index=test.index,
    columns=v2_cf.get_feature_names(),
)
for label_col in ('cat0', 'cat1', 'cat2'):
    v5_onehot_test[label_col] = v2_label_enc.transform(test[label_col])

v5_test = pd.concat([test.drop(columns=cat_columns), v5_onehot_test], axis=1).drop(
    columns=[
        'catOneHot__x0_B',
        'catOneHot__x1_C',
        'catOneHot__x2_A',
        'catOneHot__x3_other',
        'catOneHot__x4_other',
        'catOneHot__x5_other',
        'catOneHot__x6_other',
    ]
)

In [None]:
# Test-set predictions for submission 5; id is not a feature.
v5_preds = v5_model.predict(v5_test.drop(columns=['id']))

# Submission code

In [None]:
# Overwrite the target column of the sample-submission frame with the submission 5
# predictions. Swap in another *_preds array to submit a different model.
submission['target'] = v5_preds
#submission.to_csv('competitions/tabular-playground-series-feb-2021/submission.csv', index=False)
# Write the file to the working directory without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Density of the submitted predictions as a quick visual check.
submission_density = ggplot(submission, aes(x='target')) + geom_density()
submission_density

In [None]:
#!kaggle competitions submit -c tabular-playground-series-feb-2021 -f competitions/tabular-playground-series-feb-2021/submission.csv -m 'Second one'
# Upload submission.csv to the competition through the Kaggle CLI with the given message.
# NOTE(review): this line is live, so every full run of the notebook submits again, while
# the kaggle.json setup cells at the top are commented out - confirm how the CLI is
# expected to find credentials.
!kaggle competitions submit -c tabular-playground-series-feb-2021 -f submission.csv -m 'GradientBoostingRegressor v2'

# Lessons learned
This month I just focused on simple tree models. I got respectable results, and was able to see the benefits of OneHot encoding. 

So far I have been using my own helper functions to test different hyperparameter values. The next step is to use GridSearchCV to search over more hyperparameters, which is a more robust long-term solution.

I was also able to work with the Kaggle API and Google Colab. I really enjoyed working out of Google Colab. It has a great UI and the code autocompletion and help info made it really easy to work with.

The Kaggle API took a minute to figure out, but made submitting very easy once I got the setup right.
