# GOAL

<br/>
Experimenting with Random Forests.

In [1]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display


import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline

In [2]:
#Seaborn Styles
sns.set(rc={'figure.figsize':(10,7)}) # figure sizes
sns.set_palette('colorblind') ## Always colorblind

In [3]:
PATH = 'data/'

## DATA

TRAIN

In [4]:
df_train = pd.read_csv(f'{PATH}train.csv', low_memory=False)

In [5]:
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


TEST

In [6]:
test = pd.read_csv(f'{PATH}test.csv', low_memory=False)

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [8]:
cat_columns = [col for col in df_train.columns if df_train[col].dtype == 'O']
cat_columns

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

In [9]:
for col in cat_columns:
    print(f'{col} has {df_train[col].nunique()} categories')

X0 has 47 categories
X1 has 27 categories
X2 has 44 categories
X3 has 7 categories
X4 has 4 categories
X5 has 29 categories
X6 has 12 categories
X8 has 25 categories


In [10]:
id_cols = ["ID"]
y = df_train.y
exclude = id_cols + ['y'] + cat_columns


num_columns = df_train.drop(exclude, axis = 1).columns

In [11]:
# Checking for missing data 
np.sum(df_train.isnull().sum()/len(df_train) > 0) # No columns with missing data

0

<br/>
<br/>
<br/>
<br/>



## ML 

<br/>
<br/>
<br/>


### ROUND 1 
Trained and evaluated on TRAINING SET

In [12]:
X = df_train.drop('y', axis = 1)
y = df_train.y

In [13]:
X.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,6,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,7,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,9,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,13,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
from category_encoders import TargetEncoder

Due to the high cardinality of the categorical columns, we will try the TargetEncoder to retain dimensionality of the dataset

In [15]:
target_encoder = TargetEncoder(cols= cat_columns)

All the other numerical columns are binary

In [16]:
np.sum(df_train[num_columns].nunique() > 2)

0

In [17]:
target_encoder.fit_transform(X.drop('ID', axis = 1), y)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,99.491872,101.412574,104.194580,102.507477,100.658293,100.669318,101.165245,97.746933,0,0,...,0,0,1,0,0,0,0,0,0,0
1,99.491872,93.723226,95.754685,100.033190,100.658293,100.669318,98.945502,97.746933,0,0,...,1,0,0,0,0,0,0,0,0,0
2,78.025543,95.764808,83.369927,101.959269,100.658293,84.418384,101.165245,98.577238,0,0,...,0,0,0,0,0,0,1,0,0,0
3,78.025543,93.723226,83.369927,96.564507,100.658293,84.418384,98.945502,104.976311,0,0,...,0,0,0,0,0,0,0,0,0,0
4,78.025543,101.412574,83.369927,96.564507,100.658293,100.669318,101.346464,102.194215,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,112.552235,101.868462,103.732453,101.959269,100.658293,98.672500,101.346464,102.551197,0,0,...,1,0,0,0,0,0,0,0,0,0
4205,112.072707,96.316707,109.617931,105.114414,100.658293,98.672500,103.134842,103.446154,0,0,...,0,1,0,0,0,0,0,0,0,0
4206,112.552235,101.412574,108.393464,102.507477,100.658293,98.672500,100.306286,104.976311,0,0,...,0,0,1,0,0,0,0,0,0,0
4207,93.221343,100.095378,98.080370,96.564507,100.658293,98.672500,98.945502,101.156723,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Transform 
X1 = target_encoder.fit_transform(X.drop('ID', axis = 1), y)

# fit 
m1 = RandomForestRegressor(n_jobs=-1)
m1.fit(X1, y)

RandomForestRegressor(n_jobs=-1)

In [19]:
print(f'Coefficient of determination r^2 : {m1.score(X1, y)}')

Coefficient of determination r^2 : 0.91040931010287


<br/>
<br/>
<br/>


### ROUND 2   
Evaluated on validation set  (separate from test set)  
VALIDATION SET created from train set

In [20]:
# def split_vals(a, n): return a[:n].copy() , a[n:].copy()

# n_valid = int(len(X) * 0.2)

# n_train = len(X) - n_valid
# X_train, X_valid = split_vals(X, n_train)
# y_train, y_valid = split_vals(y, n_train)

# X_train.shape, y_train.shape, X_valid.shape

The above method is splitting the train/valid data assuming a certain order in which the data was provided. When we split the data randomly, the r^2 is decreasing.   
Maintaining an order might make sense when you have a timestamp column, but otherwise the split should be random. 

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size = 0.3)

In [22]:
# Transform 
X2_train = target_encoder.fit_transform(X_train.drop('ID', axis = 1), y_train)

# fit 
m2 = RandomForestRegressor(n_jobs=-1)
%time m2.fit(X2_train, y_train)

CPU times: user 8.46 s, sys: 52.6 ms, total: 8.51 s
Wall time: 1.29 s


RandomForestRegressor(n_jobs=-1)

In [23]:
# VALIDATION SET

X_valid_transformed = target_encoder.fit_transform(X_valid.drop('ID', axis = 1), y_valid)
print(f'Coefficient of determination r^2 : {m2.score(X_valid_transformed, y_valid)}')

Coefficient of determination r^2 : 0.3541908776298298


Score is fluctuating a lot depending on train test split.

In [24]:
#TEST SET 
X_test_transformed = target_encoder.transform(test.drop("ID", axis = 1))
pred = m2.predict(X_test_transformed)

In [25]:
final_submission = test.copy()
final_submission["predicted_time_in_seconds"] = pred
final_submission

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X376,X377,X378,X379,X380,X382,X383,X384,X385,predicted_time_in_seconds
0,1,az,v,n,f,d,t,a,w,0,...,0,0,1,0,0,0,0,0,0,77.811500
1,2,t,b,ai,a,d,b,g,y,0,...,0,1,0,0,0,0,0,0,0,96.181222
2,3,az,v,as,f,d,a,j,j,0,...,0,0,1,0,0,0,0,0,0,77.804000
3,4,az,l,n,f,d,z,l,n,0,...,0,0,1,0,0,0,0,0,0,81.080000
4,5,w,s,as,c,d,y,i,m,0,...,0,0,0,0,0,0,0,0,0,116.710250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,aj,h,as,f,d,aa,j,e,0,...,0,0,0,0,0,0,0,0,0,107.120433
4205,8411,t,aa,ai,d,d,aa,j,y,0,...,1,0,0,0,0,0,0,0,0,100.356552
4206,8413,y,v,as,f,d,aa,d,w,0,...,0,0,0,0,0,0,0,0,0,92.765954
4207,8414,ak,v,as,a,d,aa,c,q,0,...,0,1,0,0,0,0,0,0,0,119.078600


<br/>
<br/>
<br/>


### SINGLE TREE

In [26]:
m_single = RandomForestRegressor(n_estimators=1, bootstrap = False, n_jobs= -1)
m_single.fit(X2_train, y_train)
print(f'Coefficient of determination r^2  for TRAINING DATASET: {m_single.score(X2_train, y_train)}')

print(f'Coefficient of determination r^2  for TRAINING DATASET: {m_single.score(X_valid_transformed, y_valid)}')

Coefficient of determination r^2  for TRAINING DATASET: 0.9850620925844631
Coefficient of determination r^2  for TRAINING DATASET: -0.13473207882553373


Overfits and generalizes very very poorly.

In [27]:
from sklearn import tree
from sklearn.tree import plot_tree

In [None]:
fig, ax = plt.subplots(figsize=(25, 20))
_ = tree.plot_tree(m_single.estimators_[0], feature_names= X2_train.columns, filled=True)

<br/>
<br/>
<br/>


### ROUND 3
Cross validation instead of single train/valid split.  
Evaluated on validation set  
VALIDATION SET created from train set