# Decision Trees Model

## Loading the data

In [70]:
import pandas as pd
import numpy as np

In [71]:
data = pd.read_csv('../data/clean/Global_YouTube_Statistics1.csv')
data.head()

Unnamed: 0,Youtuber,category,Country,subscribers,video views,uploads,video_views_for_the_last_30_days,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,subscribers_for_last_30_days
0,T-Series,Music,India,245000000,228000000000.0,20082,2258000000.0,564600.0,9000000.0,6800000.0,108400000.0,2000000.0
1,YouTube Movies,Film & Animation,United States,170000000,0.0,1,12.0,0.0,0.05,0.04,0.58,100000.0
2,MrBeast,Entertainment,United States,166000000,28368840000.0,741,1348000000.0,337000.0,5400000.0,4000000.0,64700000.0,8000000.0
3,Cocomelon - Nursery Rhymes,Education,United States,162000000,164000000000.0,966,1975000000.0,493800.0,7900000.0,5900000.0,94800000.0,1000000.0
4,SET India,Shows,India,159000000,148000000000.0,116536,1824000000.0,455900.0,7300000.0,5500000.0,87500000.0,1000000.0


## Selecting y

In [72]:
y = data['subscribers']
X = data.drop(columns=['subscribers', 'Youtuber', 'category', 'Country'], axis=1) # I drop 'Youtuber', 'category', 'Country' because they have a high cardinality. It does not make sense to HotEncode them.

In [73]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 1)


X_train_df = pd.DataFrame(X_train, columns=X.columns)


X_test_df = pd.DataFrame(X_test, columns=X.columns)


In [74]:
X_train_df.dtypes

video views                         float64
uploads                               int64
video_views_for_the_last_30_days    float64
lowest_monthly_earnings             float64
highest_monthly_earnings            float64
lowest_yearly_earnings              float64
highest_yearly_earnings             float64
subscribers_for_last_30_days        float64
dtype: object

In [75]:
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import dtreeviz
import graphviz
import graphviz.backend as be
from IPython.display import Image, display_svg, SVG
import warnings
warnings.filterwarnings( "ignore", module = "matplotlib\..*" )

In [76]:
regr = DecisionTreeRegressor(max_depth=5,
                             criterion = 'squared_error',
                             min_samples_split=2,
                             min_samples_leaf = 1,
                             max_features = 6)
regr.fit(X_train, y_train)
print("Train data R2 was: {:.2f}".format(regr.score(X_train, y_train)))
print("test data R2 was: {:.2f}".format(regr.score(X_test, y_test)))

Train data R2 was: 0.85
test data R2 was: 0.42


## Crossvalidation

In [77]:
from sklearn.model_selection import cross_validate

# We always do the CV on the TRAIN set. The option cv is the number of folds K
results = cross_validate(regr, X_train, y_train, cv = 5)

In [78]:
results

{'fit_time': array([0.00997257, 0.01496363, 0.00797844, 0.00996995, 0.00950432]),
 'score_time': array([0.00299191, 0.00298858, 0.00199652, 0.00399041, 0.00297928]),
 'test_score': array([0.61014278, 0.03716965, 0.19845164, 0.37517528, 0.21253667])}

In [79]:
print(results['test_score'])
print("The average R2 over the folds is: {:.2f}".format(results['test_score'].mean()))
print("The standard deviation of R2 over the folds is: {:.2f}".format(results['test_score'].std()))

[0.61014278 0.03716965 0.19845164 0.37517528 0.21253667]
The average R2 over the folds is: 0.29
The standard deviation of R2 over the folds is: 0.19


In [80]:
regr = DecisionTreeRegressor(max_depth=15,
                             criterion = 'squared_error',
                             min_samples_split=20,
                             min_samples_leaf = 10,
                             max_features = 10)

In [82]:
results = cross_validate(regr, X_train, y_train, cv = 5)
print(results['test_score'])
print("The average R2 over the folds is: {:.2f}".format(results['test_score'].mean()))
print("The standard deviation of R2 over the folds is: {:.2f}".format(results['test_score'].std()))

[0.35461944 0.48329267 0.11846637 0.35295232 0.3012947 ]
The average R2 over the folds is: 0.32
The standard deviation of R2 over the folds is: 0.12


## GridSearchCV

In [83]:
from sklearn.model_selection import GridSearchCV

max_depth_choices = [3,5] # A list of the possible values of max_depth to try
criterion_choices = ['squared_error','absolute_error'] # A list of the possible values optimization metrics
min_samples_split_choices = [2,10] # A list of the possible values of min_samples_split to try
min_samples_leaf_choices = [2,10] # A list of the possible values of min_samples_leaf to try

In [84]:
# Create the  grid
# this is a dictionary from hyperparameters to potential values
# the keys in this dictionary have to match the names of the hyperparameters in the documentation of the model
grid = {'max_depth': max_depth_choices,
        'criterion': criterion_choices,
        'min_samples_split': min_samples_split_choices,
        'min_samples_leaf': min_samples_leaf_choices}

In [85]:
# Instantiate the grid search model object

# estimator -> model to optimize
model = DecisionTreeRegressor()
# param_grid -> state the dictionary of parameters to optimize
# cv = 5 -> number of cross validation folds.
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = 5)

In [86]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

In [87]:
# and the winner is...
grid_search.best_params_

{'criterion': 'squared_error',
 'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 10}

In [88]:
# in grid search you are more likely to get really good results in your training set, even with CV
print("The best R2 for the best hyperparameters is {:.2f}".format(grid_search.best_score_))

The best R2 for the best hyperparameters is 0.46


## RandomizedSearchCV

In [98]:
from sklearn.model_selection import RandomizedSearchCV

max_depth_choices= np.random.randint(low=1, high=len(X.columns), size=7) # A random integer between 1 and the number of columns
criterion_choices =  ['squared_error','absolute_error'] # A list of the possible values optimization metrics
min_samples_split_choices = np.random.randint(low=2, high=20, size=7) # A random integer between 1 and the number of columns
min_samples_leaf_choices = np.random.randint(low=2, high=20, size=7) # A random integer between 1 and the number of columns
max_features_choices = np.random.randint(low=1, high=len(X.columns), size=7) # A random integer between 1 and the number of columns

random_grid = {'max_depth': max_depth_choices,
               'criterion': criterion_choices,
               'min_samples_split': min_samples_split_choices,
               'min_samples_leaf': min_samples_leaf_choices,
               'max_features': max_features_choices}

In [99]:
#trying grid search

model = DecisionTreeRegressor()
grid_search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, cv = 5, random_state=12)
grid_search.fit(X_train, y_train)

In [100]:
grid_search.best_params_

{'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 5,
 'max_depth': 3,
 'criterion': 'absolute_error'}

In [101]:
print("The best R2 according to the random search is {:.2f}".format(grid_search.best_score_))

The best R2 according to the random search is 0.34


In [102]:
#and now more realistic
model = DecisionTreeRegressor()
# n_iter is how many random combinations of hyperparameters will test use the computer.
random_search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter=25, cv = 6, n_jobs = 6) # n_jobs = number_processors - 2

In [103]:
random_search.fit(X_train,y_train)

In [104]:
random_search.best_params_

{'min_samples_split': 17,
 'min_samples_leaf': 8,
 'max_features': 7,
 'max_depth': 6,
 'criterion': 'absolute_error'}

In [105]:
print("The best R2 according to the random search is {:.2f}".format(random_search.best_score_))

The best R2 according to the random search is 0.42


In [106]:
random_search.cv_results_['mean_test_score'].mean(), random_search.cv_results_['mean_test_score'].std()

(0.2105954840462073, 0.13664911870596516)

The score is too low to consider this model good enough. I will check Random_Forest.