# Decision Trees Model

In [2]:
import pandas as pd
import numpy as np

## Loading the data

In [3]:
data = pd.read_csv('../data/clean/Global_YouTube_Statistics1.csv')
data.head()

Unnamed: 0,Youtuber,category,Country,subscribers,video views,uploads,video_views_for_the_last_30_days,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,subscribers_for_last_30_days
0,T-Series,Music,India,245000000,228000000000.0,20082,2258000000.0,564600.0,9000000.0,6800000.0,108400000.0,2000000.0
1,YouTube Movies,Film & Animation,United States,170000000,0.0,1,12.0,0.0,0.05,0.04,0.58,100000.0
2,MrBeast,Entertainment,United States,166000000,28368840000.0,741,1348000000.0,337000.0,5400000.0,4000000.0,64700000.0,8000000.0
3,Cocomelon - Nursery Rhymes,Education,United States,162000000,164000000000.0,966,1975000000.0,493800.0,7900000.0,5900000.0,94800000.0,1000000.0
4,SET India,Shows,India,159000000,148000000000.0,116536,1824000000.0,455900.0,7300000.0,5500000.0,87500000.0,1000000.0


## Selecting y

In [4]:
y = data['subscribers']
X = data.drop(columns=['subscribers', 'Youtuber', 'category', 'Country'], axis=1) # I drop 'Youtuber', 'category', 'Country' because they have a high cardinality. It does not make sense to HotEncode them.

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 1)


X_train_df = pd.DataFrame(X_train, columns=X.columns)


X_test_df = pd.DataFrame(X_test, columns=X.columns)


## Comparing several models with CV

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [24]:
model1 = DecisionTreeRegressor()
model2 = LinearRegression() # y = b0 + b1 * x1 + b2 * x2 +.....
model3 = KNeighborsRegressor() # weights = "uniform","distance"

# data should really be scaled here
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled_np = scaler.transform(X_train)
X_test_scaled_np  = scaler.transform(X_test)

X_train_scaled_df = pd.DataFrame(X_train_scaled_np)
X_test_scaled_df  = pd.DataFrame(X_test_scaled_np)
#...

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN']

scores = {}

for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_scaled_df, y_train, cv=5)).round(2)
    scores[model_name] = mean_score
print(scores)

# We can use the result to choose the best performing model

{'Decision Tree Regressor': 0.08, 'Linear Regression': 0.5, 'KNN': 0.39}


In [17]:
val_scores = {}

for model, model_name in zip(model_pipeline,model_names):
    model.fit(X_train_scaled_df, y_train)
    val_scores[model_name] = model.score(X_test_scaled_df,y_test).round(2)
print(val_scores)

{'Decision Tree Regressor': 0.41, 'Linear Regression': 0.58, 'KNN': 0.44}


# Random Forest

In [18]:
data['subscribers'].value_counts()

subscribers
12500000     22
12400000     20
15000000     17
14500000     17
14400000     16
             ..
36500000      1
22800000      1
36200000      1
36100000      1
245000000     1
Name: count, Length: 289, dtype: int64

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

clf = RandomForestClassifier(max_depth=10,
                             min_samples_split=10,
                             min_samples_leaf =10,
                             max_samples=100,
                             random_state = 42)

clf.fit(X_train, y_train)

print("The Accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train, y_train)))
print("The Accuracy for the Random Forest in the TEST  set is {:.2f}".format(clf.score(X_test, y_test)))

y_test_pred = clf.predict(X_test)

The Accuracy for the Random Forest in the TRAIN set is 0.08
The Accuracy for the Random Forest in the TEST  set is 0.04


# Conclusion

The data in my dataset does not allow me to train a model that can predict the variance in the column subscribers with enough accuracy. 