# Linear Model

## Loading the data

In [355]:
import pandas as pd
import numpy as np

In [356]:
# Load the cleaned dataset; the relative path is resolved against the notebook's working directory.
data = pd.read_csv('../data/clean/Global_YouTube_Statistics1.csv')
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,Youtuber,category,Country,subscribers,video views,uploads,video_views_for_the_last_30_days,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,subscribers_for_last_30_days
0,T-Series,Music,India,245000000,228000000000.0,20082,2258000000.0,564600.0,9000000.0,6800000.0,108400000.0,2000000.0
1,YouTube Movies,Film & Animation,United States,170000000,0.0,1,12.0,0.0,0.05,0.04,0.58,100000.0
2,MrBeast,Entertainment,United States,166000000,28368840000.0,741,1348000000.0,337000.0,5400000.0,4000000.0,64700000.0,8000000.0
3,Cocomelon - Nursery Rhymes,Education,United States,162000000,164000000000.0,966,1975000000.0,493800.0,7900000.0,5900000.0,94800000.0,1000000.0
4,SET India,Shows,India,159000000,148000000000.0,116536,1824000000.0,455900.0,7300000.0,5500000.0,87500000.0,1000000.0


In [357]:
# Percentage of missing values per column, laid out as a two-column table
# (column_name, nulls_percentage).
nulls_percent_df = (
    (data.isna().sum() / len(data) * 100)
    .rename_axis('column_name')
    .reset_index(name='nulls_percentage')
)
nulls_percent_df

Unnamed: 0,column_name,nulls_percentage
0,Youtuber,0.0
1,category,0.0
2,Country,0.0
3,subscribers,0.0
4,video views,0.0
5,uploads,0.0
6,video_views_for_the_last_30_days,0.0
7,lowest_monthly_earnings,0.0
8,highest_monthly_earnings,0.0
9,lowest_yearly_earnings,0.0


## Selecting X and y

In [358]:
# Target: the number of subscribers of each channel.
y = data['subscribers']
# Only 'video views' is used to predict subscribers because its value in the correlation
# matrix is 0.75; the values of the remaining columns are too low to be taken into account.
X = data[['video views']]

In [359]:
X.columns

Index(['video views'], dtype='object')

In [360]:
type(y)

pandas.core.series.Series

In [361]:
type(X)

pandas.core.frame.DataFrame

In [362]:
import sklearn

## Train/Test Split


Import sklearn's `train_test_split` and split the data into train and test sets. Set test_size=0.30 and random_state=158.

In [363]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=158)

Keep only the numerical columns of X_train and X_test.

In [364]:
# Rebuild the train/test frames with the columns of X and the row labels produced by the split.
# NOTE(review): X is already a DataFrame, so this re-wrap looks redundant - confirm before removing.
X_train_df = pd.DataFrame(X_train, columns=X.columns, index=X_train.index)
X_test_df   = pd.DataFrame(X_test,  columns=X.columns, index=X_test.index)

# Keep only the numeric columns; these are the frames the transformer below is fitted on.
X_train_num = X_train_df.select_dtypes(np.number)
X_test_num  = X_test_df.select_dtypes(np.number)


## Transformer

In [365]:
import pickle
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()

pt.fit(X_train_num) # fit() estimates the power-transform parameters of each column using the TRAIN set only

path = "../transformers/"
scaler_file_name = "power_transformer_scaler.pkl"

# Persist the fitted transformer so it can be reused later.
# NOTE(review): open() does not create the target directory - it is assumed to exist already.
with open(path + scaler_file_name, "wb") as file:
    pickle.dump(pt, file)

X_train_scaled_num = pt.transform(X_train_num) # .transform() applies the fitted transformation; the result is a np.array
X_test_scaled_num  = pt.transform(X_test_num ) # the test set is transformed with the parameters learned on the train set

# We create new Pandas DataFrames out of the Numpy arrays, restoring the original column names and row labels.

X_train_scaled_df_num = pd.DataFrame(X_train_scaled_num, columns=X_train_num.columns, index=X_train_num.index)
X_test_scaled_df_num  = pd.DataFrame(X_test_scaled_num,  columns=X_test_num.columns, index=X_test_num.index)

X_train_scaled_df_num.head()

Unnamed: 0,video views
463,0.3614
900,-0.925151
722,0.46159
869,-0.021488
211,1.425427


In [366]:
# Plain aliases: no copy is made, both names refer to the same power-transformed frames.
X_train_new = X_train_scaled_df_num 
X_test_new = X_test_scaled_df_num 
X_train_new.head()

Unnamed: 0,video views
463,0.3614
900,-0.925151
722,0.46159
869,-0.021488
211,1.425427


Fit a MinMax scaler using X_train_new and transform X_train_new and X_test_new.

In [367]:
import pickle
from sklearn.preprocessing import MinMaxScaler # Sets for each colum the minimum = 0 and the maximum = 1

scaler = MinMaxScaler()

scaler.fit(X_train_new) # fit() determines the min and the max of each column in the TRAIN set

path = "../scalers/"
scaler_file_name = "min_max_scaler.pkl"

# Persist the fitted scaler so it can be reused later.
# NOTE(review): open() does not create the target directory - it is assumed to exist already.
with open(path + scaler_file_name, "wb") as file:
    pickle.dump(scaler, file)

X_train_scaled2 = scaler.transform(X_train_new) # .transform() applies the scaling; the result is a np.array
X_test_scaled2  = scaler.transform(X_test_new) # the test set is scaled with the min/max learned on the train set

# New Pandas DataFrames are created out of the Numpy arrays, keeping the column names and row labels.

X_train_new_scaled_df2 = pd.DataFrame(X_train_scaled2, columns=X_train_new.columns, index=X_train_new.index)
X_test_new_scaled2_df2  = pd.DataFrame(X_test_scaled2,  columns=X_train_new.columns, index=X_test_new.index)

X_train_new_scaled_df2.head()

Unnamed: 0,video views
463,0.443567
900,0.300705
722,0.454692
869,0.40105
211,0.561719


## Creation of the Linear Model

In [387]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the scaled training frame.
linear_model = LinearRegression().fit(X_train_new_scaled_df2, y_train)

# Persist the fitted model next to the other saved artefacts.
path = "../models/"
model_file_name = "linear_model.pkl"

with open(path + model_file_name, "wb") as file:
    pickle.dump(linear_model, file)

In [369]:
# Predictions of the linear model for the train and test sets, in that order.
y_train_pred, y_test_pred = (
    linear_model.predict(features)
    for features in (X_train_new_scaled_df2, X_test_new_scaled2_df2)
)

In [370]:
y_train_pred

array([26224924.48547941, 14658040.75701566, 27125696.8493628 ,
       22782526.79339844, 35791176.28732453, 25653651.30226452,
       37053995.71899047, 38829002.64474487, 24320728.64101694,
       20028971.38678957, 31064412.8466837 , 25525052.0920921 ,
       18297482.72098206, 24173305.10747578, 22204241.74595594,
       32018132.02195555, 26945564.1796641 ,  9657283.78532613,
       15922316.41535954, 26656291.10381547, 39751850.95141445,
       19703435.1363415 , 19748419.87810752, 18245420.76944606,
       13704361.24228175, 26450047.85597122, 29477095.84132856,
       22393793.3858984 , 17402686.60910669, 29455763.11583106,
       19213189.70352963, 53378076.46736997, 19521720.93261228,
       25015066.21854652, 29191139.01597787, 27640609.22657278,
       17733713.36761415, 16865186.60921418, 14822657.3439323 ,
       19401207.75301745, 13333704.15929329, 22132054.08301052,
       22598951.78923497, 17290722.67063352, 17929861.66839594,
       19156648.49351322, 26139866.02622

In [371]:
y_test_pred

array([27034750.10044698, 31616310.65387674, 32406043.00974778,
       20995202.04180315, 12030269.37130465, 18605945.08895626,
       25001827.77062526, 21736975.6761807 , 18414466.19839097,
        8432565.49971511, 27935033.96196166, 25004563.00334299,
       24479525.66211905, -3206171.17326452, 24366184.16148451,
       13014726.72462025, 20074220.19198441, 15445602.59185658,
       37357361.21710008, 13470695.50878974, 32611142.56400779,
       17757836.58413982, 19076458.42479165, 22430807.06040362,
       34117289.25873765, 30596416.05462197, 31276062.00255176,
       17205800.74349105, 15392894.44150473, 22062890.76462722,
       22544035.0695678 , 35697156.03124162, 30052348.40560469,
       17366375.13056697, 14764594.20882684, 37984683.6586273 ,
       13158815.71328903, 20947349.92447929, 20586220.02158738,
       26351858.22079458,  4583876.35078205, 24947527.3827695 ,
       24691987.51526166, 20576586.25277167, 23827897.0792077 ,
       19236985.67262372, 29950110.23637

## Error Metrics Functions

In [372]:
import pandas as pd

def create_dataframe_errors(y_true, y_pred):
    """
    Build a table that compares real and predicted values row by row.

    Values are paired by position, not by label: the i-th real value is always
    compared with the i-th prediction, even when the inputs are pandas Series
    with different indexes.

    Parameters
    ----------
    y_true : array-like
        Real target values (list, numpy array or pandas Series).
    y_pred : array-like
        Predicted values, same length as y_true.

    Returns
    -------
    pd.DataFrame
        Columns "Real Values", "Predicted Values" and "Error" (absolute
        difference). The row labels are taken from y_true when it is a Series,
        otherwise from y_pred when that is a Series, otherwise a default index.

    Raises
    ------
    AssertionError
        If y_true and y_pred do not have the same length.
    """
    # Ensure that y_true and y_pred have the same length
    assert len(y_true) == len(y_pred), "Length of y_true and y_pred must be the same"

    # Remember the row labels before they are stripped below (y_true takes precedence)
    index = next(
        (values.index for values in (y_true, y_pred) if isinstance(values, pd.Series)),
        None,
    )

    # Work on plain arrays so pandas cannot re-align the rows by label
    real = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    # Absolute error, computed for the whole column at once
    error = np.abs(real - predicted)

    return pd.DataFrame(
        {
            "Real Values": real,
            "Predicted Values": predicted,
            "Error": error,
        },
        index=index,
    )

In [373]:
create_dataframe_errors(y_train, y_train_pred)

Unnamed: 0,Real Values,Predicted Values,Error
463,18400000,2.622492e+07,7.824924e+06
900,13000000,1.465804e+07,1.658041e+06
722,14700000,2.712570e+07,1.242570e+07
869,13300000,2.278253e+07,9.482527e+06
211,26400000,3.579118e+07,9.391176e+06
...,...,...,...
685,15000000,2.374394e+07,8.743942e+06
782,14100000,1.318644e+07,9.135628e+05
376,20400000,2.224827e+07,1.848268e+06
718,14700000,1.749122e+07,2.791216e+06


In [374]:
create_dataframe_errors(y_test, y_test_pred)

Unnamed: 0,Real Values,Predicted Values,Error
476,18100000,2.703475e+07,8.934750e+06
798,14000000,3.161631e+07,1.761631e+07
275,23700000,3.240604e+07,8.706043e+06
509,17600000,2.099520e+07,3.395202e+06
664,15100000,1.203027e+07,3.069731e+06
...,...,...,...
961,12500000,2.245701e+07,9.957005e+06
712,14800000,2.964438e+07,1.484438e+07
584,16200000,1.757748e+07,1.377484e+06
382,20300000,2.600051e+07,5.700506e+06


In [377]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

def calculate_error_metrics(y_real_train, y_real_test, y_pred_train, y_pred_test)-> pd.DataFrame:
    """
    Summarise the regression errors of the train and test sets side by side.

    Takes the real and predicted values of both sets and returns a data frame with
    one row per metric (MAE, MSE, RMSE, R2) and one column per set (Train, Test).
    """
    def _scores(y_real, y_pred):
        # Values are listed in the same order as the 'Metric' column below;
        # RMSE is the square root of MSE.
        mae = mean_absolute_error(y_real, y_pred)
        mse = mean_squared_error(y_real, y_pred)
        return [mae, mse, np.sqrt(mse), r2_score(y_real, y_pred)]

    train_scores = _scores(y_real_train, y_pred_train)
    test_scores = _scores(y_real_test, y_pred_test)

    return pd.DataFrame({
        'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
        'Train': train_scores,
        'Test': test_scores
    })

In [378]:
calculate_error_metrics(y_train, y_test, y_train_pred, y_test_pred)

Unnamed: 0,Metric,Train,Test
0,MAE,8112169.0,7830551.0
1,MSE,256219500000000.0,187017100000000.0
2,RMSE,16006860.0,13675420.0
3,R2,0.2398189,0.2094594


The performance of this model is poor: R2 is only about 0.24 on the train set and 0.21 on the test set. I have to keep exploring different options.

In [379]:
def train_models(models: list, X_train, y_train)-> list:
    """
    Fit every model in `models` on (X_train, y_train) and return them in a new list.

    The models are fitted in place, in the given order; the returned list holds
    the very same objects that were passed in.
    """
    def _fitted(estimator):
        # fit() is called for its side effect; the estimator itself is what gets collected
        estimator.fit(X_train, y_train)
        return estimator

    return [_fitted(estimator) for estimator in models]


In [380]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Candidate regressors to compare. MLPRegressor starts from randomly initialised weights, so it
# gets a fixed random_state (the same seed used for the train/test split) to keep its metrics
# repeatable from one run to the next.
models = [LinearRegression(), KNeighborsRegressor(), MLPRegressor(max_iter=500, tol=0.0001, random_state=158)]

In [381]:
# Fit every candidate on the scaled training frame and show its train/test error metrics.
# Training and prediction both use DataFrames with the same column names, so scikit-learn
# does not warn about feature names that were missing when the model was fitted.
trained_models = train_models(models, X_train_new_scaled_df2, y_train)
for model in trained_models:
    # Loop-local names: the linear model's y_train_pred / y_test_pred computed earlier
    # are left untouched for the cells that display them.
    model_train_pred = model.predict(X_train_new_scaled_df2)
    model_test_pred = model.predict(X_test_new_scaled2_df2)

    error_metrics = calculate_error_metrics(y_train, y_test, model_train_pred, model_test_pred)

    # Label each table so the three outputs can be told apart.
    print(type(model).__name__)
    display(error_metrics)
 



Unnamed: 0,Metric,Train,Test
0,MAE,8112169.0,7830551.0
1,MSE,256219500000000.0,187017100000000.0
2,RMSE,16006860.0,13675420.0
3,R2,0.2398189,0.2094594




Unnamed: 0,Metric,Train,Test
0,MAE,5292931.0,6645418.0
1,MSE,104377600000000.0,132115400000000.0
2,RMSE,10216540.0,11494150.0
3,R2,0.6903207,0.4415345




Unnamed: 0,Metric,Train,Test
0,MAE,22975160.0,22997440.0
1,MSE,864907900000000.0,765450200000000.0
2,RMSE,29409320.0,27666770.0
3,R2,-1.566107,-2.235636


# Decision Trees

In [382]:
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import dtreeviz
import graphviz
import graphviz.backend as be
from IPython.display import Image, display_svg, SVG
import warnings
warnings.filterwarnings( "ignore", module = "matplotlib\..*" )

This is just a quick decision tree fitted on the previous train/test split. In the next notebook I go deeper into decision tree models.

In [383]:
# Shallow regression tree fitted on the unscaled train/test split from above.
regr = DecisionTreeRegressor(max_depth=5,
                             criterion='squared_error',
                             min_samples_split=2,
                             min_samples_leaf=1,
                             # X_train holds a single column ('video views'), so asking for
                             # 6 features per split exceeded what is available; None means
                             # "consider every feature", which is valid for any column count.
                             max_features=None)
regr.fit(X_train, y_train)
print("Train data R2 was: {:.2f}".format(regr.score(X_train, y_train)))
print("Test data R2 was: {:.2f}".format(regr.score(X_test, y_test)))

Train data R2 was: 0.78
Test data R2 was: 0.57


## Saving the functions

In [388]:
%%writefile functions.py

import pandas as pd
import numpy as np

def create_dataframe_errors(y_true, y_pred):
    """
    Build a table that compares real and predicted values row by row.

    Values are paired by position, not by label: the i-th real value is always
    compared with the i-th prediction, even when the inputs are pandas Series
    with different indexes.

    Parameters
    ----------
    y_true : array-like
        Real target values (list, numpy array or pandas Series).
    y_pred : array-like
        Predicted values, same length as y_true.

    Returns
    -------
    pd.DataFrame
        Columns "Real Values", "Predicted Values" and "Error" (absolute
        difference). The row labels are taken from y_true when it is a Series,
        otherwise from y_pred when that is a Series, otherwise a default index.

    Raises
    ------
    AssertionError
        If y_true and y_pred do not have the same length.
    """
    # Ensure that y_true and y_pred have the same length
    assert len(y_true) == len(y_pred), "Length of y_true and y_pred must be the same"

    # Remember the row labels before they are stripped below (y_true takes precedence)
    index = next(
        (values.index for values in (y_true, y_pred) if isinstance(values, pd.Series)),
        None,
    )

    # Work on plain arrays so pandas cannot re-align the rows by label
    real = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    # Absolute error, computed for the whole column at once
    error = np.abs(real - predicted)

    return pd.DataFrame(
        {
            "Real Values": real,
            "Predicted Values": predicted,
            "Error": error,
        },
        index=index,
    )

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

def calculate_error_metrics(y_real_train, y_real_test, y_pred_train, y_pred_test)-> pd.DataFrame:
    """
    Summarise the regression errors of the train and test sets side by side.

    Takes the real and predicted values of both sets and returns a data frame with
    one row per metric (MAE, MSE, RMSE, R2) and one column per set (Train, Test).
    """
    def _scores(y_real, y_pred):
        # Values are listed in the same order as the 'Metric' column below;
        # RMSE is the square root of MSE.
        mae = mean_absolute_error(y_real, y_pred)
        mse = mean_squared_error(y_real, y_pred)
        return [mae, mse, np.sqrt(mse), r2_score(y_real, y_pred)]

    train_scores = _scores(y_real_train, y_pred_train)
    test_scores = _scores(y_real_test, y_pred_test)

    return pd.DataFrame({
        'Metric': ['MAE', 'MSE', 'RMSE', 'R2'],
        'Train': train_scores,
        'Test': test_scores
    })


def train_models(models: list, X_train, y_train)-> list:
    """
    Fit every model in `models` on (X_train, y_train) and return them in a new list.

    The models are fitted in place, in the given order; the returned list holds
    the very same objects that were passed in.
    """
    def _fitted(estimator):
        # fit() is called for its side effect; the estimator itself is what gets collected
        estimator.fit(X_train, y_train)
        return estimator

    return [_fitted(estimator) for estimator in models]

Writing functions.py
