In this notebook, we will create and compare several models. Our target feature for this notebook is LOG(VIEWS/SUBSCRIBERS).

Please note that we are not really creating a "model" of our data. That is, we do not believe our data is fully explained by our features, nor that there is an underlying "true" relationship that is linear. We are not using linear regression to predict the data or to model the data. Instead, we are finding the line of best fit and using it as an indication of trend.

In [43]:
import pandas as pd
import numpy as np
import math 

# Read the dataset from the CSV file in the working directory.
csv_path = r"no_early_dates_90_days.csv"
df = pd.read_csv(csv_path)

# PLEASE NOTE: the "hashtags" column currently holds the NUMBER of hashtags used;
# it is a count, not a categorical variable.
# When considering interaction terms, please include only PAIRWISE interaction terms.
# More interaction terms than this would create extremely small and nonexistent
# categories, which we do not want.

# Last expression of the cell, so the notebook displays the column index.
df.columns

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0',
       'channelDescription', 'channelJoinedDate', 'channelTotalVideos',
       'channelTotalViews', 'channelUsername', 'commentsCount', 'date',
       'duration', 'id', 'isChannelVerified', 'likes', 'numberOfSubscribers',
       'order', 'text', 'title', 'url', 'viewCount', 'likes_per_subscriber',
       'comments_per_subscriber', 'views_per_subscriber',
       'duration_in_seconds', 'datetime_date', 'hashtags', 'comm_to_views',
       'likes_to_views', 'popular_brand', 'has_title_affiliate',
       'has_description_affiliate', 'has_channel_description_affiliate',
       'has_any_affiliate', 'has_business_inquiry',
       'engagement_per_subscriber', 'product', 'budget', 'self_ref', 'acronym',
       'korean', 'speed', 'skills/teach', 'skincare', 'comparing_products',
       'datetime', 'hour', 'day_of_week', 'month', 'year', 'day_name',
       'engagement_rate', 'prime_time', 'cluster1', 'prime_time1',
       'postin

In [44]:
# This cell is safe to re-run: the target is only (re)computed while the raw
# "views_per_subscriber" column is still present in df. On a second run that
# column has already been dropped and the existing "y" column is reused.

features = ["popular_brand", "has_any_affiliate", "product", "budget", "self_ref", "acronym", "korean", "speed", 
            "skills/teach", "skincare", "comparing_products", "prime_hour", "hashtags", "hasAdinTitle", "hasAdinText"]

#Create the target column $y$ = log(views / subscribers) here:
if "views_per_subscriber" in df.columns:
    views_per_sub = df["views_per_subscriber"]
    # The log is undefined for non-positive ratios. Fail loudly with a ValueError
    # (as math.log did) rather than letting np.log silently emit -inf / NaN.
    if (views_per_sub <= 0).any():
        raise ValueError("views_per_subscriber must be strictly positive to take its log")
    # Vectorized log over the whole column instead of a Python-level apply.
    df["y"] = np.log(views_per_sub)

#We don't need a lot of the noise columns.
#.copy() makes df an independent frame rather than a slice of the original.
df = df[ features + ["y"] ].copy()

In [45]:
#Import everything

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [46]:
#Do an 80-20 Train Test Split Here. Never ever touch the testing set (df_test) please!

#cat_features is everything in `features` except "hashtags". It is derived from
#`features` (same order) so the two lists cannot drift apart.
cat_features = [name for name in features if name != "hashtags"]

#We can't stratify because we have too many categorical features.
#TODO: confirm that an unstratified split is acceptable.
df_train, df_test = train_test_split(df, shuffle = True, test_size = .2, random_state = 42)
#DO NOT TOUCH THE ABOVE df_test VARIABLE FOR ANY REASON

#We want a very basic idea of the error for each model, before we do proper cross-validation.
#We use a secondary split of the training data (df_tt = fit, df_ho = hold-out) for this.
df_tt, df_ho = train_test_split(df_train, shuffle = True, test_size = .2, random_state = 42)


In [47]:
#Create the baseline model here

class BaseMeanModel():
    """Baseline model that always predicts the mean of the values it was fit on.

    It ignores the content of the inputs passed to ``predict``; only their
    length is used to decide how many predictions to return.
    """

    def __init__(self):
        # Mean of the fitted values; stays None until fit() has been called.
        self.mean_value = None
    
    def fit(self, values : pd.Series):
        """Store the mean of ``values`` as the constant prediction.

        Parameters:
            values: the target values to average (anything with a ``.mean()`` method).

        Returns:
            self, so that calls can be chained. Callers that ignore the return
            value are unaffected.
        """
        self.mean_value = values.mean()
        return self

    def predict(self, inputs=None):
        """Return the stored mean.

        Parameters:
            inputs: optional sized collection. If given, one prediction is
                returned per element; its contents are not inspected.

        Returns:
            The stored mean itself when ``inputs`` is None, otherwise a list
            containing the stored mean ``len(inputs)`` times.

        Raises:
            ValueError: if called before ``fit``. Previously this silently
                returned None (or a list of None), which only failed later
                inside the metric functions.
        """
        if self.mean_value is None:
            raise ValueError("BaseMeanModel.predict() called before fit()")
        if inputs is None:
            return self.mean_value
        return len(inputs) * [self.mean_value]
    
# Fit the baseline on the fitting part of the secondary split.
model = BaseMeanModel()
model.fit(df_tt["y"])

# Score it on the hold-out part.
# R2 is negative because training set and the hold out set have different average values
y_true = df_ho["y"]
y_pred = model.predict(df_ho[features])
rmse, r2 = root_mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred)
print(f"Root Mean Squared Error: {rmse:.6f}")
print(f"R-squared: {r2:.4f}")

Root Mean Squared Error: 1.644687
R-squared: -0.0044


In [48]:
#Basic linear regression on the raw features: fit on df_tt, score on df_ho.
holdout_X, holdout_y = df_ho[features], df_ho["y"]
model = LinearRegression()
model.fit(df_tt[features], df_tt["y"])

# Evaluate the model on the hold-out split
y_pred = model.predict(holdout_X)
rmse, r2 = root_mean_squared_error(holdout_y, y_pred), r2_score(holdout_y, y_pred)
print(f"Root Mean Squared Error (Log Views): {rmse:.6f}")
print(f"R-squared: {r2:.4f}")

Root Mean Squared Error (Log Views): 1.538600
R-squared: 0.1210


In [49]:
#Basic linear regression with lasso regularization, fit on standardized features.

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.model_selection import cross_val_score

alpha = 0.0001

# Learn the scaling from the fitting split only, then apply it to both splits.
# NOTE: X_test_scaled holds the scaled HOLD-OUT split (df_ho), not df_test.
scaler = StandardScaler().fit(df_tt[features])
X_train_scaled = scaler.transform(df_tt[features])
X_test_scaled = scaler.transform(df_ho[features])

lasso = Lasso(alpha=alpha, random_state=42, max_iter=10000)
lasso.fit(X_train_scaled, df_tt["y"])
y_pred = lasso.predict(X_test_scaled)

# Calculate metrics on the hold-out split
y_holdout = df_ho["y"]
mse = mean_squared_error(y_holdout, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_holdout, y_pred)
r2 = r2_score(y_holdout, y_pred)
exp_var = explained_variance_score(y_holdout, y_pred)

print(f"\nAlpha: {alpha}")
print(f"Test MSE: {mse:.6f}, RMSE: {rmse:.6f}, MAE: {mae:.6f}")
print(f"R² Score: {r2:.6f}, Explained Variance: {exp_var:.6f}")


Alpha: 0.0001
Test MSE: 2.367330, RMSE: 1.538613, MAE: 1.188627
R² Score: 0.120950, Explained Variance: 0.126585


In [50]:
#Linear regression on the features plus all pairwise interaction terms.

#degree = 2 together with interaction_only = True creates all pairwise interaction terms.
interaction_step = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
pipe = Pipeline([
    ("interaction terms", interaction_step),
    ("linear model", LinearRegression()),
])

pipe.fit(df_tt[features], df_tt["y"])
pred = pipe.predict(df_ho[features])

# Metrics on the hold-out split
y_holdout = df_ho["y"]
mse = mean_squared_error(y_holdout, pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_holdout, pred)
r2 = r2_score(y_holdout, pred)
exp_var = explained_variance_score(y_holdout, pred)

print(f"Test MSE: {mse:.6f}, RMSE: {rmse:.6f}, MAE: {mae:.6f}")
print(f"R² Score: {r2:.6f}, Explained Variance: {exp_var:.6f}")

Test MSE: 2.356489, RMSE: 1.535086, MAE: 1.175820
R² Score: 0.124976, Explained Variance: 0.130502


In [51]:
#Create a model with all interaction terms and lasso regression 

from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler

# Standardize the features: scaling is learned on df_tt and reused on df_ho.
scaler = StandardScaler()
scaled_df_tt = scaler.fit_transform( df_tt[features] )
scaled_df_ho = scaler.transform( df_ho[features] )

# Earlier experiment, kept for reference: using lasso cv to find the best alpha
# lasso = LassoCV(cv=5, random_state=42, max_iter=10000, alphas=np.logspace(-4, 1, 30))
# lasso.fit(df_tt[features], df_tt["y"])
# pred = lasso.predict(df_ho[features])
# print("Lasso CV RMSE:", root_mean_squared_error(df_ho["y"], pred))
# print("Optimal alpha:", lasso.alpha_)

pipe = Pipeline([
    ("interaction terms", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("lasso", Lasso(alpha=0.0001, max_iter=10000))
])
pipe.fit(scaled_df_tt, df_tt["y"])
pred = pipe.predict(scaled_df_ho)
# root_mean_squared_error returns the RMSE, so the label says RMSE (it used to say MSE).
print("RMSE : ", root_mean_squared_error(df_ho["y"], pred))

# Get lasso coefficients, indexed by the generated term names, keeping only the non-zero ones
lasso_coeffs = pd.Series(pipe.named_steps['lasso'].coef_, index=pipe.named_steps['interaction terms'].get_feature_names_out(features))
lasso_coeffs = lasso_coeffs[lasso_coeffs != 0]
# print(lasso_coeffs)

MSE :  1.5349475471917438


In [52]:
#Do cross-validation to compare all models 

#Model 0: Baseline Average
#Model 1: Basic Linear Regression Model
#Model 2: Linear Regression with Lasso
#Model 3: Linear Regression with interaction terms
#Model 4: Linear Regression with interactions and lasso

from sklearn.model_selection import KFold


def _pairwise_interactions():
    """Return a fresh transformer that generates all pairwise interaction terms."""
    return PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)


# One factory per model, in the order listed above, so that every fold gets a
# fresh, unfitted estimator. For the lasso models the scaler lives inside the
# pipeline: it is fit on the fold's training rows only and reused on the
# fold's validation rows, exactly like a manual fit_transform / transform pair.
model_factories = [
    BaseMeanModel,
    LinearRegression,
    lambda: Pipeline([("scaler", StandardScaler()),
                      ("lasso", Lasso(alpha=0.0001, random_state=42, max_iter=10000))]),
    lambda: Pipeline([("interaction terms", _pairwise_interactions()),
                      ("linear model", LinearRegression())]),
    lambda: Pipeline([("scaler", StandardScaler()),
                      ("interaction terms", _pairwise_interactions()),
                      ("lasso", Lasso(alpha=0.0001, max_iter=10000))]),
]

num_splits = 5
num_models = len(model_factories)

kfold = KFold(num_splits,
              random_state = 42,
              shuffle=True)

# rmses[m, i] = hold-out RMSE of model m on fold i
rmses = np.zeros((num_models, num_splits))

for i, (train_index, test_index) in enumerate(kfold.split(df_train)): 

    # Fold-local names on purpose: rebinding df_tt / df_ho here would silently
    # change what the earlier single-split cells evaluate if they are re-run.
    fold_train = df_train.iloc[train_index]
    fold_valid = df_train.iloc[test_index] 

    for m, make_model in enumerate(model_factories):
        model = make_model()
        if isinstance(model, BaseMeanModel):
            # The baseline is fit on the target alone.
            model.fit(fold_train["y"])
        else:
            model.fit(fold_train[features], fold_train["y"])
        y_pred = model.predict(fold_valid[features])
        rmses[m, i] = root_mean_squared_error(fold_valid["y"], y_pred)

print(rmses)

[[1.64468734 1.57164797 1.56909547 1.64753598 1.70829604]
 [1.53860029 1.48638939 1.49762178 1.56249385 1.63738036]
 [1.53861317 1.48638776 1.49760963 1.562494   1.63735979]
 [1.53508614 1.45829658 1.48308619 1.55340947 1.68253152]
 [1.53494755 1.45822724 1.48293006 1.55322598 1.68145669]]


In [53]:
# Average RMSE of each model (one row per model) across the folds.
np.mean(rmses, axis=1)

array([1.62825256, 1.54449714, 1.54449287, 1.54248198, 1.5421575 ])

#Final interpretation. We'll look at coefficients for our best model and compare. 

---Final Interpretation---

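Almost as expected, the model with the most features (i.e., the model that includes all interaction terms) performed the best. This could partly be because adding features to a linear model never increases its RMSE on the data it was fit to. Note, however, that the RMSEs above are measured on held-out folds, where extra features are not guaranteed to help, and that the improvement here is small (about 0.002 in mean RMSE).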

Interestingly, our lasso model slightly outperformed our non-lasso model. 

We want to take the coefficients of our highest performing model and compare them. This is where our analysis was really headed.
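Mathematically, the features with the largest coefficients in absolute value are those that contribute the most to the trend line. The magnitudes are comparable across features because the input features are standardized before fitting.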

In [72]:
#A final fitting to the whole data set (every row of df), used to read off coefficients
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df[features])

interaction_step = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
lasso_step = Lasso(alpha=0.0001, max_iter=10000)
pipe = Pipeline([("interaction terms", interaction_step), ("lasso", lasso_step)])
pipe.fit(scaled_df, df["y"])

# Label each lasso coefficient with the name of the term it multiplies.
term_names = pipe.named_steps['interaction terms'].get_feature_names_out(features)
lasso_coeffs = pd.Series(pipe.named_steps['lasso'].coef_, index=term_names)

# Keep only the coefficients that lasso did not shrink to exactly zero.
nonzero_mask = lasso_coeffs != 0
sig_lasso_coeffs = lasso_coeffs[nonzero_mask]
print(sig_lasso_coeffs)


popular_brand               0.097420
has_any_affiliate          -0.119768
product                     0.134183
budget                      0.088134
self_ref                    0.022883
                              ...   
prime_hour hasAdinTitle    -0.002207
prime_hour hasAdinText      0.016012
hashtags hasAdinTitle       0.099113
hashtags hasAdinText       -0.010577
hasAdinTitle hasAdinText   -0.075326
Length: 120, dtype: float64


In [74]:
# Order the non-zero coefficients from smallest to largest magnitude.
sig_lasso_coeffs.sort_values(key=lambda coeffs: coeffs.abs())

acronym hasAdinTitle               0.000234
skincare comparing_products       -0.000679
product speed                     -0.001106
skills/teach comparing_products   -0.002053
prime_hour hasAdinTitle           -0.002207
                                     ...   
acronym                            0.154045
has_any_affiliate hashtags         0.219883
hashtags                           0.223845
acronym hashtags                   0.250303
korean                             0.261197
Length: 120, dtype: float64

In [78]:
# The ten coefficients with the largest magnitude (ascending, so the largest is last).
new = sig_lasso_coeffs.sort_values(key=lambda coeffs: coeffs.abs())
new.iloc[-10:]

hasAdinText                  -0.104489
has_any_affiliate speed       0.107565
has_any_affiliate            -0.119768
product                       0.134183
self_ref hashtags             0.147005
acronym                       0.154045
has_any_affiliate hashtags    0.219883
hashtags                      0.223845
acronym hashtags              0.250303
korean                        0.261197
dtype: float64