# In this notebook, we will create and compare several models. Our target variable for this notebook is log(views / subscribers).

In [1]:
import pandas as pd
import numpy as np
import math 

# Read the dataset into `df`, then list its column names as the cell output.
csv_path = r"no_early_dates_90_days.csv"
df = pd.read_csv(csv_path)
df.columns

# Please note that the "hashtags" column currently holds the NUMBER of hashtags used; it is not a categorical variable.
# When considering interaction terms, please include only PAIRWISE interaction terms. Higher-order interactions would create extremely small or nonexistent categories, which we do not want.

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0',
       'channelDescription', 'channelJoinedDate', 'channelTotalVideos',
       'channelTotalViews', 'channelUsername', 'commentsCount', 'date',
       'duration', 'id', 'isChannelVerified', 'likes', 'numberOfSubscribers',
       'order', 'text', 'title', 'url', 'viewCount', 'likes_per_subscriber',
       'comments_per_subscriber', 'views_per_subscriber',
       'duration_in_seconds', 'datetime_date', 'hashtags', 'comm_to_views',
       'likes_to_views', 'popular_brand', 'has_title_affiliate',
       'has_description_affiliate', 'has_channel_description_affiliate',
       'has_any_affiliate', 'has_business_inquiry',
       'engagement_per_subscriber', 'product', 'budget', 'self_ref', 'acronym',
       'korean', 'speed', 'skills/teach', 'skincare', 'comparing_products',
       'datetime', 'hour', 'day_of_week', 'month', 'year', 'day_name',
       'engagement_rate', 'prime_time', 'cluster1', 'prime_time1',
       'postin

In [2]:
# Build the modelling frame: the predictor columns plus the target
# y = log(views_per_subscriber).
#
# This cell is safe to re-run: the target is only computed while the raw
# "views_per_subscriber" column is still present. On a second run `df` already
# holds "y", so only the (no-op) column selection at the bottom is repeated.

features = ["popular_brand", "has_any_affiliate", "product", "budget", "self_ref", "acronym", "korean", "speed", "skills/teach", "skincare", "comparing_products", "prime_hour", "hashtags", "hasAdinTitle", "hasAdinText"]

# Create the target column y.
if "views_per_subscriber" in df.columns:
    views_ratio = df["views_per_subscriber"]
    # The log is undefined for non-positive ratios; fail loudly here rather than
    # letting -inf / NaN leak into the models.
    if (views_ratio <= 0).any():
        raise ValueError("views_per_subscriber must be strictly positive to take its log")
    df = df.assign(y=np.log(views_ratio))

# Keep only the predictors and the target; the remaining columns are not used
# by any model in this notebook.
df = df[features + ["y"]]

In [3]:
#Import everything

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures


In [4]:
# Do an 80-20 train/test split here. Never ever touch the testing set please!

# Fixed seed so both splits, and every score computed from them, are
# reproducible across kernel restarts.
SEED = 42

# Every predictor except the numeric "hashtags" count. Derived from `features`
# so the two lists cannot drift apart.
cat_features = [name for name in features if name != "hashtags"]

# Outer split: df_test is the final test set.
# We can't stratify because we have too many categorical features.
df_train, df_test = train_test_split(df, shuffle=True, test_size=0.2, random_state=SEED)
# DO NOT TOUCH df_test FOR ANY REASON until the final evaluation.

# Inner split of the training data: gives a very basic idea of the error of
# each model before we do proper cross-validation.
df_tt, df_ho = train_test_split(df_train, shuffle=True, test_size=0.2, random_state=SEED)


In [5]:
#Create the baseline model here

In [None]:
# Create the basic linear regression model here: ordinary least squares on the
# raw features, fit on the inner training split (df_tt).
X_tt, y_tt = df_tt[features], df_tt["y"]
X_ho, y_ho = df_ho[features], df_ho["y"]

model = LinearRegression()
model.fit(X_tt, y_tt)

# Evaluate the model on the hold-out split (df_ho); the test set stays untouched.
y_pred = model.predict(X_ho)
rmse = root_mean_squared_error(y_ho, y_pred)
r2 = r2_score(y_ho, y_pred)
print(f"Root Mean Squared Error (Log Views): {rmse:.6f}")
print(f"R-squared: {r2:.4f}")

In [7]:
#Create the basic linear regression model here with lasso regression. 

In [8]:
# Linear regression whose inputs are the original features plus all pairwise
# interaction terms (degree=2 with interaction_only=True).
interaction_expander = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
pipe = Pipeline(steps=[
    ("interaction terms", interaction_expander),
    ("linear model", LinearRegression()),
])

# Fit on the inner training split, predict on the hold-out split.
pipe.fit(df_tt[features], df_tt["y"])
pred = pipe.predict(df_ho[features])

# Report the hold-out RMSE.
holdout_rmse = root_mean_squared_error(df_ho["y"], pred)
print(holdout_rmse)

1.5101863989903899


In [13]:
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler

# Create a model whose features include all interaction terms and a lasso
# regression: standardise -> pairwise interaction terms -> Lasso.
#
# The scaler is a pipeline step instead of being applied to df_tt / df_ho in
# place. It is still fit on the training split only, but the shared frames are
# no longer overwritten with scaled values, so the other model cells see the
# same data no matter which order the cells are run in.

# Disabled experiment: using LassoCV to find the best alpha.
# NOTE(review): presumably the alpha used below came from this search - confirm.
# lasso = LassoCV(cv=5, random_state=42, max_iter=10000, alphas=np.logspace(-4, 1, 30))
# lasso.fit( df_tt[features], df_tt["y"])
# pred = lasso.predict( df_ho[features] )
# print("Lasso CV RMSE : ",root_mean_squared_error( df_ho["y"], pred) )
# print("Optimal Alpha : ", lasso.alpha_ )

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("interaction terms", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("lasso", Lasso(alpha=0.0001, max_iter=10000)),
])
pipe.fit(df_tt[features], df_tt["y"])
pred = pipe.predict(df_ho[features])
# The metric is a root mean squared error, so label it as RMSE.
print("RMSE : ", root_mean_squared_error(df_ho["y"], pred))

# Get lasso coefficients, labelled with the generated interaction-term names,
# and keep only the terms the lasso did not shrink to exactly zero.
interaction_names = pipe.named_steps["interaction terms"].get_feature_names_out(features)
lasso_coef = pd.Series(pipe.named_steps["lasso"].coef_, index=interaction_names)
lasso_coef = lasso_coef[lasso_coef != 0]
# print(lasso_coef)

MSE :  1.510058726098824


In [10]:
#Do cross-validation to compare all models 

In [11]:
#Final interpretation. We'll look at coefficients for our best model and compare. 