In this notebook, we will create and compare several models. Our target feature for this notebooks is (Likes+Comments/(Views+1))
which we will call "engagement". We have added 1 to the denonimator just to ensure we never divide by zero. 

In [20]:
import pandas as pd
import numpy as np

df = pd.read_csv(r"all_features_final.csv")
features = ["popular_brand", "has_any_affiliate", "product", "budget", "self_ref", "acronym", "korean", "speed", "skills/teach", "skincare", "comparing_products", "prime_hour", "hashtags", "hasAdinTitle", "hasAdinText"]

#Create the target column $y$ here 

df["y"] = (df["likes"] + df["commentsCount"])  / (df["viewCount"] + 1) 

#get rid of noisy columns
df = df[ features + ["y"] ] 

In [21]:
#import everything

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [22]:
#Do an 80-20 Train Test Split Here. Never ever touch the testing set please!

df_train, df_test = train_test_split(df, shuffle = True, test_size = .2, random_state = 42) #We can't stratify because we have too many categorical features. I hope this is ok
#DO NOT TOUCH THE ABOVE X_TEST VARIABLE FOR ANY REASON

#We want a very basic idea of the MSE for each model, before we do proper cross-validation. We use a secondary split for this.
df_tt, df_ho = train_test_split(df_train, shuffle = True, test_size = .2, random_state = 42)


In [23]:
#Create the baseline model here

class BaseMeanModel():
    def __init__(self):
        self.mean_value = None
    
    def fit(self, values : pd.Series):
        self.mean_value = values.mean()

    def predict(self, inputs=None):
        if inputs is None:
            return self.mean_value
        return len(inputs) * [self.mean_value]
    
    
model = BaseMeanModel()
model.fit(df_tt["y"])

# R2 is negative because training set and the hold out set have different average values
y_pred = model.predict(df_ho[features])
rmse = root_mean_squared_error(df_ho["y"], y_pred)
r2 = r2_score(df_ho["y"], y_pred)
print(f"Root Mean Squared Error: {rmse:.6f}")
print(f"R-squared: {r2:.4f}")

Root Mean Squared Error: 0.037166
R-squared: -0.0000


In [24]:
#Creat the basic linear regression model here 
###Fitting the basic linear regression model using the training set and print the summary of the model.
model = LinearRegression()
model.fit(df_tt[features], df_tt["y"])
# Evaluate the model
y_pred = model.predict(df_ho[features])
rmse = root_mean_squared_error(df_ho["y"], y_pred)
r2 = r2_score(df_ho["y"], y_pred)
print(f"Root Mean Squared Error: {rmse:.6f}")
print(f"R-squared: {r2:.4f}")

Root Mean Squared Error: 0.036916
R-squared: 0.0134


In [25]:
#Create the basic linear regression model here with lasso regression. 

In [26]:
#Create a model whose features include all interaction terms
pipe = Pipeline([  ("interaction terms", PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False) ),
                   ("linear model", LinearRegression())
]) 
#setting degree = 2 creates all pairwise interaction terms. 

pipe.fit( df_tt[features], df_tt["y"]) 
pred = pipe.predict( df_ho[features] )

print( root_mean_squared_error( df_ho["y"], pred) ) 

0.03535259217607404


In [27]:
#Create a model with all interaction terms and lasso regression 

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.model_selection import cross_val_score


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(df_tt[features] )
X_test_scaled = scaler.transform( df_ho[features])

alpha = 0.0001
lasso = Lasso(alpha=alpha, random_state=42, max_iter=10000)

#cv_scores = cross_val_score(lasso, X_train_scaled, df_tt["y"], cv=5,scoring='neg_mean_squared_error')
#cv_rmse = np.sqrt(-cv_scores.mean())

lasso.fit(X_train_scaled, df_tt["y"])

y_pred = lasso.predict(X_test_scaled)

# Calculate metrics
mse = mean_squared_error(df_ho["y"], y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(df_ho["y"], y_pred)
r2 = r2_score(df_ho["y"], y_pred)
exp_var = explained_variance_score(df_ho["y"], y_pred)


print(f"\nAlpha: {alpha}")
print(f"Test MSE: {mse:.6f}, RMSE: {rmse:.6f}, MAE: {mae:.6f}")
print(f"R² Score: {r2:.6f}, Explained Variance: {exp_var:.6f}")
#print(f"Cross-validated RMSE: {cv_rmse:.6f}")




Alpha: 0.0001
Test MSE: 0.001362, RMSE: 0.036912, MAE: 0.024071
R² Score: 0.013599, Explained Variance: 0.013606


In [13]:
#Do cross-validation to compare all models 

In [14]:
#Final interpretation 