In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [35]:
# Read the dataset and, in the same chained expression, shorten the
# "skills/teach" column label to "skills".
df = pd.read_csv("no_early_dates_90_days.csv").rename(
    columns={"skills/teach": "skills"}
)

In [36]:
# Display the column index to see which fields the loaded frame provides.
df.columns

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0',
       'channelDescription', 'channelJoinedDate', 'channelTotalVideos',
       'channelTotalViews', 'channelUsername', 'commentsCount', 'date',
       'duration', 'id', 'isChannelVerified', 'likes', 'numberOfSubscribers',
       'order', 'text', 'title', 'url', 'viewCount', 'likes_per_subscriber',
       'comments_per_subscriber', 'views_per_subscriber',
       'duration_in_seconds', 'datetime_date', 'hashtags', 'comm_to_views',
       'likes_to_views', 'popular_brand', 'has_title_affiliate',
       'has_description_affiliate', 'has_channel_description_affiliate',
       'has_any_affiliate', 'has_business_inquiry',
       'engagement_per_subscriber', 'product', 'budget', 'self_ref', 'acronym',
       'korean', 'speed', 'skills', 'skincare', 'comparing_products',
       'datetime', 'hour', 'day_of_week', 'month', 'year', 'day_name',
       'engagement_rate', 'prime_time', 'cluster1', 'prime_time1',
       'posting_time

In [37]:

# Target 1 -- engagement: likes plus comments, relative to views. The +1
# keeps the denominator non-zero for a video with zero views.
interactions = df["likes"] + df["commentsCount"]
df["engagement"] = interactions / (df["viewCount"] + 1)

# Target 2 -- natural log of the views-to-subscribers ratio.
# NOTE(review): unlike the engagement ratio there is no +1 here, so a zero
# viewCount or a zero numberOfSubscribers would produce a non-finite value --
# confirm the input file excludes such rows.
view_ratio = df["viewCount"] / df["numberOfSubscribers"]
df["normalized_view_conversion"] = np.log(view_ratio)

In [38]:
# Names of the columns selected from the frame as model inputs, one per line.
# NOTE(review): "prime_hour", "hasAdinTitle" and "hasAdinText" do not appear in
# the visible part of the df.columns listing (which is cut off) -- confirm they
# exist in the CSV.
all_features = [
    "popular_brand",
    "has_any_affiliate",
    "product",
    "budget",
    "self_ref",
    "acronym",
    "korean",
    "speed",
    "skills",
    "skincare",
    "comparing_products",
    "prime_hour",
    "hashtags",
    "hasAdinTitle",
    "hasAdinText",
]

In [None]:
# Two-stage shuffled split, both stages using the same settings:
#   1. df       -> df_train (80%) / df_test (20%)
#   2. df_train -> df_tt    (80%) / df_ho   (20%)
# random_state is 42, apparently to match Gabriela's split -- TODO confirm
# (the original note on this was phrased as a question).
split_options = dict(test_size=.2, shuffle=True, random_state=42)

df_train, df_test = train_test_split(df, **split_options)

df_tt, df_ho = train_test_split(df_train, **split_options)

In [40]:
# RMSE helper, added because Gabriela evaluated the MLR with RMSE.
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [41]:
class BaseMeanModel:
    """Baseline model that always predicts the mean of the values it was fitted on.

    Attributes:
        mean_value: The fitted mean, or ``None`` while the model is unfitted.
    """

    def __init__(self):
        # None doubles as the "not fitted yet" marker checked in predict().
        self.mean_value = None

    def fit(self, values: pd.Series):
        """Store the mean of ``values`` as the constant prediction.

        Args:
            values: Target values to average; must provide a ``.mean()`` method.
        """
        self.mean_value = values.mean()

    def predict(self, inputs=None):
        """Return the fitted mean, once or repeated for every input row.

        Args:
            inputs: Optional sized object. Only ``len(inputs)`` is used; the
                contents are ignored.

        Returns:
            The fitted mean itself when ``inputs`` is ``None``; otherwise a
            list holding the fitted mean ``len(inputs)`` times.

        Raises:
            RuntimeError: If called before ``fit``. Without this check the
                method would silently hand back ``None`` values.
        """
        if self.mean_value is None:
            raise RuntimeError("BaseMeanModel.predict() called before fit().")
        if inputs is None:
            return self.mean_value
        return len(inputs) * [self.mean_value]

In [None]:
# Baseline for the "engagement" target: a BaseMeanModel fitted on that column
# of the df_tt split.
model = BaseMeanModel()
engagement_train = df_tt["engagement"]
model.fit(engagement_train)

In [None]:
# Score the "engagement" baseline on the hold-out split (df_ho).
# A constant prediction only reaches R^2 = 0 when it equals the hold-out mean;
# the value fitted on df_tt differs from it, hence the negative R^2.
y_pred = model.predict(df_ho[all_features])
y_true = df_ho["engagement"]
rmse = root_mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print(
    f"Root Mean Squared Error: {rmse:.6f}",
    f"R-squared: {r2:.4f}",
    sep="\n",
)

Root Mean Squared Error: 0.034129
R-squared: -0.0035


In [44]:
# Baseline for the log(view/subscriber) target: a BaseMeanModel fitted on the
# "normalized_view_conversion" column of the df_tt split.
model = BaseMeanModel()
view_conversion_train = df_tt["normalized_view_conversion"]
model.fit(view_conversion_train)

In [None]:
# Score the log(view/subscriber) baseline on the hold-out split (df_ho).
# A constant prediction only reaches R^2 = 0 when it equals the hold-out mean;
# the value fitted on df_tt differs from it, hence the negative R^2.
y_pred = model.predict(df_ho[all_features])
y_true = df_ho["normalized_view_conversion"]
rmse = root_mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print(
    f"Root Mean Squared Error: {rmse:.6f}",
    f"R-squared: {r2:.4f}",
    sep="\n",
)

Root Mean Squared Error: 1.644687
R-squared: -0.0044
