In [1]:
# xgboost must be available in the kernel's environment; uncomment to install it.
# %pip install xgboost

In [2]:
import pandas as pd
import numpy as np
import re
import pickle
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Lift pandas' cap on displayed columns so wide frames render in full.
pd.options.display.max_columns = None

In [3]:
# Read the cleaned dataset (path is relative to the notebook's working
# directory) and preview its first three rows.
youtube = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
youtube.head(n=3)

Unnamed: 0,title,daily_rank,daily_movement,weekly_movement,country,view_count,like_count,comment_count,description,video_id,channel_id,video_tags,publish_date,langauge,channel_title,category_id,definition,caption_status,title_processed,description_processed,video_tags_processed,channel_title_processed,publish_year,publish_month,publish_day,publish_dayofweek,duration_seconds
0,Discord Loot Boxes are here.,1,49,49,US,1407108038,69908,7789,Why would we ever bring Loot Boxes into a chat...,cc2-4ci4G84,UCZ5XnGb-3t7jCkXdawN2tkA,-,2024-04-01 00:00:00+00:00,en,Discord,24.0,hd,False,discord loot box,would ever bring loot box chat app open discor...,,discord,2024,4,1,0,18
1,Grand Theft Auto VI Trailer 1,14,-9,36,US,130980218,10301883,802501,Song: Love Is A Long Road\nArtist: Tom Petty\n...,QdBZY2fkU-0,UC6VcWc1rAoWdBCM0JxrRQ3A,"Rockstar Games, Grand Theft Auto VI, GTAVI, GT...",2023-12-04 00:00:00+00:00,en,Rockstar Games,20.0,hd,True,grand theft auto vi trailer,song love long road artist tom petty written t...,rockstar game grand theft auto vi gtavi gta gt...,rockstar game,2023,12,4,0,91
2,I Survived 7 Days In An Abandoned City,50,-29,0,US,99937514,4271480,160164,This was one of the hardest challenges weâve...,tWYsfOSY9vY,UCX6OQ3DkcsbYNE6H8uQQuVA,-,2024-03-02 00:00:00+00:00,en,MrBeast,24.0,hd,True,survived day abandoned city,one hardest challenge weve ever done deal good...,,mrbeast,2024,3,2,5,1044


In [4]:
# Count the missing values in each column.
youtube.isnull().sum(axis=0)

title                         0
daily_rank                    0
daily_movement                0
weekly_movement               0
country                       0
view_count                    0
like_count                    0
comment_count                 0
description                   0
video_id                      0
channel_id                    0
video_tags                    0
publish_date                  0
langauge                      0
channel_title                 0
category_id                   0
definition                    0
caption_status                0
title_processed               7
description_processed        62
video_tags_processed       1712
channel_title_processed      17
publish_year                  0
publish_month                 0
publish_day                   0
publish_dayofweek             0
duration_seconds              0
dtype: int64

In [5]:
# Fill missing values only in the processed-text columns, which are the only
# columns reporting NaN in the per-column count above. A blanket
# `replace(np.nan, ' ')` would also write the string ' ' into any numeric
# column that ever contained NaN, turning it into an object column and hiding
# the missing data.
processed_text_columns = [
    'title_processed',
    'description_processed',
    'video_tags_processed',
    'channel_title_processed',
]
youtube = youtube.fillna({column: ' ' for column in processed_text_columns})

# Total NaN count across the whole frame. Anything other than 0 means a
# non-text column has missing data that needs an explicit decision.
youtube.isna().sum().sum()

0

In [6]:
# Remove the raw text, identifier and category columns, then list what is left.
columns_to_drop = [
    'title',
    'description',
    'channel_title',
    'video_tags',
    'video_id',
    'channel_id',
    'category_id',
]
youtube = youtube.drop(columns=columns_to_drop)
youtube.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10246 entries, 0 to 10245
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   daily_rank               10246 non-null  int64 
 1   daily_movement           10246 non-null  int64 
 2   weekly_movement          10246 non-null  int64 
 3   country                  10246 non-null  object
 4   view_count               10246 non-null  int64 
 5   like_count               10246 non-null  int64 
 6   comment_count            10246 non-null  int64 
 7   publish_date             10246 non-null  object
 8   langauge                 10246 non-null  object
 9   definition               10246 non-null  object
 10  caption_status           10246 non-null  bool  
 11  title_processed          10246 non-null  object
 12  description_processed    10246 non-null  object
 13  video_tags_processed     10246 non-null  object
 14  channel_title_processed  10246 non-nul

# Use Case 2:

# Predict Video Popularity with Interactions

In [9]:
# Train several regressors that predict log1p(view_count) from ranking,
# interaction, categorical and TF-IDF text features, report test-set metrics
# for each one, and pickle every fitted pipeline next to the notebook.

# Seed shared by the train/test split and by the estimators that accept one.
RANDOM_STATE = 42

# Define numerical, categorical, and text features
numerical_features = ['duration_seconds', 'daily_rank', 'daily_movement', 'weekly_movement', 'like_count', 'comment_count']
categorical_features = ['country', 'langauge', 'definition', 'caption_status', 'publish_year', 'publish_month', 'publish_day', 'publish_dayofweek']
text_features = ['title_processed', 'description_processed', 'video_tags_processed', 'channel_title_processed']

# Fail fast on missing columns: continuing would only surface later as a less
# obvious error from the column lookups below.
missing_columns = set(numerical_features + categorical_features + text_features) - set(youtube.columns)
if missing_columns:
    raise KeyError(f"Missing columns in the DataFrame: {missing_columns}")
print("All required columns are present.")

# Apply log transformation to like_count and comment_count.
# The columns are overwritten in place, so the flag stored in `youtube.attrs`
# prevents a second log1p when this cell is re-run on the same frame.
COUNTS_LOGGED_FLAG = 'interaction_counts_log1p'
if not youtube.attrs.get(COUNTS_LOGGED_FLAG, False):
    youtube['like_count'] = np.log1p(youtube['like_count'])
    youtube['comment_count'] = np.log1p(youtube['comment_count'])
    youtube.attrs[COUNTS_LOGGED_FLAG] = True

# Define preprocessing steps
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# One TF-IDF vectorizer per text column, each capped at 15000 terms.
text_transformers = [
    (text_feature, Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=15000))]), text_feature)
    for text_feature in text_features
]

# Columns that are not listed in any transformer (e.g. publish_date) are not
# passed to the models.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ] + text_transformers)

# Define the models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    # NOTE(review): the recorded run below shows R^2 of about 0 for this model
    # with default settings; presumably alpha needs tuning - confirm.
    'Lasso Regression': Lasso(),
    'K Nearest Neighbors': KNeighborsRegressor(),
    'Support Vector Machine': SVR(),
    # Seeds pinned so repeated runs report comparable metrics.
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'XGBoost': xgb.XGBRegressor(random_state=RANDOM_STATE),
}

# Split the data into training and testing sets
X = youtube.drop(columns=['view_count'])  # Features
y = np.log1p(youtube['view_count'])  # Log-transformed target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Train, evaluate, and pickle each model.
# Every pipeline wraps the same `preprocessor` object, which is re-fitted on
# each iteration; each pickle is written right after its own fit.
evaluation_rows = []
for name, model in models.items():
    print(f"Training {name}...")
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)

    # Metrics are computed on the log1p scale of the target.
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = mse ** 0.5
    print(f"{name} - Mean Squared Error: {mse}, R^2 Score: {r2}, RMSE: {rmse}")
    evaluation_rows.append({'model': name, 'mse': mse, 'r2': r2, 'rmse': rmse})

    # Pickle the fitted pipeline (preprocessing + model) under a slugged name.
    model_filename = f"{name.replace(' ', '_').lower()}_1.pkl"
    with open(model_filename, 'wb') as file:
        pickle.dump(pipeline, file)
    print(f"{name} model saved as {model_filename}")

print("All models trained and saved.")

# Side-by-side comparison of the models, best (lowest RMSE) first.
model_comparison = pd.DataFrame(evaluation_rows).sort_values('rmse').reset_index(drop=True)
model_comparison


All required columns are present.
Training Linear Regression...
Linear Regression - Mean Squared Error: 0.46089019075834176, R^2 Score: 0.6972873487966924, RMSE: 0.6788889384563146
Linear Regression model saved as linear_regression_1.pkl
Training Ridge Regression...
Ridge Regression - Mean Squared Error: 0.27297721331756214, R^2 Score: 0.8207085817437644, RMSE: 0.5224722129621461
Ridge Regression model saved as ridge_regression_1.pkl
Training Lasso Regression...
Lasso Regression - Mean Squared Error: 1.5238811368763332, R^2 Score: -0.0008850444474250985, RMSE: 1.2344558059632322
Lasso Regression model saved as lasso_regression_1.pkl
Training K Nearest Neighbors...
K Nearest Neighbors - Mean Squared Error: 0.5242938267196654, R^2 Score: 0.655643844285994, RMSE: 0.7240813674716851
K Nearest Neighbors model saved as k_nearest_neighbors_1.pkl
Training Support Vector Machine...
Support Vector Machine - Mean Squared Error: 0.23285886048160317, R^2 Score: 0.8470583136156898, RMSE: 0.482554515

In [None]:
# Fit a random forest on the same feature groups plus a token-count feature
# per text column, and report test-set metrics on the log1p(view_count) scale.
# like_count and comment_count are used as they currently stand in `youtube`;
# the earlier multi-model cell overwrites them with their log1p values.

# Add text length features: number of whitespace-separated tokens per value.
length_features = [feature + '_length' for feature in text_features]
for feature, length_feature in zip(text_features, length_features):
    youtube[length_feature] = youtube[feature].astype(str).str.split().str.len()

# Add the text length features to the list of numerical features.
# Rebinding with only the names that are not present yet keeps this cell safe
# to re-run; `+=` would append duplicate column names on every execution.
numerical_features = numerical_features + [
    length_feature for length_feature in length_features
    if length_feature not in numerical_features
]

# Log transform the target variable
youtube['log_view_count'] = np.log1p(youtube['view_count'])

print("Columns after adding length features:", youtube.columns)

# Fail fast when a required column is absent instead of printing and carrying
# on to a less obvious failure inside the pipeline.
missing_columns = set(numerical_features + categorical_features + text_features) - set(youtube.columns)
if missing_columns:
    raise KeyError(f"Missing columns in the DataFrame: {missing_columns}")
print("All required columns are present.")

# Define preprocessing steps for different types of features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a separate TF-IDF pipeline for each text feature (5000 terms each).
text_transformers = [
    (text_feature, Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=5000))]), text_feature)
    for text_feature in text_features
]

# Combine preprocessing steps for all features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ] + text_transformers)

# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Combine preprocessing and modeling into a single pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Split the data into training and testing sets
X = youtube.drop(columns=['view_count', 'log_view_count'])  # Features
y = youtube['log_view_count']  # Log-transformed target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model. Exceptions are left to propagate: catching a failed fit and
# continuing would only lead to predicting with an unfitted pipeline.
print("Fitting the pipeline...")
pipeline.fit(X_train, y_train)
print("Pipeline fitted successfully.")

# Evaluate the model on the log1p scale of the target. To score in raw view
# counts instead, pass np.expm1(y_test) and np.expm1(y_pred) to the metrics.
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = mse ** 0.5
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
print("Root Mean Squared Error:", rmse)

# -----------------------------------------------------------------------------------------------------------