In [1]:
import pickle
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset from cleaned_data.csv and preview its first three rows.
youtube = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
youtube.head(n=3)

Unnamed: 0,title,daily_rank,daily_movement,weekly_movement,country,view_count,like_count,comment_count,description,video_id,...,langauge,channel_title,duration,category_id,definition,caption_status,title_processed,description_processed,video_tags_processed,channel_title_processed
0,Discord Loot Boxes are here.,1,49,49,US,1407108038,69908,7789,Why would we ever bring Loot Boxes into a chat...,cc2-4ci4G84,...,en,Discord,PT18S,24.0,hd,False,discord loot box,would ever bring loot box chat app open discor...,,discord
1,Grand Theft Auto VI Trailer 1,14,-9,36,US,130980218,10301883,802501,Song: Love Is A Long Road\nArtist: Tom Petty\n...,QdBZY2fkU-0,...,en,Rockstar Games,PT1M31S,20.0,hd,True,grand theft auto vi trailer,song love long road artist tom petty written t...,rockstar game grand theft auto vi gtavi gta gt...,rockstar game
2,I Survived 7 Days In An Abandoned City,50,-29,0,US,99937514,4271480,160164,This was one of the hardest challenges weâve...,tWYsfOSY9vY,...,en,MrBeast,PT17M24S,24.0,hd,True,survived day abandoned city,one hardest challenge weve ever done deal good...,,mrbeast


In [3]:
# Swap every NaN in the frame for a single-space string, then show how many
# missing values remain across all columns.
youtube = youtube.replace(to_replace=np.nan, value=' ')
youtube.isna().sum().sum()

0

In [4]:
# Convert publish_date to datetimes, derive calendar features from it, and
# then discard the raw column. assign() evaluates its keyword arguments in
# order, so the later lambdas see the already-converted publish_date.
youtube = youtube.assign(
    publish_date=lambda frame: pd.to_datetime(frame['publish_date']),
    publish_year=lambda frame: frame['publish_date'].dt.year,
    publish_month=lambda frame: frame['publish_date'].dt.month,
    publish_day=lambda frame: frame['publish_date'].dt.day,
    publish_dayofweek=lambda frame: frame['publish_date'].dt.dayofweek,
).drop(columns=['publish_date'])

# Convert duration to seconds
def parse_duration(duration):
    """Convert an ISO-8601 style duration string into a number of seconds.

    Recognised components are days (``D``) before the ``T`` separator and
    hours (``H``), minutes (``M``) and seconds (``S``) after it, e.g.
    ``'PT18S'``, ``'PT1M31S'`` or ``'P1DT2H3M4S'``. Every component is
    optional; a missing component counts as zero.

    Parameters
    ----------
    duration : str
        The duration text to parse. Non-string values (``None``, NaN, ...)
        are treated as unparseable.

    Returns
    -------
    int
        Total seconds, or 0 when the value is not a string or does not start
        with a recognisable duration.
    """
    # Missing values may arrive as NaN/None rather than text; treat them the
    # same way as any other unparseable value instead of raising TypeError.
    if not isinstance(duration, str):
        return 0

    # The day part is optional and precedes 'T'. Without it, durations of a
    # day or longer (e.g. 'P1DT2H') would fail to match and collapse to 0.
    match = re.match(
        r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration,
    )
    if not match:
        return 0

    days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    return days * 86400 + hours * 3600 + minutes * 60 + seconds

# Add the parsed duration as duration_seconds and drop the raw text column in
# a single chained expression.
youtube = (
    youtube
    .assign(duration_seconds=youtube['duration'].apply(parse_duration))
    .drop(columns=['duration'])
)

youtube

Unnamed: 0,title,daily_rank,daily_movement,weekly_movement,country,view_count,like_count,comment_count,description,video_id,...,caption_status,title_processed,description_processed,video_tags_processed,channel_title_processed,publish_year,publish_month,publish_day,publish_dayofweek,duration_seconds
0,Discord Loot Boxes are here.,1,49,49,US,1407108038,69908,7789,Why would we ever bring Loot Boxes into a chat...,cc2-4ci4G84,...,False,discord loot box,would ever bring loot box chat app open discor...,,discord,2024,4,1,0,18
1,Grand Theft Auto VI Trailer 1,14,-9,36,US,130980218,10301883,802501,Song: Love Is A Long Road\nArtist: Tom Petty\n...,QdBZY2fkU-0,...,True,grand theft auto vi trailer,song love long road artist tom petty written t...,rockstar game grand theft auto vi gtavi gta gt...,rockstar game,2023,12,4,0,91
2,I Survived 7 Days In An Abandoned City,50,-29,0,US,99937514,4271480,160164,This was one of the hardest challenges weâve...,tWYsfOSY9vY,...,True,survived day abandoned city,one hardest challenge weve ever done deal good...,,mrbeast,2024,3,2,5,1044
3,Pushpa 2 The Rule Teaser | Allu Arjun | Sukuma...,32,-25,18,US,96150890,1438267,49291,Pushpa 2 The Rule Teaser on Mythri Movie Maker...,wboGYls1Bns,...,False,pushpa rule teaser allu arjun sukumar rashmika...,pushpa rule teaser mythri movie maker pushpa r...,pushpa rule teaser allu arjun sukumar rashmika...,mythri movie maker,2024,4,8,0,68
4,7 Days Stranded On An Island,46,-13,4,US,93831659,4313238,115574,I canât believe we actually did this\nSend m...,erLbbextvlY,...,True,day stranded island,cant believe actually send money around world ...,,mrbeast,2024,3,30,5,1346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10241,'I'm really proud' Clement beams with pride af...,42,8,8,GB,20924,507,84,The Rangers boss was speaking after watching h...,eXy3xlU0VEs,...,False,im really proud clement beam pride stunning wi...,ranger bos speaking watching team defeat real ...,,ranger review,2023,12,14,3,469
10242,Attempting the Vic High Country towing a trail...,49,1,1,AU,20842,1068,124,In this episode we attempt to tow our OPUS OP2...,yotJiZnjARY,...,False,attempting vic high country towing trailer tou...,episode attempt tow opus op across victorian h...,vhc victorianhighcountry vichighcountry wd x o...,tyler thompson,2024,1,27,5,2561
10243,"MOVING VLOG!! | livs new home, snow trip prep ...",50,0,0,AU,20458,723,14,Happy Sunday!!! \n\nShop MVP here:\nwww.myvint...,9MI-8DFw7AM,...,False,moving vlog livs new home snow trip prep life ...,happy sunday shop mvp wwwmyvintagepleasurecom ...,,mescia twin,2024,2,4,6,1563
10244,"""We Made It Hard For Ourselves"" | Mikel Arteta...",49,1,1,GB,20435,393,74,Mikel Arteta reacts to a dramatic win at Kenil...,sbxflsCYdlM,...,False,made hard mikel arteta post match reaction lut...,mikel arteta reacts dramatic win kenilworth ro...,amazon prime video sport premier league atp wt...,amazon prime video sport,2023,12,5,1,139


In [5]:
# Remove the columns listed below from the frame, then print a summary of
# what is left.
youtube = youtube.drop(
    columns=[
        'title', 'description', 'channel_title', 'video_tags',
        'daily_rank', 'daily_movement', 'weekly_movement',
        'like_count', 'comment_count',
        'video_id', 'channel_id', 'category_id',
    ],
)
youtube.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10246 entries, 0 to 10245
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   country                  10246 non-null  object
 1   view_count               10246 non-null  int64 
 2   langauge                 10246 non-null  object
 3   definition               10246 non-null  object
 4   caption_status           10246 non-null  bool  
 5   title_processed          10246 non-null  object
 6   description_processed    10246 non-null  object
 7   video_tags_processed     10246 non-null  object
 8   channel_title_processed  10246 non-null  object
 9   publish_year             10246 non-null  int32 
 10  publish_month            10246 non-null  int32 
 11  publish_day              10246 non-null  int32 
 12  publish_dayofweek        10246 non-null  int32 
 13  duration_seconds         10246 non-null  int64 
dtypes: bool(1), int32(4), int64(2), object(7)

In [6]:
# Restore the previously pickled pipeline from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open model files that come from a trusted source.
with open('ridge_regression_model.pkl', 'rb') as model_file:
    pipeline = pickle.load(model_file)

# Features are every column except view_count; the target is log1p(view_count).
X = youtube.drop(columns='view_count')
y = np.log1p(youtube['view_count'])

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate settings for the pipeline step named 'model'.
param_grid = dict(
    model__alpha=[0.1, 1, 10, 100, 1000],
    model__solver=['auto', 'lsqr', 'sparse_cg', 'sag'],
)

# 5-fold cross-validated grid search scored by negated mean squared error.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

print("Performing hyperparameter tuning for Ridge Regression...")
grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so flip its sign before taking the root.
best_params = grid_search.best_params_
best_score = np.sqrt(-grid_search.best_score_)

print(f"Best parameters: {best_params}")
print(f"Best RMSE: {best_score}")

# Score the winning estimator on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# These metrics are computed on the log1p-transformed target, not raw counts.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = mse ** 0.5
print(f"Ridge Regression - Mean Squared Error: {mse}, R^2 Score: {r2}, RMSE: {rmse}")

Performing hyperparameter tuning for Ridge Regression...
Best parameters: {'model__alpha': 1, 'model__solver': 'lsqr'}
Best RMSE: 0.7897592154011399
Ridge Regression - Mean Squared Error: 0.5909419536956765, R^2 Score: 0.6118693582604899, RMSE: 0.7687274898790055


In [7]:
# Serialise best_model to disk; the confirmation is printed once the file
# handle has been closed by the with-block.
with open('ridge_regression_best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("Best Ridge Regression model saved as ridge_regression_best_model.pkl")

Best Ridge Regression model saved as ridge_regression_best_model.pkl
