In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import isodate
import joblib

# Download stopwords
# Fetches the NLTK 'stopwords' corpus that the text-cleaning step later reads
# via nltk.corpus.stopwords.
nltk.download('stopwords')

# Load the new dataset
# The path is relative, so it is resolved against the kernel's working directory.
file_path = 'new_youtube_video.csv'
youtube_data = pd.read_csv(file_path)

# Display first few rows to understand the structure
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, presumably an
# index that was written out with the CSV -- confirm, and consider
# read_csv(..., index_col=0) if so.
youtube_data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samson.afolabi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,video_id,title,description,tags,category_id,view_count,like_count,dislike_count,comment_count,duration,...,channel_description,published_at,channel_creation_date,duration_seconds,weighted_views,length_category,upload_day,upload_hour,title_length,description_length
0,2XLnR2HlmBU,Compact Verbot: Faeser massiv unter Druck!,Prof. Vosgerau übt massive Kritik an Faesers C...,[],27,71231,11255,0,953,PT10M32S,...,"Ich erstelle Videos über Dinge, die mich beweg...",2013-11-28T10:40:22Z,2013-11-28T10:40:22Z,,,,,,,
1,SjnWd_j7wZQ,ن سوشل میڈیا کی PTI خاتون MNA کی جعلی گندی وڈی...,Today's Punjabi Vlog: https://www.youtube.com/...,"['shahbaz gill', 'shehbaz gill', 'urdu vlog', ...",25,191438,22657,0,2894,PT25M17S,...,This channel covers News & Current Affairs\n\n...,2019-06-02T20:41:32Z,2019-06-02T20:41:32Z,,,,,,,
2,XFTuRBkb8r8,Thailand Vlog / DAY 7,,[],1,30639,7874,0,147,PT1M1S,...,Hey there!\nMy name is Nirami :) \nI´m gonna t...,2012-07-30T09:27:07Z,2012-07-30T09:27:07Z,,,,,,,
3,ahIN1hCdjeQ,Buying a Gaming PC from Facebook Marketplace I...,Check out the ZimaBlade below!\nZimaBlade Offi...,[],28,43212,7229,0,411,PT11M30S,...,"Hi, I'm Andy! I own a computer repair shop in ...",2014-10-15T21:30:10Z,2014-10-15T21:30:10Z,,,,,,,
4,IM7SROtdvlo,Drivers stuck between L.A. and Las Vegas as I-...,An overturned big rig and resulting hazmat sit...,"['video', 'news']",25,152033,1427,0,657,PT5M2S,...,"KTLA 5 in Los Angeles covers breaking news, we...",2006-06-13T05:19:22Z,2006-06-13T05:19:22Z,,,,,,,


In [2]:
# Function to detect Arabic script in text
# Compiled once here instead of on every call; covers the Unicode blocks
# Arabic (0600-06FF), Arabic Supplement (0750-077F) and Arabic Extended-A (08A0-08FF).
_ARABIC_CHARACTERS = re.compile("[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]")


def contains_arabic(text):
    """Return True if ``text`` contains at least one Arabic-script character.

    Parameters
    ----------
    text : object
        Value to inspect. Anything that is not a ``str`` (for example the
        float NaN pandas uses for a missing title, or ``None``) is treated
        as containing no Arabic characters.

    Returns
    -------
    bool
        True when a character from the ranges above is found, else False.
    """
    # Missing values reach this function because the title filter runs before
    # the fillna('') step; a regex search on a float would raise TypeError.
    if not isinstance(text, str):
        return False
    return _ARABIC_CHARACTERS.search(text) is not None

# Keep only the rows whose title has no Arabic-script characters
youtube_data = youtube_data.loc[
    lambda frame: ~frame['title'].apply(contains_arabic)
]

In [3]:
# Rows without a view_count have no label to learn from, so drop them
youtube_data = youtube_data.dropna(subset=['view_count'])

# Missing text becomes an empty string so later string operations never see NaN
for text_column in ('title', 'description', 'tags'):
    youtube_data[text_column] = youtube_data[text_column].fillna('')

# Convert duration to seconds
def duration_to_seconds(duration):
    """Convert an ISO 8601 duration string (e.g. ``'PT10M32S'``) to seconds.

    Parameters
    ----------
    duration : object
        Raw value from the ``duration`` column; normally a string, but may be
        a missing value (NaN).

    Returns
    -------
    float
        Total number of seconds, or ``np.nan`` when the value is missing or
        is not a parseable ISO 8601 duration.
    """
    try:
        parsed = isodate.parse_duration(duration)
    except (TypeError, ValueError):
        # TypeError: non-string input such as NaN.
        # ValueError: malformed string (isodate's ISO8601Error subclasses it).
        # Anything else (KeyboardInterrupt, NameError, ...) is a real problem
        # and is deliberately allowed to propagate instead of becoming NaN.
        return np.nan
    return parsed.total_seconds()

In [4]:

# Derive the numeric features in one chained step. Keyword order matters:
# days_since_channel_creation reads the channel_creation_date column produced
# by the keyword just before it.
youtube_data = youtube_data.assign(
    # ISO 8601 duration string -> seconds
    duration_seconds=lambda frame: frame['duration'].apply(duration_to_seconds),
    # Parse to datetime and strip the timezone so it can be subtracted from a naive "today"
    channel_creation_date=lambda frame: pd.to_datetime(frame['channel_creation_date']).dt.tz_localize(None),
    # Whole days between channel creation and midnight of the current day
    days_since_channel_creation=lambda frame: (
        pd.Timestamp.now().normalize() - frame['channel_creation_date']
    ).dt.days,
    # Store the kids flag as an integer
    is_for_kids=lambda frame: frame['is_for_kids'].astype(int),
)

In [5]:
# Label to predict, and the predictors (intended to be limited to information
# that is available at the time a video is created)
target = 'view_count'
features = [
    'title',
    'description',
    'tags',
    'category_id',
    'category_name',
    'duration_seconds',
    'channel_video_count',
    'days_since_channel_creation',
    'is_for_kids',
    'subscriber_count',
]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    youtube_data[features],
    youtube_data[target],
    test_size=0.2,
    random_state=42,
)

# English stopwords, held in a set for fast membership checks during text cleaning
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lower-case ``text``, blank out non-word characters and drop English stopwords.

    Returns the surviving tokens joined by single spaces.
    """
    cleaned = re.sub(r'\W', ' ', text.lower())
    return ' '.join(token for token in cleaned.split() if token not in stop_words)

In [6]:
# Clean every free-text column, first in the training split and then in the test split
for split_frame in (X_train, X_test):
    for text_column in ('title', 'description', 'tags'):
        split_frame[text_column] = split_frame[text_column].apply(preprocess)

# Define the column transformer
# Each entry is (name, transformer, column selection). The text vectorizers take
# a bare column name because TfidfVectorizer expects a 1-D sequence of strings;
# the other transformers take a one-element list so they receive a 2-D input.
# 'category_id' is part of `features` but is not listed here, so it is dropped
# (ColumnTransformer's default is remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        # Free-text fields: TF-IDF, capped at 5000 terms per field
        ('title', TfidfVectorizer(max_features=5000), 'title'),
        ('description', TfidfVectorizer(max_features=5000), 'description'),
        ('tags', TfidfVectorizer(max_features=5000), 'tags'),
        # handle_unknown='ignore': a category that appears only in the test split
        # or in new data is encoded as all zeros instead of making predict() raise.
        ('category_name', OneHotEncoder(handle_unknown='ignore'), ['category_name']),
        # Numeric fields: fill gaps with the training median
        ('duration', SimpleImputer(strategy='median'), ['duration_seconds']),
        ('channel_video_count', SimpleImputer(strategy='median'), ['channel_video_count']),
        ('days_since_channel_creation', SimpleImputer(strategy='median'), ['days_since_channel_creation']),
        # Already numeric (0/1); forwarded unchanged
        ('is_for_kids', 'passthrough', ['is_for_kids']),
        ('subscriber_count', SimpleImputer(strategy='median'), ['subscriber_count'])
    ], sparse_threshold=0)  # Ensure dense output

# Define the model pipeline
# Feature extraction and the regressor are fitted together, so the same fitted
# vocabularies, categories and medians are reused at prediction time.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [7]:
# Fit the full pipeline on the training split
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

# Report the error in the target's own units (views): RMSE = sqrt(MSE)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 3115584.241200084


In [9]:
# Save the model
# Serialises the whole fitted Pipeline (column transformer + random forest) to
# a file in the current working directory.
# NOTE: the text cleaning done by `preprocess` is applied to the frames before
# they reach the pipeline, so it is not part of the saved artifact; code that
# loads this file has to clean title/description/tags itself before predict().
joblib.dump(model, 'youtube_view_predictor.pkl')


['youtube_view_predictor.pkl']

In [10]:
# A single hypothetical upload, with the same feature columns the model was trained on
new_video = pd.DataFrame({
    'title': ['New Video Title'],
    'description': ['This is a description of the new video.'],
    'tags': ['tag1 tag2 tag3'],
    'category_id': [24],
    'category_name': ['Entertainment'],
    'duration_seconds': [300],
    'channel_video_count': [150],
    # Channel age in whole days, counted from an example creation date
    'days_since_channel_creation': [(pd.Timestamp.now() - pd.Timestamp('2020-01-01')).days],
    'is_for_kids': [0],
    'subscriber_count': [100000],
})

# Apply the same text cleaning that the training data went through
for text_column in ('title', 'description', 'tags'):
    new_video[text_column] = new_video[text_column].apply(preprocess)

# Run the fitted pipeline and show the estimate with thousands separators
predicted_views = model.predict(new_video)
print(f'Predicted Views: {predicted_views[0]:,.0f}')

Predicted Views: 512,736
