# Task for Today  

***

## YouTube Subscriber Count Prediction  
  
Given *data about Data Every Day YouTube videos*, let's try to predict the **number of subscribers** that will be generated from a given video.  
  
We will use a random forest regression model to make our predictions and evaluate it with 5-fold cross-validation.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the raw episode table. The path is relative to the working directory
# (presumably a Kaggle-style ../input layout -- adjust when running elsewhere).
data = pd.read_csv('../input/data-every-day-youtube-episodes/youtube_data.csv')

In [None]:
data

In [None]:
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Turn the raw video table into a feature matrix and a target vector.

    The following steps are applied to a copy of ``df``:

    * drop the 'Video' and 'Video title' columns;
    * drop every row whose 'Subscribers' value is missing;
    * replace 'Video publish time' with numeric 'Video month' and
      'Video day' columns;
    * convert 'Average view duration' from a clock-style string to a
      number of seconds.

    Args:
        df: DataFrame with at least the columns 'Video', 'Video title',
            'Video publish time', 'Average view duration' and
            'Subscribers'. It is not modified.

    Returns:
        A tuple ``(X, y)`` where ``y`` is the 'Subscribers' column and
        ``X`` holds every remaining column.
    """
    df = df.copy()

    # Drop unused columns
    df = df.drop(['Video', 'Video title'], axis=1)

    # Drop rows with a missing target value
    df = df.dropna(subset=['Subscribers']).reset_index(drop=True)

    # Extract date features (parse once, then read the parts vectorised)
    publish_time = pd.to_datetime(df['Video publish time'])
    df['Video month'] = publish_time.dt.month
    df['Video day'] = publish_time.dt.day
    df = df.drop('Video publish time', axis=1)

    # Convert durations to seconds. The hour component is included so that
    # durations of one hour or more are not silently truncated.
    duration = pd.to_datetime(df['Average view duration'])
    df['Average view duration'] = (
        duration.dt.hour * 3600
        + duration.dt.minute * 60
        + duration.dt.second
    )

    # Split df into X and y
    y = df['Subscribers']
    X = df.drop('Subscribers', axis=1)

    return X, y

In [None]:
# Build the feature matrix X and the 'Subscribers' target y from the raw data.
X, y = preprocess_inputs(data)

In [None]:
X

In [None]:
y

# Training/Validation

In [None]:
# 5-fold cross-validation of a random forest regressor on (X, y).
# One RMSE and one R^2 score is collected per fold.
rmses = []
r2s = []

# No shuffling is requested, so KFold's default splitting is used.
kf = KFold(n_splits=5)

for train_idx, test_idx in kf.split(X):
    X_train = X.iloc[train_idx, :]
    X_test = X.iloc[test_idx, :]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    # A fixed seed makes the forest -- and therefore the reported scores --
    # reproducible from one run of the notebook to the next.
    model = RandomForestRegressor(random_state=0)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Squared residuals are shared by both metrics; compute them once.
    squared_errors = (y_test - y_pred)**2

    rmse = np.sqrt(np.mean(squared_errors))
    rmses.append(rmse)

    # R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the fold's own mean.
    r2 = 1 - (np.sum(squared_errors) / np.sum((y_test - y_test.mean())**2))
    r2s.append(r2)

# Results

In [None]:
# Report each metric averaged over the cross-validation folds.
mean_rmse = np.mean(rmses)
mean_r2 = np.mean(r2s)

print("     RMSE: {:.2f}".format(mean_rmse))
print("R^2 Score: {:.5f}".format(mean_r2))

In [None]:
# Scatter of actual against predicted targets. y_pred and y_test are the
# values left over from the final iteration of the cross-validation loop,
# so only that last fold is shown.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x=y_pred, y=y_test, ax=ax)

ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Both axes share the same fixed 0-15 window.
ax.set_xlim(0, 15)
ax.set_ylim(0, 15)

ax.set_title("Actual vs. Predicted Values")
plt.show()

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/ojJDJYsqRpc