In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from xgboost import XGBClassifier
from datetime import datetime

# Data Loading

In [2]:
videos_df = pd.read_pickle('US_trending.pkl')

# filtriranje datuma do 1.4.2020. (korona)
videos_df = videos_df[videos_df['trending_date'] < datetime(2020, 4, 1)]

In [3]:
videos_df.head()

Unnamed: 0,video_id,title,published_at,channel_id,channel_title,category_id,trending_date,tags,view_count,likes,...,days_in_trending,category_name,publish_to_trend,publishing_hour,publishing_day,disc_likes,positive_sentiment,negative_sentiment,neutral_sentiment,analyzed_comments
0,vPx6M7eTYbc,Spill Your Guts: Harry Styles & Kendall Jenner,2019-12-11 07:08:34,UCJ0uqCI0Vqr2Rrt1HseGirg,The Late Late Show with James Corden,24,2019-12-12,The Late Late Show|Late Late Show|James Corden...,11636632,571835,...,1,Entertainment,1,7,Wednesday,3.0,0.46875,0.13125,0.4,160.0
1,sg8WaeeFyNY,WE GOT UNEXPECTED NEWS..,2019-12-12 05:25:42,UCDSJCBYqL7VQrlXfhr1RtwA,Les Do Makeup,26,2019-12-12,[none],423215,52780,...,7,How-to & Style,0,5,Thursday,3.0,0.75,0.075,0.175,40.0
2,q1PR05q8l2g,"Last To Miss Layup Wins $10,000",2019-12-11 23:00:53,UCQIUhhcmXsu6cN6n3y9-Pww,Jesser,17,2019-12-12,last to leave|nba basketball|nba basketball hi...,463685,20178,...,7,Sport,1,23,Wednesday,2.0,0.318182,0.181818,0.5,88.0
3,t6Z6RIXq0L0,LAKERS at MAGIC | FULL GAME HIGHLIGHTS | Decem...,2019-12-12 02:35:33,UCWJ2lWNubArHWmf3FIHbfcQ,NBA,17,2019-12-12,sp:ty=high|sp:dt=2019-12-12T00:00:00Z|sp:st=ba...,659579,4605,...,2,Sport,0,2,Thursday,1.0,0.462069,0.193103,0.344828,145.0
4,TGDpRB4ovvA,TIPPING DRIVE-THRU WORKERS $100 FOR THE HOLIDA...,2019-12-12 02:38:37,UCtj45MepAoKxZoyR_Mnt86Q,Royal Family,24,2019-12-12,queen Naija|Medicine|Queen|Spicy|Royalty Squad...,175558,18400,...,6,Entertainment,0,2,Thursday,2.0,0.63125,0.13125,0.2375,160.0


## Dealing with missing values

In [4]:
# ovde mozda razmisliti o necemu boljem (srednja vrednost po kategoriji i slicno)
videos_df['positive_sentiment'] = videos_df['positive_sentiment'].fillna(videos_df['positive_sentiment'].mean())
videos_df['negative_sentiment'] = videos_df['negative_sentiment'].fillna(videos_df['negative_sentiment'].mean())
videos_df['neutral_sentiment'] = videos_df['neutral_sentiment'].fillna(videos_df['neutral_sentiment'].mean())
videos_df.isna().sum()

video_id               0
title                  0
published_at           0
channel_id             0
channel_title          0
category_id            0
trending_date          0
tags                   0
view_count             0
likes                  0
dislikes               0
comment_count          0
thumbnail_link         0
comments_disabled      0
ratings_disabled       0
description           20
days_in_trending       0
category_name          0
publish_to_trend       0
publishing_hour        0
publishing_day         0
disc_likes             0
positive_sentiment     0
negative_sentiment     0
neutral_sentiment      0
analyzed_comments      0
dtype: int64

### Scale numerical features

In [5]:
numerical_features = ['view_count', 'dislikes', 'comment_count', 'positive_sentiment', 'negative_sentiment']
scaler = StandardScaler()
videos_df[numerical_features] = scaler.fit_transform(videos_df[numerical_features])

### X and y arrays

In [6]:
X = videos_df[['view_count', 'dislikes', 'comment_count', 'positive_sentiment', 'negative_sentiment', 'category_id']]
y = videos_df['disc_likes']

### One Hot Encoding for category feature

In [7]:
#onehotencoder = OneHotEncoder(categorical_features = [5], drop='first') 
#X = onehotencoder.fit_transform(X).toarray() 
#X.size

category_dummies = pd.get_dummies(videos_df['category_name'], drop_first=True)
X = pd.concat([X, category_dummies], axis=1)
X.head()

Unnamed: 0,view_count,dislikes,comment_count,positive_sentiment,negative_sentiment,category_id,Comedy,Education,Entertainment,Film & Animation,Gaming,How-to & Style,Music,News & Politics,Non-profits & Activism,People & Blogs,Pets & Animals,Science & Technology,Sport,Travel & Events
0,4.398527,0.041458,0.550593,0.398854,-0.833451,24,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,-0.381521,-0.024048,0.262271,2.605829,-1.375825,26,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,-0.364269,-0.025685,-0.244681,-0.782658,-0.345862,17,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,-0.280764,-0.028293,-0.255611,0.346428,-0.237048,17,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,-0.487092,-0.029379,-0.204122,1.673995,-0.833451,24,0,0,1,0,0,0,0,0,0,0,0,0,0,0


### Split data into training and test sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

# Random Forest Classification

In [9]:
random_forest = RandomForestClassifier()

In [10]:
print('Optimizing parameters of random forest classification...')
print('start at: {}'.format(datetime.now().time()))

parameters = {'n_estimators': [200, 500],
              'max_features': ['auto', 'sqrt', 'log2'],
              'max_depth' : [5, 8, 10, 15, 20, 30],
              'criterion' :['gini', 'entropy']}

rand_forest_classifier = GridSearchCV(estimator=random_forest, param_grid=parameters, cv=10)
rand_forest_classifier.fit(X_train, y_train)

print('stop at: {}'.format(datetime.now().time()))
print('---------------------------------')

print(rand_forest_classifier.best_score_)
print(rand_forest_classifier.best_estimator_)

Optimizing parameters of random forest classification...
start at: 20:25:24.120893
stop at: 20:56:14.461801
---------------------------------
0.807997557997558
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=30, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [11]:
y_pred = rand_forest_classifier.predict(X_test)

In [12]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[234  42   1]
 [ 38 199  37]
 [  1  33 235]]
              precision    recall  f1-score   support

         1.0       0.86      0.84      0.85       277
         2.0       0.73      0.73      0.73       274
         3.0       0.86      0.87      0.87       269

    accuracy                           0.81       820
   macro avg       0.81      0.81      0.81       820
weighted avg       0.81      0.81      0.81       820



# Support Vector Classification

In [13]:
svc = SVC()

In [14]:
print('Optimizing parameters of support vector classification...')
print('start at: {}'.format(datetime.now().time()))

parameters = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'rbf', 'poly']}

svm_classifier = GridSearchCV(estimator=svc, param_grid=parameters, cv=10, n_jobs=-1)
svm_classifier.fit(X_train, y_train)

print('stop at: {}'.format(datetime.now().time()))
print('---------------------------------')

print(svm_classifier.best_score_)
print(svm_classifier.best_estimator_)

Optimizing parameters of support vector classification...
start at: 21:04:53.090124
stop at: 22:30:45.704002
---------------------------------
0.7783882783882784
SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [15]:
y_pred = svm_classifier.predict(X_test)

In [16]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[228  47   2]
 [ 50 191  33]
 [  2  47 220]]
              precision    recall  f1-score   support

         1.0       0.81      0.82      0.82       277
         2.0       0.67      0.70      0.68       274
         3.0       0.86      0.82      0.84       269

    accuracy                           0.78       820
   macro avg       0.78      0.78      0.78       820
weighted avg       0.78      0.78      0.78       820



# Extreme Gradient Boosting Classification

In [17]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600)

In [18]:
print('Optimizing parameters of extreme gradient boosting classification...')
print('start at: {}'.format(datetime.now().time()))

parameters = {'min_child_weight': [1, 5, 10],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'max_depth': [3, 4, 5]}

xgb_classifier = GridSearchCV(estimator=xgb, param_grid=parameters, cv=10, n_jobs=4)
xgb_classifier.fit(X_train, y_train)

print('stop at: {}'.format(datetime.now().time()))
print('---------------------------------')

print(xgb_classifier.best_score_)
print(xgb_classifier.best_estimator_)

Optimizing parameters of extreme gradient boosting classification...
start at: 22:33:43.243834
stop at: 01:16:57.313996
---------------------------------
0.8101343101343101
XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.02, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=600, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.6,
              tree_method=None, validate_parameters=False, verbosity=None)


In [74]:
y_pred = xgb_classifier.predict(X_test)

In [75]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[207  32   3]
 [ 39 163  23]
 [  3  35 201]]
              precision    recall  f1-score   support

         1.0       0.83      0.86      0.84       242
         2.0       0.71      0.72      0.72       225
         3.0       0.89      0.84      0.86       239

    accuracy                           0.81       706
   macro avg       0.81      0.81      0.81       706
weighted avg       0.81      0.81      0.81       706

