In [48]:
import json
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import wikipedia
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import re
from pandas.plotting import register_matplotlib_converters
from sklearn.utils import resample
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [49]:
raw_scores = pd.read_csv('data/raw_scores.csv')
raw_scores = raw_scores.drop(['Unnamed: 0'],axis=1)
print(raw_scores.shape)
raw_scores.head()

(3525956, 3)


Unnamed: 0,idx,magnitude,sentiment
0,0,18.67,0.06
1,0,14.38,0.35
2,0,48.22,0.07
3,0,32.27,0.25
4,0,61.94,-0.19


The raw_scores dataframe is a dataframe of all the magnitude and sentiment scores for all of our comments. each comment had an idx column entry to associate it to an index of our videos.

In [50]:
len(raw_scores[(raw_scores.magnitude == 0) & (raw_scores.sentiment == 0)])

294856

nearly 300,000 comments with 0 sentiment and 0 magnitude. this is due to them being too short usually. sometimes because the characters were non readable (emojis and such), or spelling mistakes. We remove these from our dataframe.

In [51]:
raw_scores = raw_scores[(raw_scores.magnitude != 0) & (raw_scores.sentiment != 0)]
len(raw_scores)

3197519

In [52]:
groups = raw_scores.groupby(['idx']).describe()
groups.head()

Unnamed: 0_level_0,magnitude,magnitude,magnitude,magnitude,magnitude,magnitude,magnitude,magnitude,sentiment,sentiment,sentiment,sentiment,sentiment,sentiment,sentiment,sentiment
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
idx,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,1486.0,36.839603,27.999185,0.16,15.4025,30.98,52.135,173.46,1486.0,0.202423,0.210985,-0.8,0.1,0.22,0.33,0.75
1,286.0,40.991923,32.780557,1.23,15.4375,33.695,56.9025,187.24,286.0,0.156189,0.228803,-0.64,0.07,0.175,0.3075,0.72
2,250.0,37.66192,27.948382,0.28,16.5125,30.94,55.65,123.39,250.0,0.17456,0.229152,-0.67,0.06,0.18,0.3,0.75
3,51.0,54.128431,46.778562,1.88,20.765,45.49,68.045,234.21,51.0,0.11902,0.262741,-0.75,0.015,0.16,0.25,0.72
4,45.0,37.703111,29.19318,1.88,16.15,28.63,57.57,132.69,45.0,0.186889,0.23063,-0.34,0.07,0.16,0.32,0.68


#### We group the sentiments and magnitudes according to the column 'idx'. idx represents the video index in the video dataframe. idx = 0 is the first video in the vids dataframe, so all of those sentiments are for all of the comments of videos 0. We append the descriptive stats of the sentiments and magnitudes to the videos dataframe as features to use in our analysis.

In [53]:
vids = pd.read_csv('files/labeled_vids.csv')
vids = vids.drop(['Unnamed: 0','Unnamed: 0.1','sentiment','magnitude'],axis=1)
print(vids.shape)
vids.head()

(1394, 8)


Unnamed: 0,date,title,class,viewCount,commentCount,likeCount,dislikeCount,ratio
0,2013-01-17,Joe Rogan Experience #1 - Brian Redban,2,673255,2091,6485,175,37.057143
1,2013-01-17,Joe Rogan Experience #2 - Brian Redban,2,134693,359,1004,68,14.764706
2,2013-01-17,Joe Rogan Experience #3 - Ari Shaffir,2,133176,320,1016,66,15.393939
3,2013-01-17,Joe Rogan Experience #5 - Ari Shaffir & John H...,2,49565,59,343,19,18.052632
4,2013-01-18,Joe Rogan Experience #5 - Ari Shaffir & John H...,2,33535,48,278,11,25.272727


In [54]:
label_list = ['mean','std','min','25%','50%','75%','max']
magnitude_label_list = ['magMean','magStd','magMin','magLq','magMedian','magUq','magMax']
sentiment_label_list = ['sentMean','sentStd','sentMin','sentLq','sentMedian','sentUq','sentMax']
for i,label in enumerate(label_list):
    vids[magnitude_label_list[i]] = groups['magnitude'][label]
    vids[sentiment_label_list[i]] = groups['sentiment'][label]

In [55]:
vids.head()


Unnamed: 0,date,title,class,viewCount,commentCount,likeCount,dislikeCount,ratio,magMean,sentMean,...,magMin,sentMin,magLq,sentLq,magMedian,sentMedian,magUq,sentUq,magMax,sentMax
0,2013-01-17,Joe Rogan Experience #1 - Brian Redban,2,673255,2091,6485,175,37.057143,36.839603,0.202423,...,0.16,-0.8,15.4025,0.1,30.98,0.22,52.135,0.33,173.46,0.75
1,2013-01-17,Joe Rogan Experience #2 - Brian Redban,2,134693,359,1004,68,14.764706,40.991923,0.156189,...,1.23,-0.64,15.4375,0.07,33.695,0.175,56.9025,0.3075,187.24,0.72
2,2013-01-17,Joe Rogan Experience #3 - Ari Shaffir,2,133176,320,1016,66,15.393939,37.66192,0.17456,...,0.28,-0.67,16.5125,0.06,30.94,0.18,55.65,0.3,123.39,0.75
3,2013-01-17,Joe Rogan Experience #5 - Ari Shaffir & John H...,2,49565,59,343,19,18.052632,54.128431,0.11902,...,1.88,-0.75,20.765,0.015,45.49,0.16,68.045,0.25,234.21,0.72
4,2013-01-18,Joe Rogan Experience #5 - Ari Shaffir & John H...,2,33535,48,278,11,25.272727,37.703111,0.186889,...,1.88,-0.34,16.15,0.07,28.63,0.16,57.57,0.32,132.69,0.68


In [56]:
vids.max()

date                                                   2019-10-01
title           Joe Rogan Experience - UFC Recap w/ Brendan Sc...
class                                                           5
viewCount                                                26071198
commentCount                                               123027
likeCount                                                  530844
dislikeCount                                                86953
ratio                                                         inf
magMean                                                   77.5369
sentMean                                                 0.362857
magStd                                                    70.8311
sentStd                                                   0.41872
magMin                                                       37.1
sentMin                                                      0.11
magLq                                                       46.72
sentLq    

In [57]:
print(vids.iloc[119].ratio)
print(vids.iloc[119].likeCount)
print(vids.iloc[119].dislikeCount)
vids.at[119,'dislikeCount'] = 1
vids.at[119,'ratio'] = 62

inf
62
0


In [58]:
vids.ratio.max()

209.0

We see there is an inf value in our ratio column. We find the culprit and deal with the issue. The issue is that the ratio is derived from dividing the likeCount by the dislikeCount. Any number divided by 0 results in an inf value. This inf value causes problems when analysing the data - many methods do not run if there is an inf value in the column. we deal with this by changing the dislikeCount to 1. No inf value any longer.

In [59]:
vids.iloc[119]

date                                                   2013-01-23
title           Joe Rogan Experience #140 - Brendon Walsh (Par...
class                                                           2
viewCount                                                    5667
commentCount                                                    4
likeCount                                                      62
dislikeCount                                                    1
ratio                                                          62
magMean                                                   18.8533
sentMean                                                 0.213333
magStd                                                    8.77474
sentStd                                                  0.405134
magMin                                                       8.85
sentMin                                                     -0.24
magLq                                                      15.655
sentLq    

In [65]:
vids[vids.title.duplicated()==True]

Unnamed: 0_level_0,date,title,class,viewCount,commentCount,likeCount,dislikeCount,ratio,magMean,sentMean,...,sentMin,magLq,sentLq,magMedian,sentMedian,magUq,sentUq,magMax,sentMax,ratio_bins
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [61]:
print("Shape before removing duplicates: ",vids.shape)
vids = vids[vids.title.duplicated()==False]
print("Shape after removing duplicates: ",vids.shape)

Shape before removing duplicates:  (1394, 22)
Shape after removing duplicates:  (1386, 22)


Because of the way we got our video ID to query the youtube API with, we have some duplicate rows. We remove the duplicated rows from the dataframe.

In [62]:
for col in vids.columns[3:]:
    if str(vids[col].dtype) != 'object':
        vids = vids[(np.abs(stats.zscore(vids[col])) < 3)]
vids = vids[vids['commentCount'] > 100]
vids.index = pd.to_datetime(vids.date)
print("Shape after removing outliers",vids.shape)

Shape after removing outliers (1077, 22)


In [63]:
#6 equally distributed categories of likes/dislikes ratio
bin_labels = [0,1,2,3,4,5]
vids['ratio_bins'] = pd.qcut(vids['ratio'], q=6, labels = bin_labels)

We removed rows where columns were outside of 3 standard deviations. This is to reduce any skewing that might occur from outliers. We also decided to select videos with over 100 comments, because we feel if the amount of comments is too small, then we wont get robust results from the sentiment analysis of the comments. We also created a categorical version of our ratio column, for classification modelling later.