In [35]:
import json
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import wikipedia
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import re
from pandas.plotting import register_matplotlib_converters
from sklearn.utils import resample
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

In [36]:
vids = pd.read_csv('files/df_sans_zero_sentiments.csv')
#filter out outliers with not many comments (not enough comments for sentiment analysis to be robust)
print("dataframe before filtering: ",vids.shape)
for col in vids.columns[3:]:
    if str(vids[col].dtype) != 'object':
        vids = vids[(np.abs(stats.zscore(vids[col])) < 3)]
vids = vids[vids['commentCount'] > 100]
vids = vids.drop(['Unnamed: 0'],axis=1)
vids.index = pd.to_datetime(vids.date)
#6 equally distributed categories of likes/dislikes ratio
bin_labels = [0,1,2,3,4,5]
vids['ratio_bins'] = pd.qcut(vids['ratio'], q=6, labels = bin_labels)
print("dataframe shape after filtering outliers and low comment rows: ",vids.shape)
vids = vids[vids.title.duplicated()==False]
print("dataframe shape after removing duplicates",vids.shape)

dataframe before filtering:  (1394, 23)
dataframe shape after filtering outliers and low comment rows:  (1088, 23)
dataframe shape after removing duplicates (1083, 23)


In [37]:
drop_parameters = ['date','title','ratio','ratio_bins','dislikeCount','likeCount','magMax','viewCount','commentCount']
X = vids.drop(drop_parameters,axis=1)
y = vids['ratio_bins']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=42)

In [38]:
clf = GaussianNB()
knc = KNeighborsClassifier()
dt = DecisionTreeClassifier()
nnw = MLPClassifier()
svm = SVC(gamma='scale')

In [39]:
models = [clf, knc, dt, nnw, svm]
accuracy = []
for i in range(len(models)):
    models[i].fit(X_train, y_train)
    #y_train_pred = models[i].predict(X_train)
    y_test_pred = models[i].predict(X_test)
    #print("Accuracy on train: ", accuracy_score(y_train, y_train_pred))
    print(models[i])
    print("Accuracy on test: ", accuracy_score(y_test, y_test_pred))
    accuracy.append(accuracy_score(y_test, y_test_pred))
    #print("Confusion matrix on train set: ")
    #print(confusion_matrix(y_train, y_train_pred))
    print("\n")
    print("Confusion matrix on test set: ")
    print(confusion_matrix(y_test, y_test_pred))
    print("\n")

GaussianNB(priors=None, var_smoothing=1e-09)
Accuracy on test:  0.34101382488479265


Confusion matrix on test set: 
[[26  9  2  1  3  0]
 [ 9 12  1  5  2  5]
 [ 5  7  0  5  6  8]
 [ 6 10  1  7  2  9]
 [ 4  4  1  7  7 11]
 [ 4  4  2  5  5 22]]


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Accuracy on test:  0.19815668202764977


Confusion matrix on test set: 
[[18  7  3  4  7  2]
 [ 9  8  9  3  2  3]
 [ 8  9  5  3  1  5]
 [ 6  7  5  6  7  4]
 [ 7  7  3  8  2  7]
 [ 9  5  8  6 10  4]]


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                    

In [40]:
models = [clf, knc, dt, nnw, svm]
accuracy = []
for i in range(1, 31, 2):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(X_train, y_train)
    #y_train_pred = models[i].predict(X_train)
    y_test_pred = model.predict(X_test)
    #print("Accuracy on train: ", accuracy_score(y_train, y_train_pred))
    print(model)
    print("Accuracy on test: ", accuracy_score(y_test, y_test_pred))
    accuracy.append(accuracy_score(y_test, y_test_pred))
    #print("Confusion matrix on train set: ")
    #print(confusion_matrix(y_train, y_train_pred))
    print("\n")
    #print("Confusion matrix on test set: ")
    #print(confusion_matrix(y_test, y_test_pred))
    print("\n")

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Accuracy on test:  0.20276497695852536




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
Accuracy on test:  0.2350230414746544




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Accuracy on test:  0.19815668202764977




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')
Accuracy on test:  0.1935483870967742




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
         

In [41]:
rbf = {'kernel': ['rbf'], 'gamma': [0.1, 0.01, 0.001, 0.0001],'C': [0.001, 0.01, 0.1, 1, 10, 100, 100]}
    
linear = {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
    
tuned_parameters = [linear, rbf]
acc_scorer = make_scorer(accuracy_score)
grid_obj  = GridSearchCV(SVC(), tuned_parameters, cv=10, scoring=acc_scorer, n_jobs=-1, verbose = 5)
grid_obj  = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
print('best clf:', clf)
  
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred, average='macro'))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print(confusion_matrix(y_test, y_pred))

Fitting 10 folds for each of 34 candidates, totalling 340 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 340 out of 340 | elapsed:  3.3min finished


best clf: SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
Accuracy:  0.35023041474654376
F1:  0.33769549471503985
Precision:  0.33912702813912493
Recall:  0.3412479127502037
[[24  6  3  3  4  1]
 [ 8  9  3  4  7  3]
 [ 2  6 11  3  4  5]
 [ 3  5  7  4 11  5]
 [ 3  4  4  7 11  5]
 [ 4  1  4  5 11 17]]


In [None]:
# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=5)
# clf = GridSearchCV(svm, parameters, cv=5)
# clf.fit(X_train, y_train)

In [32]:
clf = LogisticRegressionCV(cv=10, verbose=2, n_jobs=-1, random_state=0, multi_class='auto').fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.5s finished


In [42]:
predict = clf.predict(X_test)

In [43]:
accuracy_score(y_test, predict)

0.35023041474654376