In [9]:
import json
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import wikipedia
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import re
from pandas.plotting import register_matplotlib_converters
from sklearn.utils import resample
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [10]:
vids = pd.read_csv('files/df_sans_zero_sentiments.csv')
#filter out outliers with not many comments (not enough comments for sentiment analysis to be robust)
print("dataframe before filtering: ",vids.shape)
for col in vids.columns[3:]:
    if str(vids[col].dtype) != 'object':
        vids = vids[(np.abs(stats.zscore(vids[col])) < 3)]
vids = vids[vids['commentCount'] > 100]
vids = vids.drop(['Unnamed: 0'],axis=1)
vids.index = pd.to_datetime(vids.date)
#6 equally distributed categories of likes/dislikes ratio
bin_labels = [0,1,2,3,4,5]
vids['ratio_bins'] = pd.qcut(vids['ratio'], q=6, labels = bin_labels)
print("dataframe shape after filtering outliers and low comment rows: ",vids.shape)
vids = vids[vids.title.duplicated()==False]
print("dataframe shape after removing duplicates",vids.shape)

dataframe before filtering:  (1394, 23)
dataframe shape after filtering outliers and low comment rows:  (1088, 23)
dataframe shape after removing duplicates (1083, 23)


In [11]:
drop_parameters = ['date','title','ratio','ratio_bins','dislikeCount','likeCount']
X = vids.drop(drop_parameters,axis=1)
y = vids['ratio_bins']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=42)

In [12]:
clf = GaussianNB()
knc = KNeighborsClassifier()
dt = DecisionTreeClassifier()
nnw = MLPClassifier()
svm = SVC(gamma='scale')

In [13]:
models = [clf, knc, dt, nnw, svm]
accuracy = []
for i in range(len(models)):
    models[i].fit(X_train, y_train)
    #y_train_pred = models[i].predict(X_train)
    y_test_pred = models[i].predict(X_test)
    #print("Accuracy on train: ", accuracy_score(y_train, y_train_pred))
    print(models[i])
    print("Accuracy on test: ", accuracy_score(y_test, y_test_pred))
    accuracy.append(accuracy_score(y_test, y_test_pred))
    #print("Confusion matrix on train set: ")
    #print(confusion_matrix(y_train, y_train_pred))
    print("\n")
    print("Confusion matrix on test set: ")
    print(confusion_matrix(y_test, y_test_pred))
    print("\n")

GaussianNB(priors=None, var_smoothing=1e-09)
Accuracy on test:  0.2903225806451613


Confusion matrix on test set: 
[[17 10  0  0  0 14]
 [ 5 11  1  0  0 17]
 [ 0  5  6  0  2 18]
 [ 5 11  2  0  0 17]
 [ 5  5  2  2  0 20]
 [ 3  7  0  1  2 29]]


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Accuracy on test:  0.1889400921658986


Confusion matrix on test set: 
[[15  7  8  1  4  6]
 [ 9  8  1  7  2  7]
 [ 9  5  7  4  4  2]
 [ 9  5  6  3  5  7]
 [ 6  5  7  9  3  4]
 [ 8  7  7  8  7  5]]


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                      



In [26]:
models = [clf, knc, dt, nnw, svm]
accuracy = []
for i in range(1, 31, 2):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(X_train, y_train)
    #y_train_pred = models[i].predict(X_train)
    y_test_pred = model.predict(X_test)
    #print("Accuracy on train: ", accuracy_score(y_train, y_train_pred))
    print(model)
    print("Accuracy on test: ", accuracy_score(y_test, y_test_pred))
    accuracy.append(accuracy_score(y_test, y_test_pred))
    #print("Confusion matrix on train set: ")
    #print(confusion_matrix(y_train, y_train_pred))
    print("\n")
    #print("Confusion matrix on test set: ")
    #print(confusion_matrix(y_test, y_test_pred))
    print("\n")

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Accuracy on test:  0.15207373271889402




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
Accuracy on test:  0.17050691244239632




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Accuracy on test:  0.1889400921658986




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')
Accuracy on test:  0.17511520737327188




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
        

In [30]:
rbf = {'kernel': ['rbf'], 'gamma': [0.1, 0.01, 0.001, 0.0001],'C': [0.001, 0.01, 0.1, 1, 10, 100, 100]}
    
linear = {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
    
tuned_parameters = [linear, rbf]
acc_scorer = make_scorer(accuracy_score)
grid_obj  = GridSearchCV(SVC(), tuned_parameters, cv=2, scoring=acc_scorer, n_jobs=-1, verbose = 5)
grid_obj  = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
print('best clf:', clf)
  
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred, average='macro'))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print(confusion_matrix(y_test, y_pred))

Fitting 2 folds for each of 34 candidates, totalling 68 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=5)
# clf = GridSearchCV(svm, parameters, cv=5)
# clf.fit(X_train, y_train)