In [8]:
#All these packages need to be installed from pip
#For ML
import sklearn
import sklearn.feature_extraction.text
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn import preprocessing, linear_model
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.datasets import fetch_20newsgroups, make_blobs
from sklearn.feature_extraction.text import TfidfVectorizer  #Feature extraction
from sklearn.naive_bayes import MultinomialNB #Our learner.
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestRegressor, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, SelectFromModel
import pandas as pd

import nltk #For tokenizing and normalizing
import numpy as np #arrays
import matplotlib.pyplot as plt #Plots
from matplotlib.colors import ListedColormap
import seaborn #Makes plots look nice, also heatmaps
import scipy as sp #for interp

%matplotlib inline

#These are from the standard library
import collections
import os
import os.path
import random
import re
import glob
import pandas
import requests
import json
import math

In [2]:
df = pandas.read_pickle('cmv_full_features.pkl')
df = df.sample(frac = .1)
#splitting data
data_train, data_test = train_test_split(df, test_size=0.3, random_state=123)
data_train['is_train'] = True
data_test['is_train'] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
#turn the training dataset into a tf-idf matrix
TFVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(max_df=100, min_df=2, ngram_range=(1, 10),stop_words='english', norm='l2')
TFVects = TFVectorizer.fit_transform(data_train['com_text'])
TFVects.shape #(3836, 32241)
#print(TFVects.shape

(2685, 21708)

In [10]:
#Perform PCA on this matrix to pick significant features
#perform pca
SVD = TruncatedSVD(n_components=1000, random_state=123)
reduced_data = SVD.fit_transform(TFVects)
reduced_data.shape

numpy.ndarray

In [11]:
#combining tfidf features with liguistic features and clustering labels
tfdf = pd.DataFrame(reduced_data)
features_train = pd.concat([tfdf, data_train.reset_index()[['index', 'com_upvotes', 'KL', 'JS', 'kmeans', 'com_avg_pt_depth']]], axis = 1, ignore_index = False)
features_train.set_index('index', inplace = True)

In [12]:
#turn the test dataset into a tf-idf
TFVects_test = TFVectorizer.transform(data_test['com_text'])
reduced_test = SVD.transform(TFVects_test)
tfdf_test = pd.DataFrame(reduced_test)
features_test = pd.concat([tfdf_test, data_test.reset_index()[['index', 'com_upvotes', 'KL', 'JS', 'kmeans', 'com_avg_pt_depth']]], axis = 1, ignore_index = False)
features_test.set_index('index', inplace = True)

In [7]:
X = features_train
y = data_train['com_delta_received'].as_matrix()

# Idea 1: try classification for all posts

## Logistic regression

In [None]:
#combining tfidf features with liguistic features and clustering labels
tfdf = pd.DataFrame(TFVects.toarray())
features_train = pd.concat([tfdf, data_train.reset_index()[['index', 'com_upvotes', 'KL', 'JS', 'kmeans', 'com_avg_pt_depth']]], axis = 1, ignore_index = False)
features_train.set_index('index', inplace = True)


(2685, 1000)

In [None]:
#use scree plot to determine the number of dimensions
n = TFVects.shape[0]
fig = plt.figure(figsize=(12,5))
ax1 = fig.add_subplot(121)
eigen_vals = np.arange(n) + 1
ax1.plot(eigen_vals, pca.explained_variance_ratio_, 'ro-', linewidth=2)
ax1.set_title('Scree Plot')
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Proportion of Explained Variance')

ax2 = fig.add_subplot(122)
eigen_vals = np.arange(10) + 1
ax2.plot(eigen_vals, pca.explained_variance_ratio_[:10], 'ro-', linewidth=2)
ax2.set_title('Scree Plot (First 20 Principal Components)')
ax2.set_xlabel('Principal Component')
ax2.set_ylabel('Proportion of Explained Variance')
plt.show()

In [None]:
#predicting with first ten
X = reduced_data[:, :2]
Y = np.array([int(label) for label in data_train['com_delta_received']]) #Transform our predictor variable. 
              
#fitting logistic regresion
logistic = linear_model.LogisticRegression()
logistic.fit(X, Y)
print("This logistic model using top two components fits {} of our training set".format(logistic.score(X,Y)))


In [None]:
#turn the test dataset into a tf-idf
TFVects_test = TFVectorizer.transform(data_test['com_text'])
tfdf_test = pd.DataFrame(TFVects_test.toarray())
features_test = pd.concat([tfdf_test, data_test.reset_index()[['index', 'com_upvotes', 'KL', 'JS', 'kmeans', 'com_avg_pt_depth']]], axis = 1, ignore_index = False)
features_test.set_index('index', inplace = True)
reduced_data_test = pca.transform(features_test)

In [None]:
X_test = reduced_data_test[:, :2]
Y_test = np.array([int(label) for label in data_test['com_delta_received']])
print("This logistic model using top ten components fits {} of our testing set".format(logistic.score(X_test, Y_test)))

In [None]:
#Evaluation stats
print("Precision:", sklearn.metrics.precision_score(Y, logistic.predict(X), average = 'weighted')) #precision
print("Recall:",sklearn.metrics.recall_score(Y, logistic.predict(X), average = 'weighted')) #recall
print("F-measure:",sklearn.metrics.f1_score(Y, logistic.predict(X), average = 'weighted')) #F-1 measure

In [None]:
#ROC curve
x, y, _ = sklearn.metrics.roc_curve(Y, logistic.predict_proba(X)[:,1])
roc_auc = sklearn.metrics.auc(x,y)

plt.figure()
plt.plot(x,y, color = 'darkorange', lw = 2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
#Problem: we lost track of the feature names after PCA

## Binary Naive Bayes

## Decision tree

In [None]:
tree = DecisionTreeClassifier(max_depth=4,random_state=0).fit(reduced_data, data_train['com_delta_received'])

In [None]:
labels = tree.predict(reduced_data_test)

In [None]:
mat = confusion_matrix(data_test['com_delta_received'], labels)
seaborn.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=['delta_not_received', 'delta_received'], yticklabels=['delta_not_received', 'delta_received'])
plt.xlabel('true label')
plt.ylabel('predicted label');

In [None]:
print('Precision: {}'.format(sklearn.metrics.precision_score(data_test['com_delta_received'], labels, average = 'weighted'))) 
print('Recall: {}'.format(sklearn.metrics.recall_score(data_test['com_delta_received'], labels, average = 'weighted'))) 
print('F1 Score: {}'.format(sklearn.metrics.f1_score(data_test['com_delta_received'], labels, average = 'weighted'))) 

labels = [1 if dr else 0 for dr in labels]
probs = tree.predict_proba(reduced_data_test)
print('AUC Score: {}'.format(sklearn.metrics.roc_auc_score(data_test['com_delta_received'], probs[:,1])))



## Neural Nets - multi-layer perceptron

In [24]:
#initialize the model
mlp_clf = MLPClassifier()

#fit the model
mlp_clf.fit(X, y)

#mlp_labels = [mlp_clf.predict(v)[0] for v in features_test]

ValueError: shapes (1,1) and (20055,100) not aligned: 1 (dim 1) != 20055 (dim 0)

In [None]:
#Confusion matrix
mat = confusion_matrix(data_test['com_delta_received'], mlp_labels)
seaborn.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=['delta_not_received', 'delta_received'], yticklabels=['delta_not_received', 'delta_received'])
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
#data_test['com_delta_received']
#mlp_labels

In [None]:
#ROC curve
x, y, _ = sklearn.metrics.roc_curve(data_test['com_delta_received'], mlp_labels)
roc_auc = sklearn.metrics.auc(x,y)

plt.figure()
plt.plot(x,y, color = 'darkorange', lw = 2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

# Idea 2: try classification sparately for each "cluster"

# Feature extraction

Idea: We realized that none of the implementations of the machine learning algorithms actually provides us with the list of features that are most reliable to classify comments. Thus, we are using feature extraction tools from sklearn.

Oh, we've observed that the features are very stable across algorithms, we could interpret these features by finding them in comments and see if these comments receive delta or not

## feature extraction using SelectKBest

In [14]:
X = features_train
y = data_train['com_delta_received'].as_matrix()

# def extract_f (func, k):
#     selector = sklearn.feature_selection.SelectKBest(func, k=k).fit(X,y)
#     X_new = selector.transform(X)
#     X_inverse = selector.inverse_transform(X_new)
#     X_inverse = pd.DataFrame(X_inverse)
    
#     l = X_inverse.sum(axis = 0)
#     feature_indices = []
#     for i in range(l.shape[0]):
#         if l[i] != 0:
#            feature_indices.append(i)
#     #feature_indices
#     selected_features = features_train.columns[feature_indices].get_values()
#     return selected_features

def extract_KBest(func, k):
    selector = SelectKBest(func, k=k).fit(X,y)
    feature_indices = selector.get_support(indices=True)
    
    selected_features = features_train.columns[feature_indices].get_values()
    scores = selector.scores_[feature_indices]
    if selector.pvalues_ != None:
        pvalues = selector.pvalues_[feature_indices]
    else:
        pvalues = None
    return (selected_features, scores, pvalues)
    


In [15]:
#f_classif
selected_features_f, scores_f, pvalues_f = extract_KBest(sklearn.feature_selection.f_classif, 20)
selected_features_f



array([384, 863, 2025, 4463, 5548, 6333, 6851, 7781, 8059, 8118, 9262,
       9302, 11482, 13125, 14954, 16534, 16912, 18420, 19393, 'com_upvotes'], dtype=object)

In [10]:
tf_features_indices_f = selected_features_f[:-1].astype(int)
all_features_f  = np.array(TFVectorizer.get_feature_names())[tf_features_indices_f].tolist()
all_features_f.append(selected_features_f[-1])
#all_features_f
d = {'all_features_f': all_features_f, 'scores_f': scores_f, 'pvalues_f': pvalues_f}
F_f_DF = pd.DataFrame(data = d).sort(columns='scores_f', axis=0, ascending=False).reset_index(drop = True)
F_f_DF



Unnamed: 0,all_features_f,pvalues_f,scores_f
0,time police,5.388721e-22,94.587004
1,experts make,8.00642e-22,93.775769
2,interesting questions,9.942654e-22,93.332126
3,successes,1.954836e-21,91.94795
4,rug,2.828819e-21,91.1917
5,sealed,9.131385e-21,88.79542
6,procedural,2.887651e-20,86.443841
7,fine trying,3.101834e-20,86.297787
8,orthodoxy,3.9973389999999995e-20,85.780129
9,demands respect,8.612426999999999e-20,84.214341


In [49]:
#chi2 - we can't use this algo because some values of upvotes are negative


In [16]:
#mutual_info_classif
selected_features_m, scores_m, pvalues_m = extract_KBest(mutual_info_classif, 20)
selected_features_m

array([1123, 4898, 5642, 5680, 7562, 7903, 8589, 8751, 9462, 10929, 13489,
       14181, 14733, 16701, 16834, 17131, 18657, 20223, 21364,
       'com_upvotes'], dtype=object)

In [18]:
pvalues_m

In [19]:
tf_features_indices_m = selected_features_m[:-1].astype(int)
all_features_m  = np.array(TFVectorizer.get_feature_names())[tf_features_indices_m].tolist()
all_features_m.extend(selected_features_m[-1:])
#all_features_m
d = {'all_features_m': all_features_m, 'scores_m': scores_m}
F_m_DF = pd.DataFrame(data = d).sort(columns='scores_m', axis=0, ascending=False).reset_index(drop = True)
F_m_DF



Unnamed: 0,all_features_m,scores_m
0,com_upvotes,0.01985
1,location,0.011513
2,say agree,0.011031
3,imaginable,0.010478
4,discriminates,0.010373
5,set aside,0.010006
6,edit2,0.009973
7,isn state,0.009962
8,present problem,0.009773
9,people access,0.009716


In [20]:
F_m_DF

Unnamed: 0,all_features_m,scores_m
0,com_upvotes,0.01985
1,location,0.011513
2,say agree,0.011031
3,imaginable,0.010478
4,discriminates,0.010373
5,set aside,0.010006
6,edit2,0.009973
7,isn state,0.009962
8,present problem,0.009773
9,people access,0.009716


## Feature extraction using Tree-based feature selection

In [22]:
def extract_model(clf_fitted):
    feature_indices = SelectFromModel(clf_fitted, prefit=True).get_support(indices=True)
    try:
        score = clf_fitted.feature_importances_[feature_indices]
    except:
        score = None
    selected_features = features_train.columns[feature_indices].get_values()
    return (selected_features, score)

In [27]:
#Decision tree
tree_clf = DecisionTreeClassifier(max_depth=10,random_state=0).fit(X, y)

In [28]:
selected_features_t1, scores_t1 = extract_model(tree_clf)
selected_features_t1

array([20, 247, 863, 901, 3401, 4463, 4895, 5007, 5679, 6844, 6861, 7268,
       8130, 9928, 10243, 12130, 13125, 14597, 14971, 16465, 18334, 19975,
       'com_upvotes', 'JS'], dtype=object)

In [38]:
selected_features_t1.shape

(22,)

In [29]:
tf_features_indices_t1 = selected_features_t1[:-2].astype(int)
all_features_t1  = np.array(TFVectorizer.get_feature_names())[tf_features_indices_t1].tolist()
all_features_t1.extend(selected_features_t1[-2:])
#all_features_t1

d = {'all_features_t1': all_features_t1, 'scores_t1': scores_t1}
F_t1_DF = pd.DataFrame(data = d).sort(columns='scores_t1', axis=0, ascending=False).reset_index(drop = True)
F_t1_DF.iloc[:20, :]



Unnamed: 0,all_features_t1,scores_t1
0,com_upvotes,0.111151
1,effective,0.088666
2,role,0.068996
3,mods,0.057495
4,ability,0.055968
5,fine,0.047784
6,als,0.04698
7,demands respect,0.046903
8,orthodoxy,0.046827
9,finland,0.04675


In [23]:
#Extra decision tree
#fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset 
#and use averaging to improve the predictive accuracy and control over-fitting
tree2_clf = ExtraTreesClassifier(n_estimators=250,random_state=0).fit(X, y)

In [24]:
selected_features_t2, scores_t2 = extract_model(tree2_clf)
#selected_features_t2[2700:]

In [25]:
selected_features_t2[2600:]

array([20754, 20761, 20770, 20802, 20804, 20811, 20812, 20815, 20816,
       20821, 20822, 20854, 20857, 20867, 20879, 20880, 20889, 20900,
       20915, 20916, 20918, 20924, 20934, 20939, 20942, 20950, 20967,
       20970, 20974, 20980, 20987, 20996, 21006, 21008, 21011, 21016,
       21020, 21035, 21046, 21063, 21076, 21101, 21120, 21122, 21123,
       21133, 21147, 21219, 21224, 21259, 21261, 21301, 21318, 21324,
       21334, 21338, 21392, 21399, 21403, 21405, 21406, 21416, 21469,
       21517, 21518, 21523, 21524, 21526, 21533, 21567, 21576, 21622,
       21644, 21649, 21655, 21662, 21682, 21684, 21687, 21688, 21702,
       21707, 21712, 21741, 21742, 21758, 'com_upvotes', 'KL', 'JS',
       'kmeans', 'com_avg_pt_depth'], dtype=object)

In [26]:
tf_features_indices_t2 = selected_features_t2[:-5].astype(int)
all_features_t2  = np.array(TFVectorizer.get_feature_names())[tf_features_indices_t2].tolist()
all_features_t2.extend(selected_features_t2[-5:])
#all_features_t1

d = {'all_features_t2': all_features_t2, 'scores_t2': scores_t2}
F_t2_DF = pd.DataFrame(data = d).sort(columns='scores_t2', axis=0, ascending=False).reset_index(drop = True)
F_t2_DF.iloc[:20, :]



Unnamed: 0,all_features_t2,scores_t2
0,interesting questions,0.005571
1,demands respect,0.005413
2,experts make,0.004848
3,com_upvotes,0.004413
4,orthodoxy,0.004285
5,procedural,0.004112
6,fine trying,0.004042
7,rug,0.003975
8,als,0.003891
9,dysfunction,0.003768


In [31]:
F_DF = pd.concat([F_f_DF, F_m_DF, F_t1_DF.iloc[:20, :], F_t2_DF.iloc[:20, :]], axis =1)
F_DF

Unnamed: 0,all_features_f,pvalues_f,scores_f,all_features_m,scores_m,all_features_t1,scores_t1,all_features_t2,scores_t2
0,time police,5.388721e-22,94.587004,com_upvotes,0.01985,com_upvotes,0.111151,interesting questions,0.005571
1,experts make,8.00642e-22,93.775769,location,0.011513,effective,0.088666,demands respect,0.005413
2,interesting questions,9.942654e-22,93.332126,say agree,0.011031,role,0.068996,experts make,0.004848
3,successes,1.954836e-21,91.94795,imaginable,0.010478,mods,0.057495,com_upvotes,0.004413
4,rug,2.828819e-21,91.1917,discriminates,0.010373,ability,0.055968,orthodoxy,0.004285
5,sealed,9.131385e-21,88.79542,set aside,0.010006,fine,0.047784,procedural,0.004112
6,procedural,2.887651e-20,86.443841,edit2,0.009973,als,0.04698,fine trying,0.004042
7,fine trying,3.101834e-20,86.297787,isn state,0.009962,demands respect,0.046903,rug,0.003975
8,orthodoxy,3.9973389999999995e-20,85.780129,present problem,0.009773,orthodoxy,0.046827,als,0.003891
9,demands respect,8.612426999999999e-20,84.214341,people access,0.009716,finland,0.04675,dysfunction,0.003768


In [48]:
## Feature extraction using Neural nets - mlp

# log_clf = linear_model.LogisticRegression()
# log_clf.fit(X, y)
# selected_features_log, scores_log = extract_model(log_clf)
# selected_features_log.shape
# selected_features_log[4500:]
# tf_features_indices_log = selected_features_log[:-1].astype(int)
# all_features_log = np.array(TFVectorizer.get_feature_names())[tf_features_indices_log].tolist()
# all_features_log.extend(selected_features_log[-1:])
#all_features_t1

# d = {'all_features_log': all_features_log, 'scores_log': scores_log}
# F_log_DF = pd.DataFrame(data = d).sort(columns='scores_log', axis=0, ascending=False).reset_index(drop = True)
# F_log_DF.iloc[:20, :]
#log_decisionDF = pd.DataFrame(log_clf.decision_function(X))#.sort(columns = [0], ascending = False)
#log_clf.densify(）