In [1]:
#All these packages need to be installed from pip
#For ML
import sklearn
import sklearn.feature_extraction.text
import sklearn.decomposition
from sklearn import preprocessing, linear_model
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.datasets import fetch_20newsgroups, make_blobs
from sklearn.feature_extraction.text import TfidfVectorizer  #Feature extraction
from sklearn.naive_bayes import MultinomialNB #Our learner.
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestRegressor, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, SelectFromModel
import pandas as pd

import nltk #For tokenizing and normalizing
import numpy as np #arrays
import matplotlib.pyplot as plt #Plots
from matplotlib.colors import ListedColormap
import seaborn #Makes plots look nice, also heatmaps
import scipy as sp #for interp

%matplotlib inline

#These are from the standard library
import collections
import os
import os.path
import random
import re
import glob
import pandas
import requests
import json
import math



In [2]:
df = pandas.read_pickle('cmv_full_features.pkl')
df = df.sample(frac = .1)
#splitting data
data_train, data_test = train_test_split(df, test_size=0.3, random_state=123)
data_train['is_train'] = True
data_test['is_train'] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
#turn the training dataset into a tf-idf matrix
TFVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(max_df=100, min_df=2, ngram_range=(1, 10),stop_words='english', norm='l2')
TFVects = TFVectorizer.fit_transform(data_train['com_text'])
TFVects.shape #(3836, 32241)
#print(TFVects.shape

(2685, 20050)

In [4]:
#combining tfidf features with liguistic features and clustering labels
tfdf = pd.DataFrame(TFVects.toarray())
features_train = pd.concat([tfdf, data_train.reset_index()[['index', 'com_upvotes', 'KL', 'JS', 'kmeans', 'com_avg_pt_depth']]], axis = 1, ignore_index = False)
features_train.set_index('index', inplace = True)

In [23]:
#turn the test dataset into a tf-idf
TFVects_test = TFVectorizer.transform(data_test['com_text'])
tfdf_test = pd.DataFrame(TFVects_test.toarray())
features_test = pd.concat([tfdf_test, data_test.reset_index()[['index', 'com_upvotes', 'KL', 'JS', 'kmeans', 'com_avg_pt_depth']]], axis = 1, ignore_index = False)
features_test.set_index('index', inplace = True)

In [22]:
X = features_train
y = data_train['com_delta_received'].as_matrix()

# Idea 1: try classification for all posts

## Logistic regression

In [None]:
#combining tfidf features with liguistic features and clustering labels
tfdf = pd.DataFrame(TFVects.toarray())
features_train = pd.concat([tfdf, data_train.reset_index()[['index', 'com_upvotes', 'KL', 'JS', 'kmeans', 'com_avg_pt_depth']]], axis = 1, ignore_index = False)
features_train.set_index('index', inplace = True)


In [None]:
#Perform PCA on this matrix to pick significant features
#perform pca
PCA = sklearn.decomposition.PCA
pca = PCA().fit(features_train)
reduced_data = pca.transform(features_train)

In [None]:
#use scree plot to determine the number of dimensions
n = TFVects.shape[0]
fig = plt.figure(figsize=(12,5))
ax1 = fig.add_subplot(121)
eigen_vals = np.arange(n) + 1
ax1.plot(eigen_vals, pca.explained_variance_ratio_, 'ro-', linewidth=2)
ax1.set_title('Scree Plot')
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Proportion of Explained Variance')

ax2 = fig.add_subplot(122)
eigen_vals = np.arange(10) + 1
ax2.plot(eigen_vals, pca.explained_variance_ratio_[:10], 'ro-', linewidth=2)
ax2.set_title('Scree Plot (First 20 Principal Components)')
ax2.set_xlabel('Principal Component')
ax2.set_ylabel('Proportion of Explained Variance')
plt.show()

In [None]:
#predicting with first ten
X = reduced_data[:, :2]
Y = np.array([int(label) for label in data_train['com_delta_received']]) #Transform our predictor variable. 
              
#fitting logistic regresion
logistic = linear_model.LogisticRegression()
logistic.fit(X, Y)
print("This logistic model using top two components fits {} of our training set".format(logistic.score(X,Y)))


In [None]:
#turn the test dataset into a tf-idf
TFVects_test = TFVectorizer.transform(data_test['com_text'])
tfdf_test = pd.DataFrame(TFVects_test.toarray())
features_test = pd.concat([tfdf_test, data_test.reset_index()[['index', 'com_upvotes', 'KL', 'JS', 'kmeans', 'com_avg_pt_depth']]], axis = 1, ignore_index = False)
features_test.set_index('index', inplace = True)
reduced_data_test = pca.transform(features_test)

In [None]:
X_test = reduced_data_test[:, :2]
Y_test = np.array([int(label) for label in data_test['com_delta_received']])
print("This logistic model using top ten components fits {} of our testing set".format(logistic.score(X_test, Y_test)))

In [None]:
#Evaluation stats
print("Precision:", sklearn.metrics.precision_score(Y, logistic.predict(X), average = 'weighted')) #precision
print("Recall:",sklearn.metrics.recall_score(Y, logistic.predict(X), average = 'weighted')) #recall
print("F-measure:",sklearn.metrics.f1_score(Y, logistic.predict(X), average = 'weighted')) #F-1 measure

In [None]:
#ROC curve
x, y, _ = sklearn.metrics.roc_curve(Y, logistic.predict_proba(X)[:,1])
roc_auc = sklearn.metrics.auc(x,y)

plt.figure()
plt.plot(x,y, color = 'darkorange', lw = 2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
#Problem: we lost track of the feature names after PCA

## Binary Naive Bayes

## Decision tree

In [None]:
tree = DecisionTreeClassifier(max_depth=4,random_state=0).fit(reduced_data, data_train['com_delta_received'])

In [None]:
labels = tree.predict(reduced_data_test)

In [None]:
mat = confusion_matrix(data_test['com_delta_received'], labels)
seaborn.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=['delta_not_received', 'delta_received'], yticklabels=['delta_not_received', 'delta_received'])
plt.xlabel('true label')
plt.ylabel('predicted label');

In [None]:
print('Precision: {}'.format(sklearn.metrics.precision_score(data_test['com_delta_received'], labels, average = 'weighted'))) 
print('Recall: {}'.format(sklearn.metrics.recall_score(data_test['com_delta_received'], labels, average = 'weighted'))) 
print('F1 Score: {}'.format(sklearn.metrics.f1_score(data_test['com_delta_received'], labels, average = 'weighted'))) 

labels = [1 if dr else 0 for dr in labels]
probs = tree.predict_proba(reduced_data_test)
print('AUC Score: {}'.format(sklearn.metrics.roc_auc_score(data_test['com_delta_received'], probs[:,1])))



## Neural Nets - multi-layer perceptron

In [24]:
#initialize the model
mlp_clf = MLPClassifier()

#fit the model
mlp_clf.fit(X, y)

mlp_labels = [mlp_clf.predict(v)[0] for v in features_test]

ValueError: shapes (1,1) and (20055,100) not aligned: 1 (dim 1) != 20055 (dim 0)

In [None]:
#Confusion matrix
mat = confusion_matrix(data_test['com_delta_received'], mlp_labels)
seaborn.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=['delta_not_received', 'delta_received'], yticklabels=['delta_not_received', 'delta_received'])
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
#data_test['com_delta_received']
#mlp_labels

In [None]:
#ROC curve
x, y, _ = sklearn.metrics.roc_curve(data_test['com_delta_received'], mlp_labels)
roc_auc = sklearn.metrics.auc(x,y)

plt.figure()
plt.plot(x,y, color = 'darkorange', lw = 2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

# Idea 2: try classification sparately for each "cluster"

# Feature extraction

Idea: We realized that none of the implementations of the machine learning algorithms actually provides us with the list of features that are most reliable to classify comments. Thus, we are using feature extraction tools from sklearn.

Oh, we've observed that the features are very stable across algorithms, we could interpret these features by finding them in comments and see if these comments receive delta or not

## feature extraction using SelectKBest

In [5]:
X = features_train
y = data_train['com_delta_received'].as_matrix()

# def extract_f (func, k):
#     selector = sklearn.feature_selection.SelectKBest(func, k=k).fit(X,y)
#     X_new = selector.transform(X)
#     X_inverse = selector.inverse_transform(X_new)
#     X_inverse = pd.DataFrame(X_inverse)
    
#     l = X_inverse.sum(axis = 0)
#     feature_indices = []
#     for i in range(l.shape[0]):
#         if l[i] != 0:
#            feature_indices.append(i)
#     #feature_indices
#     selected_features = features_train.columns[feature_indices].get_values()
#     return selected_features

def extract_KBest(func, k):
    selector = SelectKBest(func, k=k).fit(X,y)
    feature_indices = selector.get_support(indices=True)
    
    selected_features = features_train.columns[feature_indices].get_values()
    scores = selector.scores_[feature_indices]
    pvalues = selector.pvalues_[feature_indices]
    return (selected_features, scores, pvalues)
    


In [6]:
#f_classif
selected_features_f, scores_f, pvalues_f = extract_KBest(sklearn.feature_selection.f_classif, 20)
selected_features_f

array([518, 2126, 4005, 4008, 5527, 6871, 8004, 8257, 8404, 8853, 9388,
       11195, 11720, 13361, 13487, 13560, 14947, 15896, 19048,
       'com_upvotes'], dtype=object)

In [7]:
tf_features_indices_f = selected_features_f[:-1].astype(int)
all_features_f  = np.array(TFVectorizer.get_feature_names())[tf_features_indices_f].tolist()
all_features_f.append(selected_features_f[-1])
#all_features_f
d = {'all_features_f': all_features_f, 'scores_f': scores_f, 'pvalues_f': pvalues_f}
F_f_DF = pd.DataFrame(data = d).sort(columns='scores_f', axis=0, ascending=False).reset_index(drop = True)
F_f_DF



Unnamed: 0,all_features_f,pvalues_f,scores_f
0,com_upvotes,2.3026400000000002e-66,313.144981
1,iii,1.559025e-28,125.6552
2,setbacks,1.5797950000000002e-28,125.627707
3,pragmatism,1.767147e-28,125.394904
4,republican establishment,1.9998610000000002e-28,125.137948
5,populism,1.9998610000000002e-28,125.137948
6,integration,2.747173e-28,124.478618
7,demand multiple,3.29895e-28,124.098604
8,money like,4.555053e-28,123.428897
9,goal students,6.8968360000000005e-28,122.568095


In [21]:
#chi2 - we can't use this algo because some values of upvotes are negative
selector = SelectKBest(mutual_info_classif, 20).fit(X,y)
    #feature_indices = selector.get_support(indices=True)
    
    #selected_features = features_train.columns[feature_indices].get_values()
    #scores = selector.scores_[feature_indices]
    #pvalues = selector.pvalues_[feature_indices]

In [None]:
feature_indices = selector.get_support(indices=True)

In [20]:
#mutual_info_classif
selected_features_m = extract_KBest(mutual_info_classif, 20)
selected_features_m

TypeError: 'NoneType' object is not subscriptable

In [None]:
tf_features_indices_m = selected_features_m[:-2].astype(int)
all_features_m  = np.array(TFVectorizer.get_feature_names())[tf_features_indices_m].tolist()
all_features_m.extend(selected_features_m[-2:])
#all_features_m
d = {'all_features_m': all_features_m, 'scores_m': scores_m, 'pvalues_m': pvalues_m}
F_m_DF = pd.DataFrame(data = d).sort(columns='scores_m', axis=0, ascending=False).reset_index(drop = True)
F_m_DF

In [None]:
F_m_DF

## Feature extraction using Tree-based feature selection

In [None]:
#testing
#feature_indices = SelectFromModel(tree_clf, prefit=True).get_support(indices=True)
#tree_clf.feature_importances_[feature_indices]
#selected_features = features_train.columns[feature_indices].get_values()

In [None]:
#pd.DataFrame(tree_clf.feature_importances_).sort(columns = [0],axis=0, ascending=False).iloc[:feature_indices.shape[0], :]

In [41]:
def extract_model(clf_fitted):
    feature_indices = SelectFromModel(clf_fitted, prefit=True).get_support(indices=True)
    try:
        score = clf_fitted.feature_importances_[feature_indices]
    except:
        score = None
    selected_features = features_train.columns[feature_indices].get_values()
    return (selected_features, score)

In [36]:
tree_clf = DecisionTreeClassifier(max_depth=10,random_state=0).fit(X, y)

In [37]:
selected_features_t1, scores_t1 = extract_model(tree_clf)
selected_features_t1

array([2126, 2344, 2624, 7181, 7320, 7720, 8004, 9035, 9084, 10669, 13361,
       15896, 16815, 16952, 17375, 17493, 17708, 19144, 19664,
       'com_upvotes', 'kmeans', 'com_avg_pt_depth'], dtype=object)

In [38]:
selected_features_t1.shape

(22,)

In [28]:
tf_features_indices_t1 = selected_features_t1[:-3].astype(int)
all_features_t1  = np.array(TFVectorizer.get_feature_names())[tf_features_indices_t1].tolist()
all_features_t1.extend(selected_features_t1[-3:])
#all_features_t1

d = {'all_features_t1': all_features_t1, 'scores_t1': scores_t1}
F_t1_DF = pd.DataFrame(data = d).sort(columns='scores_t1', axis=0, ascending=False).reset_index(drop = True)
F_t1_DF.iloc[:20, :]



Unnamed: 0,all_features_t1,scores_t1
0,com_upvotes,0.142943
1,wonder,0.064636
2,massive,0.060952
3,caliphate,0.05916
4,setbacks,0.05907
5,iii,0.058979
6,populism,0.05889
7,killing,0.054647
8,com_avg_pt_depth,0.050429
9,want equality,0.039423


In [29]:
tree2_clf = ExtraTreesClassifier(n_estimators=250,random_state=0).fit(X, y)

In [15]:
selected_features_t2, scores_t2 = extract_model(tree2_clf)
#selected_features_t2[2700:]

In [31]:
selected_features_t2[2600:]

array([19878, 19918, 19931, 19935, 19938, 19945, 19946, 19948, 19956,
       19960, 19983, 19984, 19985, 19986, 19988, 19990, 19991, 19993,
       20014, 20016, 20024, 20034, 20047, 'com_upvotes', 'KL', 'JS',
       'kmeans', 'com_avg_pt_depth'], dtype=object)

In [32]:
tf_features_indices_t2 = selected_features_t2[:-5].astype(int)
all_features_t2  = np.array(TFVectorizer.get_feature_names())[tf_features_indices_t2].tolist()
all_features_t2.extend(selected_features_t2[-5:])
#all_features_t1

d = {'all_features_t2': all_features_t2, 'scores_t2': scores_t2}
F_t2_DF = pd.DataFrame(data = d).sort(columns='scores_t2', axis=0, ascending=False).reset_index(drop = True)
F_t2_DF.iloc[:20, :]



Unnamed: 0,all_features_t2,scores_t2
0,com_upvotes,0.142943
1,wonder,0.064636
2,massive,0.060952
3,caliphate,0.05916
4,setbacks,0.05907
5,iii,0.058979
6,populism,0.05889
7,killing,0.054647
8,com_avg_pt_depth,0.050429
9,want equality,0.039423


In [33]:
F_DF = pd.concat([F_f_DF, F_t1_DF.iloc[:20, :], F_t2_DF.iloc[:20, :]], axis =1)
F_DF

Unnamed: 0,all_features_f,pvalues_f,scores_f,all_features_t1,scores_t1,all_features_t2,scores_t2
0,com_upvotes,2.3026400000000002e-66,313.144981,com_upvotes,0.142943,com_upvotes,0.142943
1,iii,1.559025e-28,125.6552,wonder,0.064636,wonder,0.064636
2,setbacks,1.5797950000000002e-28,125.627707,massive,0.060952,massive,0.060952
3,pragmatism,1.767147e-28,125.394904,caliphate,0.05916,caliphate,0.05916
4,republican establishment,1.9998610000000002e-28,125.137948,setbacks,0.05907,setbacks,0.05907
5,populism,1.9998610000000002e-28,125.137948,iii,0.058979,iii,0.058979
6,integration,2.747173e-28,124.478618,populism,0.05889,populism,0.05889
7,demand multiple,3.29895e-28,124.098604,killing,0.054647,killing,0.054647
8,money like,4.555053e-28,123.428897,com_avg_pt_depth,0.050429,com_avg_pt_depth,0.050429
9,goal students,6.8968360000000005e-28,122.568095,want equality,0.039423,want equality,0.039423


## Feature extraction using Neural nets - mlp

In [None]:
log_clf = linear_model.LogisticRegression()
log_clf.fit(X, Y)

In [42]:
selected_features_log, scores_log = extract_model(log_clf)
selected_features_log.shape

ValueError: The underlying estimator MLPClassifier has no `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to SelectFromModel or call fit before calling transform.

In [None]:
scores_log

In [None]:
tf_features_indices_log = selected_features_log[:-5].astype(int)
all_features_log = np.array(TFVectorizer.get_feature_names())[tf_features_indices_log].tolist()
all_features_log.extend(selected_features_log[-5:])
#all_features_t1

d = {'all_features_log': all_features_log, 'scores_lo': scores_t2}
F_t2_DF = pd.DataFrame(data = d).sort(columns='scores_t2', axis=0, ascending=False).reset_index(drop = True)
F_t2_DF.iloc[:20, :]