In [1]:
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
import numpy as np
from statistics import mean
import pandas as pd
pd.set_option('display.max_columns', None)

In [6]:
path_to_csv = './data/'
csvs = [pos_csv for pos_csv in os.listdir(path_to_csv) if pos_csv.endswith('-full.csv')]
#csvs = ['askmen-full.csv']
csv_dict = dict()
for page in csvs:
    df = pd.read_csv(path_to_csv + page, low_memory=False)
    csv_dict[page] = df

In [7]:
# remove columns that aren't predictive
for key in csv_dict.keys():
    csv_dict[key].drop('post_title', axis=1, inplace=True)
    csv_dict[key].drop('post_gilded', axis=1, inplace=True)
    csv_dict[key].drop('post_gilded_silver', axis=1, inplace=True)
    csv_dict[key].drop('post_gilded_gold', axis=1, inplace=True)
    csv_dict[key].drop('post_gilded_platinum', axis=1, inplace=True)
    csv_dict[key].drop('post_likes', axis=1, inplace=True)
    csv_dict[key].drop('post_num_comments', axis=1, inplace=True)
    csv_dict[key].drop('post_num_crossposts', axis=1, inplace=True)
    csv_dict[key].drop('post_num_reports', axis=1, inplace=True)
    csv_dict[key].drop('post_ups', axis=1, inplace=True)
    csv_dict[key].drop('post_downs', axis=1, inplace=True)

In [8]:
#Remove columns that only have a singular value
for key in csv_dict.keys():
    unique = [c for c in csv_dict[key].columns if len(set(csv_dict[key][c])) == 1]
    print(key)
    print(unique)
    print()
    csv_dict[key].drop(unique, axis=1, inplace=True)

askmen-full.csv
['post_is_original_content', 'post_is_video', 'post_pinned', 'subreddit']

askwomen-full.csv
['post_is_original_content', 'post_is_video', 'post_pinned', 'subreddit']

aww-full.csv
['post_edited', 'post_over_18', 'post_pinned', 'subreddit', 'post_text_sentiment']

conspiracy-full.csv
['post_is_original_content', 'post_is_video', 'post_pinned', 'subreddit']

fitness-full.csv
['post_is_original_content', 'post_is_video', 'post_pinned', 'subreddit']

knitting-full.csv
['post_is_original_content', 'post_pinned', 'subreddit']



In [9]:
for key in csv_dict.keys():
    print('parsing', key)
    #Change post_distinguished to numerical values, only None and Moderator values
    if 'post_distinguished' in csv_dict[key]:
        csv_dict[key]['post_distinguished'] = csv_dict[key]['post_distinguished'].map({None: 0, 'moderator': 1})
        
    #Mark edited posts with a 1
    if 'post_edited' in csv_dict[key]:
        csv_dict[key]['post_edited'] = (csv_dict[key]['post_edited'] == 'False')*1
        
    #Make post_over_18 binary
    if 'post_over_18' in csv_dict[key]:
        csv_dict[key]['post_over_18'] = csv_dict[key]['post_over_18'].astype('category').cat.codes
        
    #csv_dict[key] = csv_dict[key].dropna(csv_dict[key].median())

print('done')

parsing askmen-full.csv
parsing askwomen-full.csv
parsing aww-full.csv
parsing conspiracy-full.csv
parsing fitness-full.csv
parsing knitting-full.csv
done


In [12]:
for key in csv_dict.keys():
    cutoff_score = csv_dict[key]['post_score'].loc[csv_dict[key]['is_original'] == 1].describe()[6]
    csv_dict[key].loc[csv_dict[key]['post_score'] >= cutoff_score, 'popular'] = 1
    csv_dict[key].loc[csv_dict[key]['post_score'] < cutoff_score, 'popular'] = 0
    csv_dict[key].drop('is_original', axis=1, inplace=True)

In [13]:
def prepare_data(data):
    
    print(data.groupby(by='popular')['post_title_words'].count())
    
    
    #remove cols
    if 'post_score' in data.columns:
        data.drop('post_score', axis=1, inplace=True)
    
    #split into test and training
    X = data.drop('popular', axis=1)
    y = data['popular']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    #split into validation and training
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
    
    #scale the data
    X_train = scaler.fit_transform(X_train)
    #X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    
    #return data
    #return X_train, X_val, X_test, y_train, y_val, y_test
    return X_train, X_test, y_train, y_test

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
gnb = GaussianNB()
knn = KNeighborsClassifier()
svc = SVC()

classifiers = [dtc, rfc, gnb, knn, svc]

In [16]:
names = ['Decision Tree', 'Random Forest', 'Gaussian Naive Bayes', 'K-Nearest Neighbors', 'Support Vector Classifer']
cols = ['# Instances', '# Features'] + names + ['Hard Voting', 'Soft Voting', 'Bagging']
val_results = pd.DataFrame(index=csv_dict.keys(), columns=cols)
val_results.head()

Unnamed: 0,# Instances,# Features,Decision Tree,Random Forest,Gaussian Naive Bayes,K-Nearest Neighbors,Support Vector Classifer,Hard Voting,Soft Voting,Bagging
askmen-full.csv,,,,,,,,,,
askwomen-full.csv,,,,,,,,,,
aww-full.csv,,,,,,,,,,
conspiracy-full.csv,,,,,,,,,,
fitness-full.csv,,,,,,,,,,


In [17]:
from sklearn.metrics import confusion_matrix
#accuracy_scores = []

for key in csv_dict.keys():
    #X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(csv_dict[key])
    X_train, X_test, y_train, y_test = prepare_data(csv_dict[key])
    print('Subreddit:', key)
    for name, algo in zip(names, classifiers):
        if len(set(y_test.values)) == 2:
            algo.fit(X_train, y_train)
            #pred = algo.predict(X_val)
            pred = algo.predict(X_test)

            val_results.loc[key]['# Instances'] = X_train.shape[0]
            val_results.loc[key]['# Features'] = X_train.shape[1]
            #val_results.loc[key][name] = round(accuracy_score(y_val, pred), 3)
            val_results.loc[key][name] = round(accuracy_score(y_test, pred), 3)
            
            
            print(name)
            tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
            print('tn, fp, fn, tp')
            print(tn, fp, fn, tp)
print('done')

popular
0.0    738
1.0    734
Name: post_title_words, dtype: int64
Subreddit: askmen-full.csv
Decision Tree
tn, fp, fn, tp
137 11 47 100
Random Forest
tn, fp, fn, tp
134 14 47 100
Gaussian Naive Bayes
tn, fp, fn, tp
114 34 59 88
K-Nearest Neighbors
tn, fp, fn, tp
148 0 141 6
Support Vector Classifer
tn, fp, fn, tp
148 0 140 7
popular
0.0    740
1.0    732
Name: post_title_words, dtype: int64
Subreddit: askwomen-full.csv
Decision Tree
tn, fp, fn, tp
125 23 44 103
Random Forest
tn, fp, fn, tp
147 1 58 89
Gaussian Naive Bayes
tn, fp, fn, tp
127 21 60 87
K-Nearest Neighbors
tn, fp, fn, tp
147 1 147 0
Support Vector Classifer
tn, fp, fn, tp
148 0 140 7
popular
0.0    737
1.0    731
Name: post_title_words, dtype: int64
Subreddit: aww-full.csv
Decision Tree
tn, fp, fn, tp
131 17 51 95
Random Forest
tn, fp, fn, tp
146 2 60 86
Gaussian Naive Bayes
tn, fp, fn, tp
141 7 64 82
K-Nearest Neighbors
tn, fp, fn, tp
146 2 89 57
Support Vector Classifer
tn, fp, fn, tp
148 0 145 1
popular
0.0    738
1.0 

In [18]:
val_results

Unnamed: 0,# Instances,# Features,Decision Tree,Random Forest,Gaussian Naive Bayes,K-Nearest Neighbors,Support Vector Classifer,Hard Voting,Soft Voting,Bagging
askmen-full.csv,1177,16915,0.803,0.793,0.685,0.522,0.525,,,
askwomen-full.csv,1177,15885,0.773,0.8,0.725,0.498,0.525,,,
aww-full.csv,1174,12381,0.769,0.789,0.759,0.69,0.507,,,
conspiracy-full.csv,1174,27790,0.735,0.796,0.755,0.503,0.503,,,
fitness-full.csv,1148,15508,0.826,0.84,0.819,0.497,0.517,,,
knitting-full.csv,1180,16302,0.818,0.804,0.801,0.541,0.534,,,


In [19]:
hard_voting_clf = VotingClassifier(
    estimators=[('rf', rfc), ('dt', dtc)],
    voting='hard',
    n_jobs=-1)

soft_voting_clf = VotingClassifier(
    estimators=[('rf', rfc), ('dt', dtc)],
    voting='soft',
    n_jobs=-1)

bag_clf = BaggingClassifier(
    rfc, n_estimators=100,
    max_samples=500, bootstrap=True, n_jobs=-1
)


for key in csv_dict.keys():
    X_train, X_test, y_train, y_test = prepare_data(csv_dict[key])
    hard_voting_clf.fit(X_train, y_train)
    soft_voting_clf.fit(X_train, y_train)
    bag_clf.fit(X_train, y_train)
    
    y_pred_hard = hard_voting_clf.predict(X_test) 
    y_pred_soft = soft_voting_clf.predict(X_test) 
    y_pred_bag = bag_clf.predict(X_test)
    
    val_results.loc[key]['Hard Voting'] = round(accuracy_score(y_test, y_pred_hard), 3)
    val_results.loc[key]['Soft Voting'] = round(accuracy_score(y_test, y_pred_soft), 3)
    val_results.loc[key]['Bagging'] = round(accuracy_score(y_test, y_pred_bag), 3)
    

popular
0.0    738
1.0    734
Name: post_title_words, dtype: int64


  if diff:
  if diff:


popular
0.0    740
1.0    732
Name: post_title_words, dtype: int64


  if diff:
  if diff:


popular
0.0    737
1.0    731
Name: post_title_words, dtype: int64


  if diff:
  if diff:


popular
0.0    738
1.0    730
Name: post_title_words, dtype: int64


  if diff:
  if diff:


popular
0.0    713
1.0    723
Name: post_title_words, dtype: int64


  if diff:
  if diff:


popular
0.0    744
1.0    732
Name: post_title_words, dtype: int64


  if diff:
  if diff:


In [20]:
val_results

Unnamed: 0,# Instances,# Features,Decision Tree,Random Forest,Gaussian Naive Bayes,K-Nearest Neighbors,Support Vector Classifer,Hard Voting,Soft Voting,Bagging
askmen-full.csv,1177,16915,0.803,0.793,0.685,0.522,0.525,0.817,0.8,0.803
askwomen-full.csv,1177,15885,0.773,0.8,0.725,0.498,0.525,0.807,0.793,0.803
aww-full.csv,1174,12381,0.769,0.789,0.759,0.69,0.507,0.793,0.769,0.793
conspiracy-full.csv,1174,27790,0.735,0.796,0.755,0.503,0.503,0.796,0.759,0.779
fitness-full.csv,1148,15508,0.826,0.84,0.819,0.497,0.517,0.84,0.826,0.844
knitting-full.csv,1180,16302,0.818,0.804,0.801,0.541,0.534,0.814,0.811,0.811


In [22]:
val_results[['# Instances', '# Features', 'Decision Tree', 'Random Forest', 'Hard Voting', 'Soft Voting', 'Bagging']]

Unnamed: 0,# Instances,# Features,Decision Tree,Random Forest,Hard Voting,Soft Voting,Bagging
askmen-full.csv,1177,16915,0.803,0.793,0.817,0.8,0.803
askwomen-full.csv,1177,15885,0.773,0.8,0.807,0.793,0.803
aww-full.csv,1174,12381,0.769,0.789,0.793,0.769,0.793
conspiracy-full.csv,1174,27790,0.735,0.796,0.796,0.759,0.779
fitness-full.csv,1148,15508,0.826,0.84,0.84,0.826,0.844
knitting-full.csv,1180,16302,0.818,0.804,0.814,0.811,0.811
