# Improvements: Dictionaries as input for machine learning

In [1]:
import sys,csv
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.pipeline import (make_pipeline, Pipeline)
from sklearn.metrics import make_scorer
from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV)
from sklearn.utils import resample
import pandas as pd
import itertools

In [2]:
test_variables = ['INTERACTIVITY_DUMMY','INCIVILITY_DUMMY','HATELIST_FOCUSED_DUMMY',
                 'RATIONALITY_DUMMY','HAS_OPINION_DUMMY','LIBERAL_DUMMY','CONSERVATIVE_DUMMY']

In [3]:
dictionaries_variable = [[['INCIVILITY_DUMMY','HATELIST_FOCUSED_DUMMY'],
                          ['Dict_Hostility_Ksiazek_2015', 'Dict_Civility_Ksiazek_2015', 'Dict_GoogeProject_OffensiveWords', 
                         'Dict_Incivility_Muddiman', 'Dict_Swearwords_LIWC', 'Dict_HatebaseVocabEN']], 
                         [['HAS_OPINION_DUMMY','LIBERAL_DUMMY','CONSERVATIVE_DUMMY'],
                          ['MFD1_conservative_ratio', 'MFD1_liberal_ratio', 'MFD2_conservative_ratio', 'MFD2_liberal_ratio']]
                        ]

In [4]:
def down_sample_majority(df, majortopic):
        majority = int(len(df[df[majortopic]==0])/len(df)<0.5) # when the ratio of label=0 < .5, majority = 1, else majority = 0
        monority = 1 - majority # if majority = 1 then minority = 0, and vice versa
        df_majority = df[df[majortopic]==majority]
        df_minority = df[df[majortopic]==monority]
        df_majority_downsampled = resample(df_majority,
                                         replace=False,     #
                                         n_samples=len(df[df[majortopic]==1])) # set to N of minority topic
                                        # random_state=123) #

        df_downsampled = pd.concat([df_minority, df_majority_downsampled]).sample(frac=1)
        print(len(df_minority))
        return df_downsampled

In [5]:
def machine_learning(train, test, labels, dictionaries):
    acc = pd.DataFrame(columns = ['Classifier','Parameters', 'F1_score','Recall','Precision','Accuracy','Ratio_resampled'])

    df_downsampled = down_sample_majority(train, labels)
    train_labels = df_downsampled[labels]
    train_data = df_downsampled[dictionaries]
    test_labels = test[labels]
    test_data = test[dictionaries]
#    train_texts, test_texts, train_labels, test_labels = train_test_split(df_downsampled['commentText'].to_list(), df_downsampled[labels].to_list(), test_size=0.2, random_state=42)
#    print(f'after undersampling:\ntrain: {len(train_labels)}, test: {len(test_labels)}')
#    print(Counter(train_labels))
    grid = {"classifier__C": [0.01, 1, 100]}
    Classifiers = [MultinomialNB(), LogisticRegression(max_iter=1000),
                   SVC(kernel='rbf', class_weight="balanced"), SVC(kernel='linear', class_weight="balanced")
                  ]
    
    for classifier in Classifiers:
        pipeline = Pipeline(steps = [("classifier", classifier)])
        try:
            search=GridSearchCV(estimator=pipeline, n_jobs=-1, param_grid=grid, scoring='f1', cv=5)
            search.fit(train_data, train_labels)
        except:
            grid = {}
            search=GridSearchCV(estimator=pipeline, n_jobs=-1, param_grid=grid, scoring='f1', cv=5)
            search.fit(train_data, train_labels)
        #print(search.cv_results_['split1_test_score'])
        y_pred = search.predict(test_data)
        acc = acc.append({'Classifier':classifier,
                          'F1_score':metrics.f1_score(test_labels, y_pred),'Recall':metrics.recall_score(test_labels, y_pred),
                          'Precision':metrics.precision_score(test_labels, y_pred),'Accuracy':metrics.accuracy_score(test_labels, y_pred),
                          'Ratio_resampled':Counter(train_labels)[1]/(len(train_labels)),'Manual':test_labels,'Prediction':y_pred},ignore_index=True)
    best_classifier = acc[acc['F1_score'] == acc['F1_score'].max()].reset_index()
    #print('algorithm with maximum F1_score:', best_classifier)
    return acc, best_classifier['Prediction'][0]

In [6]:
dictionaries = pd.read_csv('outputs/automated_results/incivility&diversity.csv').drop(columns = 'commentText')

train_set = pd.read_csv('data/train.csv')[test_variables+['ID']+['commentText']]
train_set = train_set.merge(dictionaries, how = 'left', on = 'ID')

test_set = pd.read_csv('data/test.csv')[test_variables+['ID']+['commentText']]  
test_set = test_set.merge(dictionaries, how = 'left', on = 'ID')
train_set

Unnamed: 0,INTERACTIVITY_DUMMY,INCIVILITY_DUMMY,HATELIST_FOCUSED_DUMMY,RATIONALITY_DUMMY,HAS_OPINION_DUMMY,LIBERAL_DUMMY,CONSERVATIVE_DUMMY,ID,commentText,Dict_Hostility_Ksiazek_2015,...,Dict_Swearwords_LIWC,Dict_HatebaseVocabEN,MFD1_conservative,MFD1_liberal,MFD2_conservative,MFD2_liberal,MFD1_conservative_ratio,MFD1_liberal_ratio,MFD2_conservative_ratio,MFD2_liberal_ratio
0,0,0,0,0,0,0,0,UgyPHwv8G0cDE6-wEgl4AaABAg.8_0ZjJKSJty8_0kXGkAd2U,sad,0,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
1,0,0,0,0,0,0,0,1110578710648890000,@colbertlateshow The question has always been ...,0,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
2,1,0,0,0,0,0,0,UgwWKCWtSJdFvjGHvTp4AaABAg.8kUC5dGrQ2H8kUDRihE2f3,hello hello \nNo-one else will hug him.,0,...,0,0,0,0,1,1,0.000000,0.000000,0.142857,0.142857
3,0,1,1,0,1,0,1,Ugw2eTvkZLfH9MDVg1R4AaABAg,"Please spare us you BS, Pocahantas! You will n...",0,...,1,0,0,0,1,0,0.000000,0.000000,0.157895,0.000000
4,1,1,1,0,1,0,0,Ugi7W7rSRtQSQngCoAEC.8SSDZOcrB8y8SSIYSKC0FI,william ...how 'bout if he just shoots himself...,0,...,0,0,0,1,0,1,0.000000,0.090909,0.000000,0.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3084,0,1,1,0,1,0,0,1167577566015890000,"@CBSEveningNews @CBSNews @MajorCBS Really, not...",0,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
3085,0,0,0,0,1,0,0,1152219467579100000,@FullFrontalSamB They can’t afford chemical pe...,0,...,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
3086,1,0,0,0,1,0,0,UghFY3QJ6nmT_ngCoAEC.7-H0Z7--wxd8goqpaPs-bl,"Nah, they knew all about the cameras. I'm gue...",0,...,0,1,0,0,0,0,0.000000,0.000000,0.000000,0.000000
3087,0,0,0,0,0,0,0,UgyWabsmmnq3zam4DgZ4AaABAg,Alexander Hamilton. Troops are waiting in the ...,0,...,0,0,1,1,1,1,0.021739,0.043478,0.108696,0.043478


In [7]:
accuracy = pd.DataFrame(columns = ['Variable', 'Classifier', 'F1_score','Recall','Precision','Accuracy','Ratio_test','Ratio_resampled','Manual','Prediction'])

for test_v, dictionaries in dictionaries_variable:
    for v in test_v:
        print(v)
        acc,prediction = machine_learning(train_set, test_set, v, dictionaries)
        test_set[v+'_ML'] = prediction

        acc['Variable'] = v
        acc['Ratio_test'] = test_set[v].mean()
        acc['Ratio_prediction'] = prediction.mean()
        accuracy = accuracy.append(acc,ignore_index=True)


accuracy

INCIVILITY_DUMMY
1522
HATELIST_FOCUSED_DUMMY
698
HAS_OPINION_DUMMY
1505
LIBERAL_DUMMY
649
CONSERVATIVE_DUMMY
478


Unnamed: 0,Variable,Classifier,F1_score,Recall,Precision,Accuracy,Ratio_test,Ratio_resampled,Manual,Prediction,Parameters,Ratio_prediction
0,INCIVILITY_DUMMY,MultinomialNB(),0.524771,0.391781,0.794444,0.664942,0.472186,0.5,0 1 1 0 2 1 3 1 4 0  ...,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",,0.344114
1,INCIVILITY_DUMMY,LogisticRegression(max_iter=1000),0.625806,0.531507,0.760784,0.699871,0.472186,0.5,0 1 1 0 2 1 3 1 4 0  ...,"[1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, ...",,0.344114
2,INCIVILITY_DUMMY,SVC(class_weight='balanced'),0.640254,0.553425,0.759398,0.706339,0.472186,0.5,0 1 1 0 2 1 3 1 4 0  ...,"[1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, ...",,0.344114
3,INCIVILITY_DUMMY,"SVC(class_weight='balanced', kernel='linear')",0.625806,0.531507,0.760784,0.699871,0.472186,0.5,0 1 1 0 2 1 3 1 4 0  ...,"[1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, ...",,0.344114
4,HATELIST_FOCUSED_DUMMY,MultinomialNB(),0.550633,0.54375,0.557692,0.8163,0.206986,0.5,0 0 1 0 2 1 3 1 4 0  ...,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",,0.201811
5,HATELIST_FOCUSED_DUMMY,LogisticRegression(max_iter=1000),0.545455,0.54375,0.54717,0.812419,0.206986,0.5,0 0 1 0 2 1 3 1 4 0  ...,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",,0.201811
6,HATELIST_FOCUSED_DUMMY,SVC(class_weight='balanced'),0.53303,0.73125,0.419355,0.734799,0.206986,0.5,0 0 1 0 2 1 3 1 4 0  ...,"[1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, ...",,0.201811
7,HATELIST_FOCUSED_DUMMY,"SVC(class_weight='balanced', kernel='linear')",0.508642,0.64375,0.420408,0.742561,0.206986,0.5,0 0 1 0 2 1 3 1 4 0  ...,"[1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, ...",,0.201811
8,HAS_OPINION_DUMMY,MultinomialNB(),0.65022,0.994609,0.482984,0.486417,0.479948,0.512787,0 0 1 0 2 1 3 1 4 1  ...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",,0.988357
9,HAS_OPINION_DUMMY,LogisticRegression(max_iter=1000),0.619608,0.638814,0.601523,0.623545,0.479948,0.512787,0 0 1 0 2 1 3 1 4 1  ...,"[1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, ...",,0.988357


In [11]:
best_models = pd.DataFrame()
for v in test_variables:
    select_var = accuracy[accuracy['Variable'] == v]
    best_models = best_models.append(select_var[select_var['F1_score'] == select_var['F1_score'].max()],ignore_index=True)
best_models

Unnamed: 0,Variable,Classifier,F1_score,Recall,Precision,Accuracy,Ratio_test,Ratio_resampled,Manual,Prediction,Parameters,Ratio_prediction
0,INCIVILITY_DUMMY,SVC(class_weight='balanced'),0.640254,0.553425,0.759398,0.706339,0.472186,0.5,0 1 1 0 2 1 3 1 4 0  ...,"[1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, ...",,0.344114
1,HATELIST_FOCUSED_DUMMY,MultinomialNB(),0.550633,0.54375,0.557692,0.8163,0.206986,0.5,0 0 1 0 2 1 3 1 4 0  ...,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",,0.201811
2,HAS_OPINION_DUMMY,MultinomialNB(),0.65022,0.994609,0.482984,0.486417,0.479948,0.512787,0 0 1 0 2 1 3 1 4 1  ...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",,0.988357
3,LIBERAL_DUMMY,"SVC(class_weight='balanced', kernel='linear')",0.30837,1.0,0.182292,0.187581,0.181113,0.5,0 0 1 0 2 1 3 1 4 0  ...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",,0.993532
4,CONSERVATIVE_DUMMY,SVC(class_weight='balanced'),0.268235,0.504425,0.182692,0.597671,0.146184,0.5,0 0 1 0 2 0 3 0 4 0  ...,"[1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, ...",,0.403622


## Update PRF1

In [8]:
from sklearn.metrics import classification_report

In [9]:
old = pd.read_csv('outputs/evaluation/PRF1_test.csv')

In [12]:
for index,data in best_models.iterrows():
    if data['Variable'] == 'HAS_OPINION_DUMMY':
        continue
    data['Manual'] = data['Manual'].astype(int)
    classification = classification_report(data['Manual'],data['Prediction'],output_dict=True)
    for c in ['macro avg','1','0']:
        old = old.append({'Dimension':'Incivility', 'Label':data['Variable'], 'Measures':'dict_as_input_for_ML','Class':c,'Precision':classification[c]['precision'],
                         'Recall':classification[c]['recall'],'F1-score':classification[c]['f1-score'],'support':classification[c]['support'],'Accuracy_overall':classification['accuracy']},ignore_index=True)

    print(data[['Variable','Classifier','Ratio_test']],'\n')


Variable                  INCIVILITY_DUMMY
Classifier    SVC(class_weight='balanced')
Ratio_test                        0.472186
Name: 0, dtype: object 

Variable      HATELIST_FOCUSED_DUMMY
Classifier           MultinomialNB()
Ratio_test                  0.206986
Name: 1, dtype: object 

Variable                                      LIBERAL_DUMMY
Classifier    SVC(class_weight='balanced', kernel='linear')
Ratio_test                                         0.181113
Name: 3, dtype: object 

Variable                CONSERVATIVE_DUMMY
Classifier    SVC(class_weight='balanced')
Ratio_test                        0.146184
Name: 4, dtype: object 



In [13]:
old.to_csv('outputs/evaluation/PRF1_test.csv',index=False)