In [1]:
import sys,csv
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.pipeline import (make_pipeline, Pipeline)
from sklearn.metrics import make_scorer
from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV)
from sklearn.utils import resample
import pandas as pd
import itertools

In [2]:
# Label columns that are modelled one at a time further down the notebook.
test_variables = [
    'INTERACTIVITY_DUMMY',
    'INCIVILITY_DUMMY',
    'HATELIST_FOCUSED_DUMMY',
    'RATIONALITY_DUMMY',
    'HAS_OPINION_DUMMY',
    'LIBERAL_DUMMY',
    'CONSERVATIVE_DUMMY',
]

In [3]:
def down_sample_majority(df, majortopic):
    """Balance a binary-labelled frame by down-sampling its majority class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the label column.
    majortopic : str
        Name of the label column; rows are compared against the values 0 and 1.

    Returns
    -------
    pandas.DataFrame
        All minority-class rows followed by a random subset (without
        replacement, fixed seed 123) of the majority-class rows of the same
        size. Rows whose label is neither 0 nor 1 are not included.
    """
    # An empty frame has no majority class; the ratio below would divide by zero.
    if len(df) == 0:
        return df.copy()

    # Label 1 is the majority only when fewer than half of the rows are 0;
    # on an exact 50/50 split label 0 is treated as the majority.
    zero_ratio = len(df[df[majortopic] == 0]) / len(df)
    majority = int(zero_ratio < 0.5)
    minority = 1 - majority

    df_majority = df[df[majortopic] == majority]
    df_minority = df[df[majortopic] == minority]

    # Size the sample from the actual minority class. The previous code always
    # used the count of label 1, which left the data unbalanced whenever
    # label 1 was the majority.
    df_majority_downsampled = resample(df_majority,
                                       replace=False,
                                       n_samples=len(df_minority),
                                       random_state=123)

    return pd.concat([df_minority, df_majority_downsampled])

### Machine Learning

In [4]:
def combine_configuration():
    """Return every (vectorizer class, classifier instance) pair to evaluate.

    Vectorizers are returned as classes (not yet instantiated); classifiers are
    returned as instances, and the same instance is paired with each vectorizer.
    Pairs are ordered vectorizer-first, i.e. all classifiers for the first
    vectorizer, then all classifiers for the second.
    """
    vectorizer_classes = (CountVectorizer, TfidfVectorizer)
    classifier_instances = (
        MultinomialNB(),
        LogisticRegression(max_iter=1000),
        SVC(kernel='rbf', class_weight="balanced"),
        SVC(kernel='linear', class_weight="balanced"),
    )
    return [
        (vec_cls, clf)
        for vec_cls in vectorizer_classes
        for clf in classifier_instances
    ]

In [5]:
def machine_learning(train, test, labels):
    """Tune and evaluate every vectorizer/classifier pairing for one label.

    The training frame is balanced with ``down_sample_majority``; each pairing
    from ``combine_configuration`` is wrapped in a ``Pipeline``, tuned with a
    5-fold ``GridSearchCV`` scored on F1, and then evaluated once on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing a ``commentText`` column and the ``labels`` column.
    labels : str
        Name of the target column.

    Returns
    -------
    acc : pandas.DataFrame
        One row per configuration: the vectorizer class, the classifier
        instance, the best parameters, test-set F1/recall/precision/accuracy,
        the share of label 1 in the balanced training data, the manual test
        labels, the predictions, and per-fold score summaries.
    prediction
        Test-set predictions of the first configuration that reaches the
        highest test F1.
    """
    n_splits = 5

    df_downsampled = down_sample_majority(train, labels)
    train_labels = df_downsampled[labels]
    train_texts = df_downsampled['commentText']
    test_labels = test[labels]
    test_texts = test['commentText']

    # Identical for every configuration, so compute it once.
    ratio_resampled = Counter(train_labels)[1] / len(train_labels)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and row-wise growth is quadratic anyway.
    records = []
    for vectorizer, classifier in combine_configuration():
        pipeline = Pipeline(steps=[
            ("vectorizer", vectorizer()),
            ("classifier", classifier)])

        grid = {
            "vectorizer__ngram_range": [(1, 1), (1, 2)],
            "vectorizer__max_df": [0.5, 1.0],
            # An absolute count of 1 keeps every term, exactly like the former
            # value 0, but newer scikit-learn releases reject the integer 0.
            "vectorizer__min_df": [1, 5],
        }
        # Only tune C on classifiers that expose it (MultinomialNB does not).
        # This replaces a bare `except:` that retried on *any* failure and
        # therefore hid real errors.
        if "C" in classifier.get_params():
            grid["classifier__C"] = [0.01, 1, 100]

        search = GridSearchCV(estimator=pipeline, n_jobs=-1, param_grid=grid,
                              scoring='f1', cv=n_splits)
        search.fit(train_texts, train_labels)

        y_pred = search.predict(test_texts)
        record = {
            'Vectorizer': vectorizer,
            'Classifier': classifier,
            'Parameters': search.best_params_,
            'F1_score': metrics.f1_score(test_labels, y_pred),
            'Recall': metrics.recall_score(test_labels, y_pred),
            'Precision': metrics.precision_score(test_labels, y_pred),
            'Accuracy': metrics.accuracy_score(test_labels, y_pred),
            'Ratio_resampled': ratio_resampled,
            'Manual': test_labels,
            'Prediction': y_pred,
        }
        # Per-fold summaries are taken across all grid candidates of that fold
        # (not only the best candidate), as before.
        for fold in range(n_splits):
            key = f'split{fold}_test_score'
            record[key] = search.cv_results_[key].mean()
        for fold in range(n_splits):
            key = f'split{fold}_test_score'
            record[key + '_std'] = search.cv_results_[key].std()
        records.append(record)

    acc = pd.DataFrame(records)
    best_rows = acc[acc['F1_score'] == acc['F1_score'].max()]
    return acc, best_rows['Prediction'].iloc[0]

In [None]:
# Keep only the columns the models need. The explicit copy makes the frames
# independent of the raw CSV frame, since new columns are assigned to test_set below.
needed_columns = test_variables + ['ID'] + ['commentText']
train_set = pd.read_csv('data/train.csv')[needed_columns].copy()
test_set = pd.read_csv('data/test.csv')[needed_columns].copy()

# Column order of the exported table; any further columns follow these.
leading_columns = ['Variable', 'Vectorizer', 'Classifier', 'Parameters', 'F1_score', 'Recall', 'Precision',
                   'Accuracy', 'Ratio_test', 'Ratio_resampled', 'Manual', 'Prediction']

# Gather the per-variable result frames and concatenate once:
# DataFrame.append was removed in pandas 2.0.
results_per_variable = []
for v in test_variables:
    print(v)
    acc, prediction = machine_learning(train_set, test_set, v)
    test_set[v+'_ML'] = prediction

    acc['Variable'] = v
    acc['Ratio_test'] = test_set[v].mean()
    acc['Ratio_prediction'] = prediction.mean()
    results_per_variable.append(acc)

if results_per_variable:
    accuracy = pd.concat(results_per_variable, ignore_index=True)
    ordered = [c for c in leading_columns if c in accuracy.columns]
    accuracy = accuracy[ordered + [c for c in accuracy.columns if c not in ordered]]
else:
    accuracy = pd.DataFrame(columns=leading_columns)
accuracy.to_csv('outputs/evaluation/ML_accuracy.csv')

INTERACTIVITY_DUMMY
INCIVILITY_DUMMY
HATELIST_FOCUSED_DUMMY
RATIONALITY_DUMMY


In [None]:
# For every label keep the configuration(s) with the highest test F1 (ties are kept).
# Frames are collected and concatenated once: DataFrame.append was removed in pandas 2.0.
best_rows_per_variable = []
for v in test_variables:
    select_var = accuracy[accuracy['Variable'] == v]
    best_rows_per_variable.append(select_var[select_var['F1_score'] == select_var['F1_score'].max()])

if best_rows_per_variable:
    best_models = pd.concat(best_rows_per_variable, ignore_index=True)
else:
    best_models = pd.DataFrame()
best_models

In [None]:
# Write the test set (including the `<variable>_ML` columns added earlier) without the index.
test_set.to_csv(
    path_or_buf='outputs/automated_results/prediction_ML.csv',
    index=False,
)