In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [2]:
train2 = pd.read_csv("data/train2.csv")
test2 = pd.read_csv("data/test2.csv")
val2 = pd.read_csv("data/val2.csv")

In [3]:
train2["text"] = train2["statement"] + train2["justification"]
test2["text"] = test2["statement"] + test2["justification"]
val2["text"] = val2["statement"] + val2["justification"]

In [4]:
train2 = train2.dropna(subset=['text'])
test2 = test2.dropna(subset=['text'])
val2 = val2.dropna(subset=['text'])

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

countV = CountVectorizer()
tfidf_ngram = TfidfVectorizer(stop_words='english', ngram_range=(1,4), use_idf=True, smooth_idf=True)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

### Using CountVectorizer

In [7]:
nb_clf = Pipeline([('NBCV',countV), ('NB',MultinomialNB())])
nb_clf.fit(train2["text"], train2["label"])
predNB = nb_clf.predict(val2["text"])
print(accuracy_score(predNB, val2["label"]))
print(classification_report(predNB, val2["label"]))

0.247244094488189
              precision    recall  f1-score   support

           0       0.03      0.38      0.05         8
           1       0.28      0.25      0.26       281
           2       0.10      0.21      0.13       107
           3       0.44      0.23      0.30       462
           4       0.30      0.29      0.29       263
           5       0.21      0.23      0.22       149

    accuracy                           0.25      1270
   macro avg       0.22      0.27      0.21      1270
weighted avg       0.32      0.25      0.27      1270



In [8]:
logR_clf = Pipeline([('logRCV', countV), ('LogR', LogisticRegression())])
logR_clf.fit(train2["text"], train2["label"])
predlogR = logR_clf.predict(val2["text"])
print(accuracy_score(predlogR, val2["label"]))
print(classification_report(predlogR, val2["label"]))



0.20708661417322835
              precision    recall  f1-score   support

           0       0.14      0.16      0.15        99
           1       0.21      0.19      0.20       278
           2       0.14      0.22      0.17       146
           3       0.29      0.23      0.26       298
           4       0.21      0.25      0.23       211
           5       0.23      0.16      0.19       238

    accuracy                           0.21      1270
   macro avg       0.20      0.20      0.20      1270
weighted avg       0.22      0.21      0.21      1270





In [9]:
svm_clf = Pipeline([('SVMCV', countV), ('SVM', LinearSVC())])
svm_clf.fit(train2["text"], train2["label"])
predSVM = svm_clf.predict(val2["text"])
print(accuracy_score(predSVM, val2["label"]))
print(classification_report(predSVM, val2["label"]))

0.2047244094488189
              precision    recall  f1-score   support

           0       0.13      0.15      0.14        97
           1       0.18      0.18      0.18       257
           2       0.18      0.21      0.19       206
           3       0.24      0.22      0.23       258
           4       0.23      0.25      0.24       233
           5       0.24      0.19      0.21       219

    accuracy                           0.20      1270
   macro avg       0.20      0.20      0.20      1270
weighted avg       0.21      0.20      0.21      1270





In [10]:
sgd_clf = Pipeline([('SGDCV', countV), ('SGD', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3))])
sgd_clf.fit(train2["text"], train2["label"])
predSGD = sgd_clf.predict(val2["text"])
print(accuracy_score(predSGD, val2["label"]))
print(classification_report(predSGD, val2["label"]))

0.21181102362204723
              precision    recall  f1-score   support

           0       0.17      0.11      0.14       166
           1       0.20      0.23      0.21       226
           2       0.14      0.21      0.17       160
           3       0.33      0.23      0.27       358
           4       0.19      0.28      0.23       168
           5       0.22      0.19      0.21       192

    accuracy                           0.21      1270
   macro avg       0.21      0.21      0.20      1270
weighted avg       0.23      0.21      0.21      1270



In [11]:
rf_clf = Pipeline([('RFCV', countV), ('RF', RandomForestClassifier(n_estimators=150, n_jobs=-1, max_depth=8))])
rf_clf.fit(train2["text"], train2["label"])
predRF = rf_clf.predict(val2["text"])
print(accuracy_score(predRF, val2["label"]))
print(classification_report(predRF, val2["label"]))

0.1889763779527559
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.02      0.21      0.03        19
           2       0.00      0.00      0.00         0
           3       0.96      0.19      0.32      1239
           4       0.00      0.08      0.01        12
           5       0.00      0.00      0.00         0

    accuracy                           0.19      1270
   macro avg       0.16      0.08      0.06      1270
weighted avg       0.94      0.19      0.31      1270



  'recall', 'true', average, warn_for)


### Using N-grams

In [12]:
nb_clf_ngram = Pipeline([('NB_tfidf',tfidf_ngram), ('NB',MultinomialNB())])
nb_clf_ngram.fit(train2["text"], train2["label"])
predNB_ngram = nb_clf_ngram.predict(val2["text"])
print(accuracy_score(predNB_ngram, val2["label"]))
print(classification_report(predNB_ngram, val2["label"]))

0.22362204724409449
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.16      0.27      0.20       154
           2       0.00      0.00      0.00         1
           3       0.82      0.21      0.33       970
           4       0.16      0.29      0.21       140
           5       0.00      0.00      0.00         5

    accuracy                           0.22      1270
   macro avg       0.19      0.13      0.12      1270
weighted avg       0.66      0.22      0.30      1270



  'recall', 'true', average, warn_for)


In [13]:
logR_clf_ngram = Pipeline([('logR_tfidf', tfidf_ngram), ('LogR', LogisticRegression(penalty='l2', C=1))])
logR_clf_ngram.fit(train2["text"], train2["label"])
predlogR_ngram = logR_clf_ngram.predict(val2["text"])
print(accuracy_score(predlogR_ngram, val2["label"]))
print(classification_report(predlogR_ngram, val2["label"]))



0.25905511811023624
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.32      0.26      0.29       310
           2       0.05      0.28      0.08        39
           3       0.51      0.24      0.33       516
           4       0.37      0.28      0.32       332
           5       0.11      0.25      0.15        73

    accuracy                           0.26      1270
   macro avg       0.23      0.22      0.19      1270
weighted avg       0.39      0.26      0.30      1270



  'recall', 'true', average, warn_for)


In [14]:
svm_clf_ngram = Pipeline([('SVM_tfidf', tfidf_ngram), ('SVM', LinearSVC(dual=False))])
svm_clf_ngram.fit(train2["text"], train2["label"])
predSVM_ngram = svm_clf_ngram.predict(val2["text"])
print(accuracy_score(predSVM_ngram, val2["label"]))
print(classification_report(predSVM_ngram, val2["label"]))

0.2511811023622047
              precision    recall  f1-score   support

           0       0.06      0.35      0.10        20
           1       0.25      0.24      0.24       268
           2       0.14      0.22      0.17       150
           3       0.35      0.26      0.30       323
           4       0.37      0.27      0.31       337
           5       0.23      0.23      0.23       172

    accuracy                           0.25      1270
   macro avg       0.23      0.26      0.23      1270
weighted avg       0.29      0.25      0.26      1270



In [15]:
sgd_clf_ngram = Pipeline([('SGD_tfidf', tfidf_ngram), ('SGD', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3))])
sgd_clf_ngram.fit(train2["text"], train2["label"])
predSGD_ngram = sgd_clf_ngram.predict(val2["text"])
print(accuracy_score(predSGD_ngram, val2["label"]))
print(classification_report(predSGD_ngram, val2["label"]))

0.25196850393700787
              precision    recall  f1-score   support

           0       0.06      0.32      0.10        22
           1       0.24      0.24      0.24       261
           2       0.15      0.23      0.18       156
           3       0.37      0.26      0.30       347
           4       0.37      0.29      0.32       319
           5       0.20      0.21      0.20       165

    accuracy                           0.25      1270
   macro avg       0.23      0.26      0.23      1270
weighted avg       0.29      0.25      0.26      1270



In [16]:
rf_clf_ngram = Pipeline([('RF_tfidf', tfidf_ngram), ('RF', RandomForestClassifier(n_estimators=300, n_jobs=-1))])
rf_clf_ngram.fit(train2["text"], train2["label"])
predRF_ngram = rf_clf_ngram.predict(val2["text"])
print(accuracy_score(predRF_ngram, val2["label"]))
print(classification_report(predRF_ngram, val2["label"]))

0.27086614173228346
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.49      0.25      0.33       503
           2       0.02      0.23      0.04        22
           3       0.44      0.28      0.34       396
           4       0.35      0.30      0.32       285
           5       0.11      0.28      0.16        64

    accuracy                           0.27      1270
   macro avg       0.23      0.22      0.20      1270
weighted avg       0.42      0.27      0.32      1270



  'recall', 'true', average, warn_for)
