In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [2]:
train2 = pd.read_csv("data/train2.csv")
test2 = pd.read_csv("data/test2.csv")
val2 = pd.read_csv("data/val2.csv")

In [3]:
train2["text"] = train2["statement"] + train2["justification"]
test2["text"] = test2["statement"] + test2["justification"]
val2["text"] = val2["statement"] + val2["justification"]

In [4]:
train2 = train2.dropna(subset=['text'])
test2 = test2.dropna(subset=['text'])
val2 = val2.dropna(subset=['text'])

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

countV = CountVectorizer()
tfidf_ngram = TfidfVectorizer(stop_words='english', ngram_range=(1,4), use_idf=True, smooth_idf=True)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

### Using CountVectorizer

In [7]:
nb_clf = Pipeline([('NBCV',countV), ('NB',MultinomialNB())])
nb_clf.fit(train2["text"], train2["label"])
predNB = nb_clf.predict(val2["text"])
np.mean(predNB == val2["label"])

0.247244094488189

In [8]:
logR_clf = Pipeline([('logRCV', countV), ('LogR', LogisticRegression())])
logR_clf.fit(train2["text"], train2["label"])
predlogR = logR_clf.predict(val2["text"])
np.mean(predlogR == val2["label"])



0.20708661417322835

In [9]:
svm_clf = Pipeline([('SVMCV', countV), ('SVM', LinearSVC())])
svm_clf.fit(train2["text"], train2["label"])
predSVM = svm_clf.predict(val2["text"])
np.mean(predSVM == val2["label"])



0.2047244094488189

In [10]:
sgd_clf = Pipeline([('SGDCV', countV), ('SGD', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3))])
sgd_clf.fit(train2["text"], train2["label"])
predSGD = sgd_clf.predict(val2["text"])
np.mean(predSGD == val2["label"])

0.21338582677165355

In [11]:
rf_clf = Pipeline([('RFCV', countV), ('RF', RandomForestClassifier(n_estimators=150, n_jobs=-1, max_depth=8))])
rf_clf.fit(train2["text"], train2["label"])
predRF = rf_clf.predict(val2["text"])
np.mean(predRF == val2["label"])

0.18740157480314962