In [94]:
import pandas as pd
import numpy as np

from nltk.tokenize import TweetTokenizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [65]:
def tokenize(text):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(text)

In [68]:
NAACL_DATA = 'training_data/NAACL_revised.csv'
training_data = pd.read_csv(NAACL_DATA, usecols=['label', 'text'])

tweets = list(training_data.text)
labels = np.array(training_data.label)

X_train, X_test, y_train, y_test = train_test_split(tweets, labels)

vectorizer = TfidfVectorizer(tokenizer=tokenize)

In [70]:
svm_clf = Pipeline([
    ('tfidf', vectorizer),
    ('svm', SVC(gamma='scale', class_weight='balanced'))
])

%time svm_clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, svm_clf.predict(X_test)))

Wall time: 20.2 s
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      2060
           1       0.89      0.35      0.50       738

   micro avg       0.82      0.82      0.82      2798
   macro avg       0.85      0.67      0.70      2798
weighted avg       0.83      0.82      0.79      2798



In [69]:
bayes_clf = Pipeline([
    ('tfidf', vectorizer),
    ('naive_bayes', ComplementNB(alpha=0.3))
])

%time bayes_clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, bayes_clf.predict(X_test)))

Wall time: 795 ms
              precision    recall  f1-score   support

           0       0.91      0.90      0.91      2060
           1       0.73      0.75      0.74       738

   micro avg       0.86      0.86      0.86      2798
   macro avg       0.82      0.83      0.82      2798
weighted avg       0.86      0.86      0.86      2798



In [80]:
adaboost_clf = Pipeline([
    ('tfidf', vectorizer),
    ('adaboost', AdaBoostClassifier())
])

%time adaboost_clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, adaboost_clf.predict(X_test)))

Wall time: 2.15 s
              precision    recall  f1-score   support

           0       0.86      0.96      0.91      2060
           1       0.84      0.55      0.67       738

   micro avg       0.85      0.85      0.85      2798
   macro avg       0.85      0.76      0.79      2798
weighted avg       0.85      0.85      0.84      2798



In [83]:
rforest_clf = Pipeline([
    ('tfidf', vectorizer),
    ('random_forest', RandomForestClassifier(n_estimators=10))
])

%time rforest_clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, rforest_clf.predict(X_test)))

Wall time: 3.13 s
              precision    recall  f1-score   support

           0       0.84      0.98      0.91      2060
           1       0.92      0.50      0.64       738

   micro avg       0.86      0.86      0.86      2798
   macro avg       0.88      0.74      0.78      2798
weighted avg       0.86      0.86      0.84      2798



In [100]:
voting_clf = Pipeline([
    ('tfidf', vectorizer),
    ('clf', VotingClassifier(
        estimators=[
            ('naive_bayes', ComplementNB(alpha=0.3)),
            ('random_forest', RandomForestClassifier(n_estimators=10))
        ],
        voting='soft',
        weights=[2,1]
    ))
])

%time voting_clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, voting_clf.predict(X_test)))

Wall time: 1.14 s
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      2060
           1       0.82      0.69      0.75       738

   micro avg       0.88      0.88      0.88      2798
   macro avg       0.86      0.82      0.84      2798
weighted avg       0.88      0.88      0.88      2798



In [106]:
#Found these (hopefully) optimized parameters with a GridSearchCV
voting_clf = Pipeline([
    ('tfidf', vectorizer),
    ('clf', VotingClassifier(
        estimators=[
            ('naive_bayes', ComplementNB(alpha=0.1)),
            ('random_forest', RandomForestClassifier(n_estimators=50))
        ],
        voting='soft',
        weights=[1,1]
    ))
])

%time voting_clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, voting_clf.predict(X_test)))

Wall time: 3.44 s
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      2060
           1       0.84      0.70      0.76       738

   micro avg       0.89      0.89      0.89      2798
   macro avg       0.87      0.83      0.84      2798
weighted avg       0.88      0.89      0.88      2798

