In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (presumably where the
# project-local `Utilities` package lives -- confirm against the repo layout).
# The membership check keeps re-running this cell from adding duplicates.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [15]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from Utilities import plots
from Utilities.preprocessing import parse_dataset

In [3]:
# Load the training split via the project's parser and preview the first five
# entries of each returned sequence.
train_path = 'datasets/train/SemEval2018-T3-train-taskA.txt'
tweets, labels = parse_dataset(train_path)
for preview in (tweets[:5], labels[:5]):
    print(preview)

['Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion  http://t.co/fej2v3OUBR', "@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing    ;)", 'Hey there! Nice to see you Minnesota/ND Winter Weather', "3 episodes left I'm dying over here", '"I can\'t breathe!" was chosen as the most notable quote of the year in an annual list released by a Yale University librarian']
[1, 1, 1, 0, 1]


In [30]:
# Baseline model: TF-IDF features followed by logistic regression with C=10.
baseline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(C=10)),
]
baseline = Pipeline(steps=baseline_steps)

In [31]:
def show_scores(model, x=None, y=None, cv=5):
    """Cross-validate `model` and print a summary of its performance.

    Prints, in order: the per-fold scores from `cross_val_score` together with
    their mean, a `classification_report`, and a `confusion_matrix`; the last
    two are computed from out-of-fold predictions (`cross_val_predict`).

    Parameters
    ----------
    model : estimator
        Any object accepted by `cross_val_score` / `cross_val_predict`
        (e.g. a `Pipeline`).
    x : optional
        Input samples. Defaults to the notebook-level `tweets`.
    y : optional
        Target labels. Defaults to the notebook-level `labels`.
    cv : optional
        Forwarded unchanged as the `cv` argument of both cross-validation
        calls. Defaults to 5.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Resolve the defaults at call time rather than in the signature: a
    # signature default is evaluated once when the `def` cell runs, so
    # re-running the data-loading cell would otherwise leave this function
    # silently scoring the old data.
    if x is None:
        x = tweets
    if y is None:
        y = labels

    cv_score = cross_val_score(model, x, y, cv=cv, n_jobs=-1)
    print('Cross val score', cv_score, cv_score.mean())

    # NOTE: this refits the model on every fold a second time; the scores above
    # and the predictions below come from two independent CV runs.
    predict = cross_val_predict(model, x, y, cv=cv, n_jobs=-1)
    print(classification_report(y_true=y, y_pred=predict))
    print("Confusion Matrix:")
    print(confusion_matrix(y, predict))

In [32]:
# Score the TF-IDF + logistic regression baseline on the default data.
show_scores(model=baseline)

Cross val score [ 0.66276042  0.65449804  0.62581486  0.67885117  0.61227154] 0.646839207901
             precision    recall  f1-score   support

          0       0.64      0.66      0.65      1923
          1       0.65      0.63      0.64      1911

avg / total       0.65      0.65      0.65      3834

Confusion Matrix:
[[1272  651]
 [ 703 1208]]


In [33]:
from sklearn.svm import LinearSVC

# Alternative model: raw token counts fed to a linear SVM with C=0.01.
show_scores(Pipeline([
    ('count', CountVectorizer()),
    # Step is named after the estimator it actually wraps. It was previously
    # labelled 'logreg' (copied from the baseline), which would yield misleading
    # parameter names such as `logreg__C` in any later parameter search.
    ('svc', LinearSVC(C=0.01))
]))

Cross val score [ 0.6640625   0.65189048  0.60886571  0.66971279  0.61096606] 0.641099508827
             precision    recall  f1-score   support

          0       0.65      0.61      0.63      1923
          1       0.63      0.67      0.65      1911

avg / total       0.64      0.64      0.64      3834

Confusion Matrix:
[[1179  744]
 [ 632 1279]]
