In [107]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix,f1_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from Utilities import preprocessing, plots

In [80]:
# Parse the SemEval-2018 Task 3 (subtask A) training file into two parallel
# sequences: the raw tweet texts and their labels.
tweets, labels = preprocessing.parse_dataset(
    'datasets/train/SemEval2018-T3-train-taskA.txt'
)

# Quick sanity check: preview the first five tweets and their labels.
print(tweets[:5], labels[:5], sep='\n')

['Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion  http://t.co/fej2v3OUBR', "@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing    ;)", 'Hey there! Nice to see you Minnesota/ND Winter Weather', "3 episodes left I'm dying over here", '"I can\'t breathe!" was chosen as the most notable quote of the year in an annual list released by a Yale University librarian']
[1, 1, 1, 0, 1]


In [81]:
# Baseline classifier: TF-IDF features fed into a logistic regression,
# both constructed with their default arguments.
baseline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('logreg', LogisticRegression()),
    ]
)

In [108]:
def show_scores(model, x=None, y=None, cv=3, n_jobs=2):
    """Print cross-validated classification metrics for ``model``.

    The model is cross-validated twice: once with ``cross_val_score`` to
    report the per-fold scores, and once with ``cross_val_predict`` to get
    out-of-fold predictions from which accuracy, precision, recall, F1 and
    the confusion matrix are computed.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible estimator or pipeline.
    x : sequence, optional
        Input samples. Defaults to the notebook-level ``tweets`` as it is
        bound at *call* time.
    y : sequence, optional
        Target labels. Defaults to the notebook-level ``labels`` as it is
        bound at *call* time.
    cv : int or cross-validation splitter, default 3
        Forwarded to both cross-validation helpers.
    n_jobs : int, default 2
        Number of parallel jobs, forwarded to both cross-validation helpers.

    Returns
    -------
    None
        All results are printed.
    """
    # Look up the default data when the function is called, not when it is
    # defined: a default of ``x=tweets`` is evaluated once at ``def`` time and
    # would keep pointing at the old object after the data cell is re-run.
    if x is None:
        x = tweets
    if y is None:
        y = labels

    print('Cross val score', cross_val_score(model, x, y, cv=cv, n_jobs=n_jobs))

    # Out-of-fold predictions: every sample is predicted by a model that did
    # not see it during fitting.
    predict = cross_val_predict(model, x, y, cv=cv, n_jobs=n_jobs)

    print('Accuracy', accuracy_score(y, predict))
    print('Precision', precision_score(y, predict))
    print('Recall', recall_score(y, predict))
    print('F1', f1_score(y, predict))
    print("Confusion Matrix:")
    print(confusion_matrix(y, predict))

In [109]:
# Report cross-validated metrics for the baseline pipeline on the default data.
show_scores(model=baseline)

Cross val score [ 0.6713615   0.63849765  0.64084507]
Accuracy 0.650234741784
Precision 0.651757188498
Recall 0.640502354788
F1 0.646080760095
Confusion Matrix:
[[1269  654]
 [ 687 1224]]
