In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score
import pandas as pd
import numpy as np
import random

In [2]:
# Load the pre-split train / validation / test sets from pickle files.
# NOTE: unpickling can execute arbitrary code, so only point DATA_DIR at
# files from a trusted source.
DATA_DIR = '/tweet-fd'  # single place to change the dataset location
train_data = pd.read_pickle(DATA_DIR + '/train.p')
dev_data = pd.read_pickle(DATA_DIR + '/val.p')
test_data = pd.read_pickle(DATA_DIR + '/test.p')

In [3]:
# Train and evaluate the bag-of-words -> TF-IDF -> SVC pipeline once per seed,
# collecting test-set accuracy, F1 and confusion matrix for every run.
#
# NOTE(review): the recorded runs show identical metrics for all five seeds,
# so neither the seed nor the shuffle visibly affects this model. Presumably
# SVC's fit is deterministic here (its `random_state` is documented as only
# used for probability estimates) -- confirm before reporting seed variance.
test_acc_list, test_f1_list, test_cf_list = [], [], []
y_test = test_data['sentence_class']  # loop-invariant ground truth

for random_seed in range(100, 501, 100):
    # Seed both global RNGs so the shuffle below is reproducible per run.
    random.seed(random_seed)
    np.random.seed(random_seed)

    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(random_state=random_seed)),
    ])

    # Shuffle the training rows. `indices` holds index *labels*, so select
    # with `.loc` to make the label-based lookup explicit; the selection
    # already returns new objects, so no extra copy is needed.
    indices = train_data.index.tolist()
    np.random.shuffle(indices)
    new_x = train_data.loc[indices, 'tweet']
    new_y = train_data.loc[indices, 'sentence_class']

    text_clf.fit(new_x, new_y)
    predicted = text_clf.predict(test_data['tweet'])

    test_acc = accuracy_score(y_test, predicted)
    test_f1 = f1_score(y_test, predicted)
    test_cf = confusion_matrix(y_test, predicted)

    test_acc_list.append(test_acc)
    test_f1_list.append(test_f1)
    test_cf_list.append(test_cf)

    print('test accuracy:', test_acc)
    print('test f1:', test_f1)
    print('test confusion matrix', test_cf)

    # Free the per-run temporaries, but keep `text_clf`: a later cell calls
    # `text_clf.predict(...)`, which would raise NameError on a fresh
    # Restart & Run All if the pipeline were deleted here. After the loop it
    # refers to the model fitted with the last seed.
    del indices, new_x, new_y

test accuracy: 0.7672955974842768
test f1: 0.8477366255144032
test confusion matrix [[ 38  64]
 [ 10 206]]
test accuracy: 0.7672955974842768
test f1: 0.8477366255144032
test confusion matrix [[ 38  64]
 [ 10 206]]
test accuracy: 0.7672955974842768
test f1: 0.8477366255144032
test confusion matrix [[ 38  64]
 [ 10 206]]
test accuracy: 0.7672955974842768
test f1: 0.8477366255144032
test confusion matrix [[ 38  64]
 [ 10 206]]
test accuracy: 0.7672955974842768
test f1: 0.8477366255144032
test confusion matrix [[ 38  64]
 [ 10 206]]


In [7]:
# Sanity-check the classifier on a single hand-written tweet.
# Requires a fitted `text_clf` pipeline to be present in the kernel namespace.
sample_tweets = pd.Series(['I got sick after going to Taco Bell'])
text_clf.predict(sample_tweets)

array([1])