In [1]:
import sys
sys.path.append("./data/")
from read_dataset import read_dataset
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pickle as pkl
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

In [2]:
train_set = pkl.load(open("train_set.pkl", "rb"))
test_set = pkl.load(open('test_set.pkl', "rb"))
num_train = len(train_set)

In [3]:
y_train = [1 if post['requester_received_pizza'] else 0 for post in train_set]
y_test = [1 if post['requester_received_pizza'] else 0 for post in test_set]

In [4]:
corpus = []
for i in range(len(train_set)):
    corpus.append(train_set[i]['request_text'])
for i in range(len(test_set)):
    corpus.append(test_set[i]['request_text'])

In [5]:
uni_vec = CountVectorizer(ngram_range=(1, 1))
bi_vec = CountVectorizer(ngram_range=(2,2))

In [6]:
X1 = uni_vec.fit_transform(corpus).toarray()
X2 = bi_vec.fit_transform(corpus).toarray()

In [7]:
uni_feat_names = uni_vec.get_feature_names()
bi_feat_names = bi_vec.get_feature_names() 

In [8]:
def get_names(names):
    ret = []
    for idx in ind:
        ret.append(names[idx])
    return ret

In [9]:
def get_top_features(X, num_train):
    X_train = X[:num_train]
    X_test = X[num_train:]
    freq = X_train.sum(axis=0)
    top_ind = np.argsort(freq)[::-1][:500]
    X_train = X_train[:,top_ind]
    X_test = X_test[:,top_ind]
    return X_train, X_test, top_ind

In [10]:
X1_train, X1_test, _ = get_top_features(X1, num_train)
X2_train, X2_test, _ = get_top_features(X2, num_train)
X_train = np.hstack((X1_train, X2_train))
X_test = np.hstack((X1_test, X2_test))
X_train= preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)

In [11]:
X_train.shape, X_test.shape

((5104, 1000), (567, 1000))

In [12]:
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [13]:
y_pred = clf.predict(X_test)

In [14]:
clf.score(X_test, y_test)

0.7354497354497355

In [15]:
accuracy_score(y_test, y_pred)

0.7354497354497355

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85       417
           1       0.00      0.00      0.00       150

    accuracy                           0.74       567
   macro avg       0.37      0.50      0.42       567
weighted avg       0.54      0.74      0.62       567



  'precision', 'predicted', average, warn_for)


In [17]:
precision_recall_fscore_support(y_test, y_pred, average='weighted')

(0.5408863133730859, 0.7354497354497355, 0.6233384952897149, None)

In [18]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,