In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [2]:
#read dataset into dataframe
data = pd.read_json (r'pizza_request_dataset.json')

In [3]:
#split dataset into train and test sets
x_data = data.drop(['requester_received_pizza'],axis=1)
y_data = data['requester_received_pizza']
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(x_data, y_data, train_size = 0.90, random_state = 100)

In [4]:
#complete vocabulary
vocab = x_train_df['request_text']
vocab = vocab.append(x_test_df['request_text'])

In [5]:
#find frequency of each unigram and bigram
unigrams = CountVectorizer(ngram_range=(1,1))
bigrams = CountVectorizer(ngram_range=(2,2))

uni_matrix = unigrams.fit_transform(vocab.values.astype('U'))
bi_matrix = bigrams.fit_transform(vocab.values.astype('U'))

In [6]:
uni_freq = sum(uni_matrix).toarray()[0]
uni_freq_df = pd.DataFrame(uni_freq, index=unigrams.get_feature_names(), columns=['frequency'])

bi_freq = sum(bi_matrix).toarray()[0]
bi_freq_df = pd.DataFrame(bi_freq, index=bigrams.get_feature_names(), columns=['frequency'])

In [7]:
#sort in descending order and select top 500
uni_freq_df = uni_freq_df.sort_values(['frequency'], ascending=[0])[:500]
bi_freq_df = bi_freq_df.sort_values(['frequency'], ascending=[0])[:500]

In [8]:
#make features
x_train = pd.DataFrame(np.full((x_train_df.shape[0], 1000), np.nan), 
                       columns = uni_freq_df.index.tolist() + bi_freq_df.index.tolist())
x_test = pd.DataFrame(np.full((x_test_df.shape[0], 1000), np.nan), 
                       columns = uni_freq_df.index.tolist() + bi_freq_df.index.tolist())

In [9]:
for i in range(x_train_df.shape[0]):
    for j in range(uni_freq_df.shape[0]):
        x_train.iloc[i][uni_freq_df.index[j]] = str(x_train_df.iloc[i]['request_text']).count(uni_freq_df.index[j])
        x_train.iloc[i][bi_freq_df.index[j]] = str(x_train_df.iloc[i]['request_text']).count(bi_freq_df.index[j])

In [10]:
for i in range(x_test_df.shape[0]):
    for j in range(uni_freq_df.shape[0]):
        x_test.iloc[i][uni_freq_df.index[j]] = str(x_test_df.iloc[i]['request_text']).count(uni_freq_df.index[j])
        x_test.iloc[i][bi_freq_df.index[j]] = str(x_test_df.iloc[i]['request_text']).count(bi_freq_df.index[j])

In [11]:
#make and fit SVM classifier
model_svm = svm.SVC(kernel='linear')
model_svm.fit(x_train, y_train_df)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [12]:
#make predictions
y_pred = model_svm.predict(x_test)

In [13]:
#metrics
print(classification_report(y_test_df, y_pred))

              precision    recall  f1-score   support

       False       0.80      0.88      0.83       434
        True       0.41      0.28      0.33       134

    accuracy                           0.73       568
   macro avg       0.60      0.58      0.58       568
weighted avg       0.70      0.73      0.72       568



In [14]:
roc_auc_score(y_test_df, y_pred)

0.575847719925717

In [18]:
f1_score(y_test_df, y_pred)

0.3288888888888889

In [20]:
precision_recall_fscore_support(y_test_df, y_pred, average='weighted')

(0.7046263226837237, 0.7341549295774648, 0.7150260165702103, None)