In [262]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [233]:
#read dataset into dataframe
data = pd.read_json (r'pizza_request_dataset.json')

In [234]:
#split dataset into train and test sets
x_data = data.drop(['requester_received_pizza'],axis=1)
y_data = data['requester_received_pizza']
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(x_data, y_data, train_size = 0.90)

In [235]:
x_test_df.shape

(568, 32)

In [236]:
x_train_df.shape

(5103, 32)

In [249]:
y_train_df.shape

(5103,)

In [250]:
y_test_df.shape

(568,)

AttributeError: 'Series' object has no attribute 'columns'

In [237]:
#complete vocabulary
vocab = x_train_df['request_text']
vocab = vocab.append(x_test_df['request_text'])
vocab.shape

(5671,)

In [238]:
#find frequency of each unigram and bigram
unigrams = CountVectorizer(ngram_range=(1,1))
bigrams = CountVectorizer(ngram_range=(2,2))

uni_matrix = unigrams.fit_transform(vocab.values.astype('U'))
bi_matrix = bigrams.fit_transform(vocab.values.astype('U'))

In [239]:
uni_freq = sum(uni_matrix).toarray()[0]
uni_freq_df = pd.DataFrame(uni_freq, index=unigrams.get_feature_names(), columns=['frequency'])

bi_freq = sum(bi_matrix).toarray()[0]
bi_freq_df = pd.DataFrame(bi_freq, index=bigrams.get_feature_names(), columns=['frequency'])

In [240]:
#sort in descending order and select top 500
uni_freq_df = uni_freq_df.sort_values(['frequency'], ascending=[0])[:500]
bi_freq_df = bi_freq_df.sort_values(['frequency'], ascending=[0])[:500]

In [241]:
type(uni_freq_df)

pandas.core.frame.DataFrame

In [242]:
#make features
x_train = pd.DataFrame(np.full((x_train_df.shape[0], 1000), np.nan), 
                       columns = uni_freq_df.index.tolist() + bi_freq_df.index.tolist())
x_test = pd.DataFrame(np.full((x_test_df.shape[0], 1000), np.nan), 
                       columns = uni_freq_df.index.tolist() + bi_freq_df.index.tolist())

In [243]:
x_test.shape

(568, 1000)

In [251]:
x_train.shape

(5103, 1000)

In [244]:
for i in range(x_train_df.shape[0]):
    for j in range(uni_freq_df.shape[0]):
        x_train.iloc[i][uni_freq_df.index[j]] = str(x_train_df.iloc[i]['request_text']).count(uni_freq_df.index[j])
        x_train.iloc[i][bi_freq_df.index[j]] = str(x_train_df.iloc[i]['request_text']).count(bi_freq_df.index[j])

In [245]:
for i in range(x_test_df.shape[0]):
    for j in range(uni_freq_df.shape[0]):
        x_test.iloc[i][uni_freq_df.index[j]] = str(x_test_df.iloc[i]['request_text']).count(uni_freq_df.index[j])
        x_test.iloc[i][bi_freq_df.index[j]] = str(x_test_df.iloc[i]['request_text']).count(bi_freq_df.index[j])

In [183]:
x_test

Unnamed: 0,to,and,the,my,for,of,in,pizza,it,have,...,any food,and get,and some,to move,the best,if there,or anything,even if,now have,pizza it
0,1.0,2.0,0.0,1.0,2.0,1.0,2.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,1.0,4.0,0.0,1.0,1.0,5.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,2.0,3.0,2.0,3.0,2.0,5.0,1.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,3.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,9.0,6.0,8.0,2.0,3.0,4.0,16.0,1.0,6.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,7.0,1.0,0.0,5.0,11.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,5.0,1.0,7.0,0.0,2.0,2.0,8.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [258]:
#make and fit SVM classifier
model_svm = svm.SVC(kernel='linear')
model_svm.fit(x_train, y_train_df)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [259]:
#make predictions
y_pred = model_svm.predict(x_test)
len(y_pred)

568

In [260]:
#metrics
print(classification_report(y_test_df, y_pred))

              precision    recall  f1-score   support

       False       0.77      0.86      0.81       429
        True       0.31      0.20      0.25       139

    accuracy                           0.70       568
   macro avg       0.54      0.53      0.53       568
weighted avg       0.66      0.70      0.67       568



In [261]:
y_pred

array([False, False, False, False, False, False, False, False,  True,
       False, False, False, False,  True, False, False,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False,  True, False, False, False,
       False, False, False, False,  True,  True, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
        True, False, False,  True, False, False, False, False, False,
       False,  True, False, False, False, False, False,  True, False,
       False, False,

In [263]:
roc_auc_score(y_test_df, y_pred)

0.5296238533648606

In [264]:
f1_score(y_test_df, y_pred)

0.2456140350877193