In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import re

In [16]:
#read dataset into dataframe
data = pd.read_json (r'pizza_request_dataset.json')

In [17]:
#separate x_data and y_data
x_data_df = data['request_text']
y_data = data['requester_received_pizza']

In [18]:
#read the narratives
f = open('../resources/narratives/desire.txt')
desire = f.read().split('\n')
f.close()

f = open('../resources/narratives/family.txt')
family = f.read().split('\n')
f.close()

f = open('../resources/narratives/job.txt')
job = f.read().split('\n')
f.close()

f = open('../resources/narratives/money.txt')
money = f.read().split('\n')
f.close()

f = open('../resources/narratives/student.txt')
student = f.read().split('\n')
f.close()

In [19]:
#make features
x_data = pd.DataFrame(np.full((x_data_df.shape[0], 5), np.nan),
                     columns = ['desire', 'family', 'job', 'money', 'student'])

In [20]:
for i in range(x_data.shape[0]):
    num_words = len(re.findall(r'\w+', x_data_df.iloc[i])) 
    sum=0
    for word in desire:
        sum += len(re.findall(word,x_data_df.iloc[i]))
    if num_words==0:
        x_data.iloc[i][0] = 0.0
    else:
        x_data.iloc[i][0] = sum/float(num_words)
    
    sum=0
    for word in family:
        sum += len(re.findall(word,x_data_df.iloc[i]))
    if num_words==0:
        x_data.iloc[i][1] = 0.0
    else:
        x_data.iloc[i][1] = sum/float(num_words)
        
    sum=0
    for word in job:
        sum += len(re.findall(word,x_data_df.iloc[i]))
    if num_words==0:
        x_data.iloc[i][2] = 0.0
    else:
        x_data.iloc[i][2] = sum/float(num_words)
        
    sum=0
    for word in money:
        sum += len(re.findall(word,x_data_df.iloc[i]))
    if num_words==0:
        x_data.iloc[i][3] = 0.0
    else:
        x_data.iloc[i][3] = sum/float(num_words)
        
    sum=0
    for word in student:
        sum += len(re.findall(word,x_data_df.iloc[i]))
    if num_words==0:
        x_data.iloc[i][4] = 0.0
    else:
        x_data.iloc[i][4] = sum/float(num_words)

In [21]:
#normalize
scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)

In [22]:
#divide into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size = 0.90, random_state=100)

In [23]:
#make a svm classifier and train it
model_svm = svm.SVC(kernel='linear')
model_svm.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [24]:
#make predictions
y_pred = model_svm.predict(x_test)

In [25]:
#metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.76      1.00      0.87       434
        True       0.00      0.00      0.00       134

    accuracy                           0.76       568
   macro avg       0.38      0.50      0.43       568
weighted avg       0.58      0.76      0.66       568



  'precision', 'predicted', average, warn_for)


In [26]:
precision_recall_fscore_support(y_test, y_pred, average='weighted')

(0.5838251339020036, 0.7640845070422535, 0.6619015490146468, None)

In [27]:
roc_auc_score(y_test, y_pred)

0.5

In [28]:
f1_score(y_test, y_pred)

  'precision', 'predicted', average, warn_for)


0.0