In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import re

In [18]:
#read dataset into dataframe
data = pd.read_json (r'pizza_request_dataset.json')

In [19]:
#separate x_data and y_data
x_data_df = data['request_text']
y_data = data['requester_received_pizza']

In [20]:
#read .dic file
f = open("../resources/MoralFoundations.dic", "r")
lines = f.read().split('\n')
f.close()

In [21]:
#make dictionary based on category
moral_dict = {}
for line in lines:
    elements = line.split()
    if len(elements):
        if elements[0].isnumeric() == False:
            word = elements[0]
            elements.remove(word)
            dimensions = elements
            
            if word[-1] == "*":
                word = word[:len(word)-1] + '.*'
            
            for i in range(len(dimensions)):
                if dimensions[i] not in moral_dict:
                    moral_dict[dimensions[i]] = [word]
                else:
                    moral_dict[dimensions[i]].append(word)

In [22]:
#make features
x_data = pd.DataFrame(np.full((x_data_df.shape[0], len(list(moral_dict.keys()))), np.nan),
                     columns = list(moral_dict.keys()))

In [23]:
for i in range(x_data.shape[0]):
    num_words = len(re.findall(r'\w+', x_data_df.iloc[i])) 
    
    for key,item in moral_dict.items():
        sum=0
        for word in item:
            sum += len(re.findall(word,x_data_df.iloc[i]))
        if num_words==0:
            x_data.iloc[i][key] = 0.0
        else:
            x_data.iloc[i][key] = sum/float(num_words)

In [24]:
#normalize
scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)

In [25]:
#divide into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size = 0.90, random_state=100)

In [26]:
#make a svm classifier and train it
model_svm = svm.SVC(kernel='linear')
model_svm.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [27]:
#make predictions
y_pred = model_svm.predict(x_test)

In [28]:
#metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.76      1.00      0.87       434
        True       0.00      0.00      0.00       134

    accuracy                           0.76       568
   macro avg       0.38      0.50      0.43       568
weighted avg       0.58      0.76      0.66       568



  'precision', 'predicted', average, warn_for)


In [30]:
roc_auc_score(y_test, y_pred)

0.5