In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [33]:
#read dataset into dataframe
data = pd.read_json (r'pizza_request_dataset.json')

In [34]:
#separate x_data and y_data
x_data = data.drop(['requester_received_pizza'],axis=1)
y_data = data['requester_received_pizza']

In [35]:
#make features
x_data_df = x_data[['post_was_edited','requester_account_age_in_days_at_request','requester_account_age_in_days_at_retrieval',
                'requester_days_since_first_post_on_raop_at_request','requester_days_since_first_post_on_raop_at_retrieval',
                'requester_number_of_comments_at_request','requester_number_of_comments_at_retrieval',
                'requester_number_of_comments_in_raop_at_request','requester_number_of_comments_in_raop_at_retrieval',
                'requester_number_of_posts_at_request','requester_number_of_posts_at_retrieval',
                'requester_number_of_posts_on_raop_at_request','requester_number_of_posts_on_raop_at_retrieval',
                'requester_number_of_subreddits_at_request','number_of_downvotes_of_request_at_retrieval',
                'number_of_upvotes_of_request_at_retrieval','requester_upvotes_minus_downvotes_at_request',
                'requester_upvotes_minus_downvotes_at_retrieval','requester_upvotes_plus_downvotes_at_request',
                'requester_upvotes_plus_downvotes_at_retrieval','requester_user_flair']]

In [36]:
#feature for requester_subreddits_at_request
#selecting number of subreddits as feature
x_data_df['num_requester_subreddits_at_request'] = x_data['requester_subreddits_at_request'].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
#feature for requester_user_flair
x_data_df = pd.get_dummies(x_data_df,prefix=['requester_user_flair'])

In [38]:
#normalize
scaler = StandardScaler()
x_data_df = scaler.fit_transform(x_data_df)

In [39]:
#divide into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data_df, y_data, train_size = 0.90, random_state=100)

In [40]:
#make a svm classifier and train it
model_svm = svm.SVC(kernel='linear')
model_svm.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [41]:
#make predictions
y_pred = model_svm.predict(x_test)

In [42]:
#metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       426
        True       1.00      1.00      1.00       142

    accuracy                           1.00       568
   macro avg       1.00      1.00      1.00       568
weighted avg       1.00      1.00      1.00       568



In [43]:
precision_recall_fscore_support(y_test, y_pred, average='weighted')

(1.0, 1.0, 1.0, None)

In [44]:
roc_auc_score(y_test, y_pred)

1.0

In [45]:
f1_score(y_test, y_pred)

1.0