# Random Acts of Pizza
Leveraging data from the Kaggle Competition (unsubmitted): https://www.kaggle.com/c/random-acts-of-pizza/data. <br>
Joint effort between Sarah Xie, Carolina Lee, and Ben O'Neil during Summer 2021 semester.

In [1]:
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import *
from sklearn import metrics
import warnings as warn

from scripts.length_and_date_engineering import *
from scripts.narratives import narr_assign
from scripts.politeness_and_image_engineering import *
from scripts.sentiment_and_objectivity import *
from scripts.top_words_usage import *

In [2]:
warn.filterwarnings("ignore")

In [3]:
train_df = pd.read_json("data/random-acts-of-pizza/train.json")
test_df = pd.read_json("data/random-acts-of-pizza/test.json")

## Train-test split

In [4]:
X = train_df #.drop("requester_received_pizza", axis=1)
y = train_df["requester_received_pizza"]
train_df, test_df, train_label, test_label = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

## Data cleaning
Store it all in a function so it can be easily applied to both the training and test sets (as well as the test set, at the end of the modeling phase).

In [5]:
def clean_training_data(input_data):
    """Wrapper function that applies all data cleaning & feature engineering steps to the TRAIN dataset."""
    df = input_data.copy()
    
    # combine and standardize title and text in train data
    df["title_and_request"] = df["request_text_edit_aware"] + " " + df["request_title"]
    df["title_and_request"] = df["title_and_request"].str.lower().str.strip()

    # leverage the featuring engineering methods created on train data
    df = create_length_of_post_and_title(df)
    df = create_title_pentagram(df)
    df = create_date_features(df)
    df = create_deltas(df)
    df["requester_user_flair"] = df["requester_user_flair"].replace(np.nan, "None")
    df = sentiment(df)
    df = subjectivity(df)
    df = has_text_binary_column_creator(df, image_text, "has_image")
    df = has_text_binary_column_creator(df,reciprocity_text,"has_reciprocity")
    df = has_text_binary_column_creator(df,polite_text,"has_polite")
    df = narr_assign(df)
    df, top_words = key_words_usage(df)
    
    # subset columns
    COLS_TO_KEEP = ["request_number_of_comments_at_retrieval",
                    "requester_account_age_in_days_at_request",
                    "change_in_number_requester_comments",
                    "change_in_number_requester_comments_raop",
                    "change_in_number_requester_posts",
                    "change_in_number_requester_posts_raop",
                    "change_in_requester_vote_status", 
                    "change_in_requester_engagement",
                    "total_request_votes_at_retrieval",
                    "percent_request_upvotes_at_retrieval",
                    "requester_days_since_first_post_on_raop_at_request",
                    "requester_number_of_subreddits_at_request", 
                    "length_of_original_post",
                    "length_of_title", "day_of_week", "month_of_year", "time_bucket",
                    "sentiment", "subjectivity", "has_image", "has_reciprocity",
                    "has_polite", "money_check", "job_check",
                    "student_check", "family_check", "craving_check", "key_words_usage"]
    
    df = df[COLS_TO_KEEP]
    
    # one-hot encode the categorical features
    OHE_COLS = ["day_of_week", "month_of_year", "time_bucket"]
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
    ohe.fit(df[OHE_COLS])
    transformed_df = pd.DataFrame(ohe.transform(df[OHE_COLS]),
                                  columns=ohe.get_feature_names(input_features=OHE_COLS))
    
    df = df.drop(OHE_COLS, axis=1).reset_index(drop=True)
    df = pd.concat([df, transformed_df], axis=1)

    df = df.replace(np.nan, 0)
    
    return df, top_words, ohe

In [6]:
def clean_test_data(input_data, top_word_list, one_hot_encoder):
    """Wrapper function that applies all data cleaning & feature engineering steps to the TEST dataset."""
    df = input_data.copy()
    
    # combine and standardize title and text in train data
    df["title_and_request"] = df["request_text_edit_aware"] + " " + df["request_title"]
    df["title_and_request"] = df["title_and_request"].str.lower().str.strip()

    # leverage the featuring engineering methods created on train data
    df = create_length_of_post_and_title(df)
    df = create_title_pentagram(df)
    df = create_date_features(df)
    df = create_deltas(df)
    df["requester_user_flair"] = df["requester_user_flair"].replace(np.nan, "None")
    df = sentiment(df)
    df = subjectivity(df)
    df = has_text_binary_column_creator(df, image_text, "has_image")
    df = has_text_binary_column_creator(df,reciprocity_text,"has_reciprocity")
    df = has_text_binary_column_creator(df,polite_text,"has_polite")
    df = narr_assign(df)
    
    # key words usage
    top_features = [w[0] for w in top_word_list]
    
    def wordCounterUsage(text):
        count_of_key_words = 0
        for i in top_features:
            in_text = 1 if i in text else 0
            count_of_key_words += in_text
        return count_of_key_words/len(top_features)

    df['key_words_usage'] = [wordCounterUsage(text) for text in df['title_and_request']]
    
    # subset columns
    COLS_TO_KEEP = ["request_number_of_comments_at_retrieval",
                    "requester_account_age_in_days_at_request",
                    "change_in_number_requester_comments",
                    "change_in_number_requester_comments_raop",
                    "change_in_number_requester_posts",
                    "change_in_number_requester_posts_raop",
                    "change_in_requester_vote_status", 
                    "change_in_requester_engagement",
                    "total_request_votes_at_retrieval",
                    "percent_request_upvotes_at_retrieval",
                    "requester_days_since_first_post_on_raop_at_request",
                    "requester_number_of_subreddits_at_request", 
                    "length_of_original_post",
                    "length_of_title", "day_of_week", "month_of_year", "time_bucket",
                    "sentiment", "subjectivity", "has_image", "has_reciprocity",
                    "has_polite", "money_check", "job_check",
                    "student_check", "family_check", "craving_check", "key_words_usage"]
    
    df = df[COLS_TO_KEEP]
    
    # one-hot encode the categorical features
    OHE_COLS = ["day_of_week", "month_of_year", "time_bucket"]
    transformed_df = pd.DataFrame(one_hot_encoder.transform(df[OHE_COLS]),
                                  columns=ohe.get_feature_names(input_features=OHE_COLS))
    
    df = df.drop(OHE_COLS, axis=1).reset_index(drop=True)
    df = pd.concat([df, transformed_df], axis=1)

    df = df.replace(np.nan, 0)
    
    return df

In [7]:
# apply data cleaning to steps above
train_df, top_words, ohe = clean_training_data(train_df)
train_label = train_label * 1
test_df = clean_test_data(test_df, top_words, ohe)
test_label = test_label * 1

In [9]:
# implement standard scaler
COLS_TO_SCALE = ['request_number_of_comments_at_retrieval',
       'requester_account_age_in_days_at_request',
       'change_in_number_requester_comments',
       'change_in_number_requester_comments_raop',
       'change_in_number_requester_posts',
       'change_in_number_requester_posts_raop',
       'change_in_requester_vote_status', 'change_in_requester_engagement',
       'total_request_votes_at_retrieval',
       'percent_request_upvotes_at_retrieval',
       'requester_days_since_first_post_on_raop_at_request',
       'requester_number_of_subreddits_at_request', 'length_of_original_post',
       'length_of_title', 'sentiment', 'key_words_usage']

scaler = StandardScaler()
scaler.fit(train_df[COLS_TO_SCALE])
transformed_data = scaler.transform(train_df[COLS_TO_SCALE])
scaled_train_df = train_df.copy()
scaled_train_df[COLS_TO_SCALE] = transformed_data.copy()

In [10]:
scaled_test_df = test_df.copy()
transformed_data = scaler.transform(test_df[COLS_TO_SCALE])
scaled_test_df[COLS_TO_SCALE] = transformed_data.copy()

## Baseline Model

In [13]:
# logistic regression for baseline
log_model = LogisticRegression()
log_model.fit(scaled_train_df, train_label)
y_pred = log_model.predict(scaled_test_df)
y_prob = log_model.predict_proba(scaled_test_df)
y_prob = y_prob.transpose()[0]

# print accuracy
print(f"Accuracy: {log_model.score(scaled_test_df, test_label)} \n")

# print ROC
print(f"ROC AUC: {roc_auc_score(test_label, y_prob)} \n")

# confusion matrix
print(f"Confusion matrix:\n {confusion_matrix(test_label, y_pred)}")

Accuracy: 0.8353960396039604 

ROC AUC: 0.14963982473946086 

Confusion matrix:
 [[570  39]
 [ 94 105]]


## Additional Modeling
Improve upon the baseline model

_This is the place in the notebook where team members may diverge. Ben and Sarah will be trying different model types to improve upon the baseline model._

### Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

In [16]:
# might need to scale the data, can use StandardScaler()
rf_model = RandomForestClassifier(max_depth=100)
rf_model.fit(train_df, train_label)
y_pred = rf_model.predict(test_df)
y_prob = rf_model.predict_proba(test_df)
y_prob = y_prob.transpose()[0]

# print accuracy
print(f"Accuracy: {rf_model.score(test_df, test_label)} \n")

# print ROC
print(f"ROC AUC: {roc_auc_score(test_label, y_prob)} \n")

# confusion matrix
print(f"Confusion matrix:\n {confusion_matrix(test_label, y_pred)}")

Accuracy: 0.8465346534653465 

ROC AUC: 0.1237674414766773 

Confusion matrix:
 [[585  24]
 [100  99]]


In [18]:
# inspect feature importance
feature_importance = pd.DataFrame({"features": train_df.columns,
                                   "importance": rf_model.feature_importances_})
feature_importance.sort_values("importance", ascending=False).head(15)

Unnamed: 0,features,importance
5,change_in_number_requester_posts_raop,0.168032
0,request_number_of_comments_at_retrieval,0.096129
3,change_in_number_requester_comments_raop,0.070342
6,change_in_requester_vote_status,0.052847
7,change_in_requester_engagement,0.049944
12,length_of_original_post,0.048789
2,change_in_number_requester_comments,0.044091
13,length_of_title,0.043733
9,percent_request_upvotes_at_retrieval,0.040254
24,key_words_usage,0.039781


In [20]:
# try random forest again with subset of features
SUBSET_COLS = feature_importance.sort_values("importance", ascending=False).head(10)["features"].tolist()

rf_model = RandomForestClassifier(max_depth=100)
rf_model.fit(train_df[SUBSET_COLS], train_label)
y_pred = rf_model.predict(test_df[SUBSET_COLS])
y_prob = rf_model.predict_proba(test_df[SUBSET_COLS])
y_prob = y_prob.transpose()[0]

# print ROC
print(f"ROC AUC: {roc_auc_score(test_label, y_prob)} \n")
# confusion matrix
print(f"Confusion matrix:\n {confusion_matrix(test_label, y_pred)}")

ROC AUC: 0.1529569027403025 

Confusion matrix:
 [[575  34]
 [ 97 102]]


### XGBoost?

In [21]:
from sklearn.ensemble import GradientBoostingClassifier as GBC

In [22]:
gb_model = GBC(learning_rate=1, max_depth=5)
gb_model.fit(train_df, train_label)
y_prob = gb_model.predict_proba(test_df)
y_prob = y_prob.transpose()[0]

print(f"ROC AUC: {roc_auc_score(test_label, y_prob)} \n")

ROC AUC: 0.17393205766104744 



In [23]:
gb_model = GBC(learning_rate=1, max_depth=5)
gb_model.fit(train_df[SUBSET_COLS], train_label)
y_prob = gb_model.predict_proba(test_df[SUBSET_COLS])
y_prob = y_prob.transpose()[0]

print(f"ROC AUC with SUBSET_COLS: {roc_auc_score(test_label, y_prob)} \n")

ROC AUC with SUBSET_COLS: 0.2119794374169699 



### K Nearest Neighbors
Second-best result against the Logistic Regression (baseline)

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_df, train_label)
y_prob = knn_model.predict_proba(test_df)
y_prob = y_prob.transpose()[0]

print(f"ROC AUC: {roc_auc_score(test_label, y_prob)} \n")

ROC AUC: 0.3796610309346403 



In [26]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_df[SUBSET_COLS], train_label)
y_prob = knn_model.predict_proba(test_df[SUBSET_COLS])
y_prob = y_prob.transpose()[0]

print(f"ROC AUC with SUBSET_COLS: {roc_auc_score(test_label, y_prob)} \n")

ROC AUC with SUBSET_COLS: 0.36940036801412646 



### Gaussian Naive Bayes
Best performance so far

In [27]:
from sklearn.naive_bayes import GaussianNB

In [28]:
nb_model = GaussianNB(var_smoothing=0.06)
nb_model.fit(train_df, train_label)
y_prob = nb_model.predict_proba(test_df)
y_prob = y_prob.transpose()[0]

print(f"ROC AUC: {roc_auc_score(test_label, y_prob)} \n")

ROC AUC: 0.502652837256892 



In [29]:
nb_model = GaussianNB(var_smoothing=0.06)
nb_model.fit(train_df[SUBSET_COLS], train_label)
y_prob = nb_model.predict_proba(test_df[SUBSET_COLS])
y_prob = y_prob.transpose()[0]

print(f"ROC AUC with SUBSET_COLS: {roc_auc_score(test_label, y_prob)} \n")

ROC AUC with SUBSET_COLS: 0.5058131379392858 

