In [1]:
%matplotlib inline
import csv
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import json
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split 
# Apply seaborn's "darkgrid" style globally so every figure drawn later in the
# notebook shares the same look.
sns.set(style="darkgrid")
import gensim
from gensim.models import Phrases, KeyedVectors, Word2Vec
import gensim.downloader
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

## Problem Introduction

We have a dataset in JSON format containing information about user activity in the Reddit forum **RAOP** (Random Acts of Pizza). Our feature set comes from a set of user features recorded at the time each request was posted in the forum, alongside the title and text the requester chose. The response variable is whether the requester received the pizza or not.

## Basic data exploration

### 1. Loading the full dataset

In [2]:
# Load the raw training records from disk.
# JSON is UTF-8 by specification; passing the encoding explicitly keeps the
# decoding of the free-text fields independent of the platform's default
# locale encoding.
with open('../data/random-acts-of-pizza/train.json', encoding='utf-8') as f:
    train_json_data = json.load(f)

# Flatten the parsed records into a DataFrame.
# `pd.io.json.json_normalize` has been deprecated since pandas 1.0 in favour of
# the top-level `pd.json_normalize`, which is a drop-in replacement.
df = pd.json_normalize(train_json_data)
df.head()

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,,0,1,False,t3_l25d7,0,Hi I am in need of food for my 4 children we a...,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,0.0,...,False,[],0,1,0,1,,nickylvst,1317853000.0,1317849000.0
1,,2,5,False,t3_rcb83,0,I spent the last money I had on gas today. Im ...,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",501.1111,...,False,"[AskReddit, Eve, IAmA, MontereyBay, RandomKind...",34,4258,116,11168,,fohacidal,1332652000.0,1332649000.0
2,,0,3,False,t3_lpu5j,0,My girlfriend decided it would be a good idea ...,My girlfriend decided it would be a good idea ...,"[Request] Hungry couple in Dundee, Scotland wo...",0.0,...,False,[],0,3,0,3,,jacquibatman7,1319650000.0,1319646000.0
3,,0,1,True,t3_mxvj3,4,"It's cold, I'n hungry, and to be completely ho...","It's cold, I'n hungry, and to be completely ho...","[Request] In Canada (Ontario), just got home f...",6.518438,...,False,"[AskReddit, DJs, IAmA, Random_Acts_Of_Pizza]",54,59,76,81,,4on_the_floor,1322855000.0,1322855000.0
4,,6,6,False,t3_1i6486,5,hey guys:\n I love this sub. I think it's grea...,hey guys:\n I love this sub. I think it's grea...,[Request] Old friend coming to visit. Would LO...,162.063252,...,False,"[GayBrosWeightLoss, RandomActsOfCookies, Rando...",1121,1225,1733,1887,,Futuredogwalker,1373658000.0,1373654000.0


In [4]:
# Report the size of the full training frame: row count first, then column count.
print('Total data rows in full training set: {}'.format(len(df)))
print('Total data columns in full training set: {}'.format(len(df.columns)))

Total data rows in full training set: 4040
Total data columns in full training set: 32


### 2. Splitting training set into training and validation

As we don't have a labeled test set, we have to split the existing training data into training and validation sets in order to evaluate our model's performance. We have already split the data into training and validation sets in a 90/10 ratio and saved them as CSV files, so we read those files directly into memory.

In [6]:
# Read the pre-computed training and validation splits, in that order.
train, val = map(
    pd.read_csv,
    ("../prakhar/train.csv", "../prakhar/validation.csv"),
)

In [7]:
# Show (rows, columns) for the training split, then the validation split,
# one per line.
print(train.shape, val.shape, sep='\n')

(3636, 32)
(404, 32)


### 3. Extracting relevant columns (which are present in the test data)

In [5]:
# Free-text fields of each request.
text_cols = ['request_title', 'request_text_edit_aware']

# Feature columns kept downstream: the text fields followed by the
# requester-activity count columns.
relevant_cols = text_cols + [
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
]

# Name of the response (label) column.
y_col = 'requester_received_pizza'

In [8]:
# Keep only the response column followed by the selected feature columns,
# applying the identical selection to both splits.
train, val = (split[[y_col] + relevant_cols] for split in (train, val))

In [9]:
# Confirm the column selection: (rows, columns) for the training split, then
# the validation split, one per line.
print(train.shape, val.shape, sep='\n')

(3636, 5)
(404, 5)


### 4. Exploration of 