In [7]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk

import json 
import pandas as pd 
from pandas import json_normalize

In [8]:
#Load training data
with open('../data/random-acts-of-pizza/train.json') as f:
    train_json = json.load(f)
    
# Load test data
with open('../data/random-acts-of-pizza/test.json') as f:
    test_json = json.load(f)

train = json_normalize(train_json)
test = json_normalize(test_json)

print("Train Shape:", train.shape)
print("Test Shape:", test.shape)


Train Shape: (4040, 32)
Test Shape: (1631, 17)


In [9]:
train.head()

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,,0,1,False,t3_l25d7,0,Hi I am in need of food for my 4 children we a...,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,0.0,...,False,[],0,1,0,1,,nickylvst,1317853000.0,1317849000.0
1,,2,5,False,t3_rcb83,0,I spent the last money I had on gas today. Im ...,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",501.1111,...,False,"[AskReddit, Eve, IAmA, MontereyBay, RandomKind...",34,4258,116,11168,,fohacidal,1332652000.0,1332649000.0
2,,0,3,False,t3_lpu5j,0,My girlfriend decided it would be a good idea ...,My girlfriend decided it would be a good idea ...,"[Request] Hungry couple in Dundee, Scotland wo...",0.0,...,False,[],0,3,0,3,,jacquibatman7,1319650000.0,1319646000.0
3,,0,1,True,t3_mxvj3,4,"It's cold, I'n hungry, and to be completely ho...","It's cold, I'n hungry, and to be completely ho...","[Request] In Canada (Ontario), just got home f...",6.518438,...,False,"[AskReddit, DJs, IAmA, Random_Acts_Of_Pizza]",54,59,76,81,,4on_the_floor,1322855000.0,1322855000.0
4,,6,6,False,t3_1i6486,5,hey guys:\n I love this sub. I think it's grea...,hey guys:\n I love this sub. I think it's grea...,[Request] Old friend coming to visit. Would LO...,162.063252,...,False,"[GayBrosWeightLoss, RandomActsOfCookies, Rando...",1121,1225,1733,1887,,Futuredogwalker,1373658000.0,1373654000.0


In [10]:
train_data = train[:2800]
train_labels = train[:2800]['requester_received_pizza']
test_data = test[:]

dev_data = train[2800:]
dev_labels = train[2800:]['requester_received_pizza']

print('training label shape:', train_labels.shape)
print('dev label shape:',      dev_labels.shape)
print('training data shape:', train_data.shape)
print('test data shape:',      test_data.shape)
print('dev data shape:',     dev_data.shape)

training label shape: (2800,)
dev label shape: (1240,)
training data shape: (2800, 32)
test data shape: (1631, 17)
dev data shape: (1240, 32)


### Examine Data
---

 1. For first 5 training examples, print the title of request

In [11]:
def display_request(num_examples=5):

    for i in range(num_examples):
        print(train_data.iloc[i]['request_title']) 
        print(train_data.iloc[i]['request_text'])
        print('Received Pizza: ', train_data.iloc[i]['requester_received_pizza']) 
        print('\n')  

        
display_request(5)

Request Colorado Springs Help Us Please
Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated
Received Pizza:  False


[Request] California, No cash and I could use some dinner
I spent the last money I had on gas today. Im broke until next Thursday :(
Received Pizza:  False


[Request] Hungry couple in Dundee, Scotland would love some pizza!
My girlfriend decided it would be a good idea to get off at Perth bus station when she was coming to visit me and has since had to spend all her money on a taxi to get to me here in Dundee. Any chance some kind soul would get us some pizza since we don't have any cash anymore?
Received Pizza:  False


[Request] In Canada (Ontario), just got home from school. Need pizza.
It's cold, I'n hungr

In [12]:
print('*** 1. Unigram Feature Vectors ***')
title_train_data = train_data['request_title']
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(title_train_data)

#What is the size of the vocabulary?
print('Size of vocabulary: ', vector.shape[1])
print("0th feature: ", vectorizer.get_feature_names()[0])
print("Last feature: ", vectorizer.get_feature_names()[-1])
print("\n")


vectorizer2 = CountVectorizer(ngram_range=(2, 2))
vector2 = vectorizer2.fit_transform(title_train_data)
print('Size of vocabulary: ', vector2.shape[1])
print("0th feature: ", vectorizer2.get_feature_names()[0])
print("Last feature: ", vectorizer2.get_feature_names()[-1])


print('\n*** 4. Prune words that appear in fewer than 10 documents ***')
vectorizer = CountVectorizer(min_df=10)
vector = vectorizer.fit_transform(title_train_data)
print('Size of vocabulary: ', vector.shape[1])
print("0th feature: ", vectorizer.get_feature_names()[0])
print("Last feature: ", vectorizer.get_feature_names()[-1])
vocab = vectorizer.vocabulary_ 

print("\n")

*** 1. Unigram Feature Vectors ***
Size of vocabulary:  3672
0th feature:  000
Last feature:  ಠ_ಠ


Size of vocabulary:  16072
0th feature:  000 bucks
Last feature:  zucchini and

*** 4. Prune words that appear in fewer than 10 documents ***
Size of vocabulary:  441
0th feature:  able
Last feature:  your




In [31]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

def text_preprocessor(text):
    text = text.lower()
    text = re.sub("\W",' ',text) # replace non-alphanumeric
    text = re.sub("_",' ',text) # replace non-alphanumeric
    text = re.sub('\n', '', text)   
    return text

print('\n***  Naive Bayes models ***') 
title_train_data = train_data['request_title']
title_dev_data = dev_data['request_title']
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english', preprocessor=text_preprocessor)
train_vector = vectorizer.fit_transform(title_train_data)

dev_vector = vectorizer.transform(title_dev_data)

mnb = MultinomialNB(alpha=0.01)
mnb.fit(train_vector, train_labels)

#Evaluate performance on the dev set.
pred_mnb = mnb.predict(dev_vector)
score = metrics.f1_score(dev_labels, pred_mnb, average="weighted")
print(f"f1 score = {score}")

for i, row in enumerate(pred_mnb):
    if row==True:
        print(dev_data.iloc[i]['request_title'], dev_data.iloc[i]['requester_received_pizza'] )  





***  Naive Bayes models ***
f1 score = 0.6460174723351423
[Request] I'm a week away from payday and would love something other than rice and beans. Will return the favor, plus extra after Wednesday! False
[Request] Check has not cleared yet. North hollywood  False
[Request] I'm tired of rice and ramen and a little pizza would really brighten my mood. (St. Petersburg, FL) False
[request] Its been one hell of a month False
[Request] Saint Petersburg, FL: New in Town and Jobless True
[REQUEST]Living Alone and in Abject Poverty False
(Request) if you can give a pizza please read True
[REQUEST] Sydney Australia - Having a really shitty week, and literally no money. False
[Request] Beer Craft Pizza False
[REQUEST] After a long day of moving False
[Request] Need food for starving students! False
[Request] Car issues, Freezing Cold and a Empty wallet.  False
[Request] Rough week at school with my suite-mates False
[REQUEST] False
[REQUEST]Poor high schooler, pizza is half price because i work

In [32]:
title_train_data = train_data['request_text']
title_dev_data = dev_data['request_text']
vectorizer = CountVectorizer(ngram_range=(1,1), stop_words='english', preprocessor=text_preprocessor)
train_vector = vectorizer.fit_transform(title_train_data)

dev_vector = vectorizer.transform(title_dev_data)

lr_processed = LogisticRegression(C=0.5, solver="liblinear", multi_class="auto")

lr_processed.fit(train_vector, train_labels)

pred = lr_processed.predict(dev_vector) #evaluate on transformed dev data

score_processed = metrics.f1_score(dev_labels, pred, average="weighted")
print(f"f1 score = {score_processed}")

for i, row in enumerate(pred):
    if row==True:
        print(dev_data.iloc[i]['request_title'], dev_data.iloc[i]['requester_received_pizza'] )  

f1 score = 0.7003223691994663
[Request] I'm a week away from payday and would love something other than rice and beans. Will return the favor, plus extra after Wednesday! False
[Request] Bronx, New York decided to quit heroin yesterday, I think tonight i'll be able to eat again (with a little help!) True
[Request] I'm tired of rice and ramen and a little pizza would really brighten my mood. (St. Petersburg, FL) False
[request] just hungry, new baby True
(Request) Athens, GA - College student in a bind - will pay forward True
[Request] Panama City, FL USA Family of 5 False
[Request] Columbia, SC False
[Request] Anyone willing to do a trade? False
[Request] Snowed in by a Canadian winter storm. False
Request Pizza For Mercedes! False
[REQUEST] Austin, TX  False
[Request][Califorina] Wife and I hungry on our first anniversary!  Will pay it forward! True
[Request] Hopeful guy that wants to end a long week on a good note -West Point, MS False
[Request] Could use a pie in Seattle. False
[Req

In [104]:
dev_data.iloc[31]

giver_username_if_known                                                                               N/A
number_of_downvotes_of_request_at_retrieval                                                             0
number_of_upvotes_of_request_at_retrieval                                                               1
post_was_edited                                                                                     False
request_id                                                                                       t3_pk4tr
request_number_of_comments_at_retrieval                                                                 1
request_text                                            Hey guys so my Jetta's engine light came on to...
request_text_edit_aware                                 Hey guys so my Jetta's engine light came on to...
request_title                                           [Request] Car broke, no food at house and star...
requester_account_age_in_days_at_request      

In [82]:
# Understand nature of the data .info() .describe()
# Histograms and boxplots 
# Value counts 
# Missing data 
# Correlation between the metrics 
# Explore interesting themes 
    # Wealthy survive? 
    # By location 
    # Age scatterplot with ticket price 
    # Young and wealthy Variable? 
    # Total spent? 
# Feature engineering 
# preprocess data together or use a transformer? 
    # use label for train and test   
# Scaling?

# Model Baseline 
# Model comparison with CV 

In [9]:
train.describe()

Unnamed: 0,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,request_number_of_comments_at_retrieval,requester_account_age_in_days_at_request,requester_account_age_in_days_at_retrieval,requester_days_since_first_post_on_raop_at_request,requester_days_since_first_post_on_raop_at_retrieval,requester_number_of_comments_at_request,requester_number_of_comments_at_retrieval,requester_number_of_comments_in_raop_at_request,...,requester_number_of_posts_at_retrieval,requester_number_of_posts_on_raop_at_request,requester_number_of_posts_on_raop_at_retrieval,requester_number_of_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,unix_timestamp_of_request,unix_timestamp_of_request_utc
count,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,...,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0
mean,2.424505,6.180446,2.87104,254.586579,757.69272,16.417034,518.993205,115.098267,289.425743,0.64505,...,41.151733,0.063614,1.239109,18.076733,1160.07995,2720.342079,3743.236,7788.069,1342829000.0,1342826000.0
std,3.023101,10.74632,4.723339,303.27573,333.035728,70.651428,267.872623,193.318968,357.416133,3.413813,...,80.798543,0.325773,0.603083,21.736465,3718.365515,6264.378878,25838.16,39167.41,23330570.0,23329890.0
min,0.0,0.0,0.0,0.0,45.291562,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-173.0,-173.0,0.0,0.0,1297723000.0,1297723000.0
25%,1.0,2.0,0.0,3.473168,522.248455,0.0,279.009051,0.0,8.0,0.0,...,2.0,0.0,1.0,1.0,3.0,22.0,9.0,52.0,1320469000.0,1320466000.0
50%,2.0,4.0,1.0,157.06717,753.270874,0.0,528.781939,24.0,114.0,0.0,...,13.0,0.0,1.0,11.0,174.5,708.0,351.0,1283.5,1342565000.0,1342561000.0
75%,3.0,7.0,4.0,390.092653,900.349838,0.0,776.22667,140.25,479.0,0.0,...,46.0,0.0,1.0,27.0,1163.75,3304.0,2303.75,6829.0,1364618000.0,1364614000.0
max,47.0,345.0,61.0,2809.750787,2879.276319,785.457685,1025.407593,994.0,1000.0,88.0,...,999.0,5.0,9.0,186.0,155010.0,223708.0,1286864.0,2046482.0,1381552000.0,1381523000.0


In [3]:
train.head()

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,,0,1,False,t3_l25d7,0,Hi I am in need of food for my 4 children we a...,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,0.0,...,False,[],0,1,0,1,,nickylvst,1317853000.0,1317849000.0
1,,2,5,False,t3_rcb83,0,I spent the last money I had on gas today. Im ...,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",501.1111,...,False,"[AskReddit, Eve, IAmA, MontereyBay, RandomKind...",34,4258,116,11168,,fohacidal,1332652000.0,1332649000.0
2,,0,3,False,t3_lpu5j,0,My girlfriend decided it would be a good idea ...,My girlfriend decided it would be a good idea ...,"[Request] Hungry couple in Dundee, Scotland wo...",0.0,...,False,[],0,3,0,3,,jacquibatman7,1319650000.0,1319646000.0
3,,0,1,True,t3_mxvj3,4,"It's cold, I'n hungry, and to be completely ho...","It's cold, I'n hungry, and to be completely ho...","[Request] In Canada (Ontario), just got home f...",6.518438,...,False,"[AskReddit, DJs, IAmA, Random_Acts_Of_Pizza]",54,59,76,81,,4on_the_floor,1322855000.0,1322855000.0
4,,6,6,False,t3_1i6486,5,hey guys:\n I love this sub. I think it's grea...,hey guys:\n I love this sub. I think it's grea...,[Request] Old friend coming to visit. Would LO...,162.063252,...,False,"[GayBrosWeightLoss, RandomActsOfCookies, Rando...",1121,1225,1733,1887,,Futuredogwalker,1373658000.0,1373654000.0


In [5]:
train.shape

(4040, 32)

In [11]:
pizza = train[train['requester_received_pizza']==True]
no_pizza = train[train['requester_received_pizza']==False]
print(len(pizza))
print(len(no_pizza))

994
3046


In [12]:
pizza.describe()

Unnamed: 0,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,request_number_of_comments_at_retrieval,requester_account_age_in_days_at_request,requester_account_age_in_days_at_retrieval,requester_days_since_first_post_on_raop_at_request,requester_days_since_first_post_on_raop_at_retrieval,requester_number_of_comments_at_request,requester_number_of_comments_at_retrieval,requester_number_of_comments_in_raop_at_request,...,requester_number_of_posts_at_retrieval,requester_number_of_posts_on_raop_at_request,requester_number_of_posts_on_raop_at_retrieval,requester_number_of_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,unix_timestamp_of_request,unix_timestamp_of_request_utc
count,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0,...,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0
mean,2.566398,8.10161,5.261569,276.766844,828.179075,28.563757,579.609191,126.528169,355.248491,1.429577,...,47.98994,0.141851,1.725352,19.509054,1372.060362,3572.609658,4923.7,10941.21,1338609000.0,1338606000.0
std,3.390663,13.085617,5.800128,312.79121,353.852857,96.767955,262.02977,202.328504,377.352464,5.914937,...,89.005244,0.526249,0.900946,21.352374,5419.691555,9161.651778,42166.37,68192.21,23179880.0,23179560.0
min,0.0,0.0,0.0,0.0,45.291562,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-39.0,-9.0,0.0,2.0,1306095000.0,1306091000.0
25%,1.0,3.0,2.0,39.110437,604.934381,0.0,352.477613,1.0,31.25,0.0,...,4.0,0.0,1.0,3.0,23.25,139.0,45.0,272.25,1316329000.0,1316325000.0
50%,2.0,5.0,4.0,179.528218,821.140781,0.0,622.100469,35.0,187.5,0.0,...,17.0,0.0,2.0,13.0,285.5,1207.0,562.0,2272.0,1336384000.0,1336381000.0
75%,3.0,9.0,7.0,396.927688,978.019358,0.0,827.571742,155.0,688.5,0.0,...,53.0,0.0,2.0,28.0,1382.75,4344.5,2937.25,8871.25,1359247000.0,1359247000.0
max,35.0,276.0,61.0,2124.36265,2733.29706,785.457685,928.507593,990.0,1000.0,88.0,...,987.0,5.0,9.0,128.0,155010.0,223708.0,1286864.0,2046482.0,1381297000.0,1381268000.0


In [33]:
print(no_pizza["requester_account_age_in_days_at_request"].mean())
print(pizza["requester_account_age_in_days_at_request"].mean())
print('\nrequest_number_of_comments_at_retrieval')
print(no_pizza["request_number_of_comments_at_retrieval"].mean())
print(pizza["request_number_of_comments_at_retrieval"].mean())
print('\nnumber_of_upvotes_of_request_at_retrieval')
print(no_pizza["number_of_upvotes_of_request_at_retrieval"].mean())
print(pizza["number_of_upvotes_of_request_at_retrieval"].mean())
print('\nrequester_number_of_comments_in_raop_at_request')
print(no_pizza["requester_number_of_comments_in_raop_at_request"].mean())
print(pizza["requester_number_of_comments_in_raop_at_request"].mean())
print('\nrequester_number_of_posts_at_retrieval')
print(no_pizza["requester_number_of_posts_at_retrieval"].mean())
print(pizza["requester_number_of_posts_at_retrieval"].mean())
print('\nrequester_number_of_subreddits_at_request')
print(no_pizza["requester_number_of_subreddits_at_request"].mean())
print(pizza["requester_number_of_subreddits_at_request"].mean())

no_pizza["requester_user_flair"]

247.34850245692647
276.7668435031487

request_number_of_comments_at_retrieval
2.0909389363099145
5.261569416498994

number_of_upvotes_of_request_at_retrieval
5.553512803676953
8.101609657947686

requester_number_of_comments_in_raop_at_request
0.3890347997373605
1.4295774647887325

requester_number_of_posts_at_retrieval
38.92022324359816
47.989939637826964

requester_number_of_subreddits_at_request
17.609323703217335
19.509054325955734


0       None
1       None
2       None
3       None
4       None
        ... 
4034    None
4035    None
4037    None
4038    None
4039    None
Name: requester_user_flair, Length: 3046, dtype: object

In [28]:
print(len(pizza["request_title"])

5       [REQUEST] I'll give a two week xbox live code ...
9               [REQUEST]We're in need of some om noms...
10      [REQUEST] Bummed out in Chicago. Too broke to ...
16      [request] Cookeville, TN. My dog recently died...
18      [Request] Virginia. Girlfriend and I our sick,...
                              ...                        
4004    [request] broken shoulder, out of work, very h...
4008    [Request] Atlanta, Georgia Suffered from a sad...
4012    [REQUEST] Essex Junction, VT - Nearly broke, n...
4024    [request] Unemployment check never came, waiti...
4036    [Request][USA] Papa Johns is giving away one f...
Name: request_title, Length: 994, dtype: object
