In [1]:
from nltk import word_tokenize

In [11]:
sent = 'HE is walking with 3 dogs to SCHOOL.'

# Prepare functions we need
# Text Cleaning

## 1) Remove Stopwords & Punctuation

In [12]:
# Stopwords from nltk
from nltk.corpus import stopwords
stopwords_nltk_en = set(stopwords.words('english'))

# Stopwords from stopwords-json
stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into",
"inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","t
rying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}
stopwords_json_en = set(stopwords_json['en'])

# Punctuation
from string import punctuation
stopwords_punct = set(punctuation)

# Merge the three stopword sources into one set. The result is large,
# so it is deliberately not printed here.
stoplist_combined = stopwords_json_en.union(stopwords_nltk_en, stopwords_punct)

In [13]:
# Testing: stopwords
print([word for word in word_tokenize(sent) if word not in stoplist_combined])

['HE', 'walking', '3', 'dogs', 'SCHOOL']


In [14]:
# def remove_stopwords(sent):
#     '''remove stopwords & digits'''
#     return [word for word in word_tokenize(sent) if word not in stoplist_combined and not word.isdigit()]
    
# #     words = []
# #     for word in word_tokenize(sent):
# #         if word not in stoplist_combined and not word.isdigit():
# #             words.append(word)
# #     return words

# remove_stopwords(sent)

## 2) Lemmatization
having -> have

Lemmatization requires POS (Part of Speech) tag (e.g. noun, verb, adj., adv.)

In [15]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer


wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS tag.

    Only the first two characters of the tag are looked up, so for example
    'NNS' and 'NNP' both map to 'n', and 'VBZ' and 'VBG' both map to 'v'.

    Args:
        penntag: Penn Treebank tag string, e.g. 'NN', 'VBZ', 'JJ'.

    Returns:
        One of 'n', 'a', 'v' or 'r'. Tags without a mapping, and inputs
        that cannot be sliced or hashed (e.g. None), fall back to 'n'.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # Narrowed from a bare `except:` so KeyboardInterrupt and SystemExit
        # propagate; KeyError covers unmapped tags, TypeError covers input
        # that cannot be sliced or hashed.
        return 'n'
    
def lemmatize_sent(sent):
    """Tokenize, POS-tag and lemmatize a sentence.

    Args:
        sent: the input text as a single string.

    Returns:
        A list of lemmas, one per token, each lowercased before
        lemmatization.
    """
    lemmas = []
    for token, penn_tag in pos_tag(word_tokenize(sent)):
        # The tagger sees the original casing; only the lemmatizer input
        # is lowercased.
        wordnet_pos = penn2morphy(penn_tag)
        lemmas.append(wnl.lemmatize(token.lower(), pos=wordnet_pos))
    return lemmas

In [16]:
# Testing: lemmatize_sent()
lemmatize_sent(sent)

['he', 'be', 'walk', 'with', '3', 'dog', 'to', 'school', '.']

In [17]:
for word, tag in pos_tag(word_tokenize(sent)):
    print (word, tag)

HE NNP
is VBZ
walking VBG
with IN
3 CD
dogs NNS
to TO
SCHOOL NNP
. .


## 3) preprocess_text = Lemmatize + Remove Stopwords

In [18]:
def preprocess_text(text):
    """Lemmatize a document, then drop stopwords, punctuation and digits.

    Args:
        text: a document or sentence as a single string.

    Returns:
        A list of lemma strings that are not in `stoplist_combined` and
        are not made up solely of digits.
    """
    kept = []
    for lemma in lemmatize_sent(text):
        # Skip stopwords/punctuation first, then purely numeric tokens.
        if lemma in stoplist_combined or lemma.isdigit():
            continue
        kept.append(lemma)
    return kept

In [19]:
preprocess_text(sent)

['walk', 'dog', 'school']

# Let's start eating pizza

## Train set

In [53]:
import json

# Read the training records. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open('./input/train.json', encoding='utf-8') as fin:
    trainjson = json.load(fin)

In [54]:
trainjson[0]

{'giver_username_if_known': 'N/A',
 'number_of_downvotes_of_request_at_retrieval': 0,
 'number_of_upvotes_of_request_at_retrieval': 1,
 'post_was_edited': False,
 'request_id': 't3_l25d7',
 'request_number_of_comments_at_retrieval': 0,
 'request_text': 'Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated',
 'request_text_edit_aware': 'Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated',
 'request_title': 'Request Colorado Springs Help Us Please',
 'requester_accoun

In [55]:
print('UID:\t', trainjson[0]['request_id'], '\n')
print('Title:\t', trainjson[0]['request_title'], '\n')
print('Text:\t', trainjson[0]['request_text_edit_aware'], '\n')
print('Tag:\t', trainjson[0]['requester_received_pizza'], end='\n')

UID:	 t3_l25d7 

Title:	 Request Colorado Springs Help Us Please 

Text:	 Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated 

Tag:	 False


## Convert json to dataframe

In [56]:
import pandas as pd

# `pd.io.json.json_normalize` has been deprecated since pandas 1.0 in favour
# of the top-level `pd.json_normalize`. Prefer the public name and fall back
# to the old location only on older pandas releases.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    json_normalize = pd.io.json.json_normalize

# Flatten the list of JSON records into a table, one row per request.
df = json_normalize(trainjson)
# Keep only the id, the text fields and the label.
df_train = df[['request_id', 'request_title',
               'request_text_edit_aware',
               'requester_received_pizza']]
df_train.head()

Unnamed: 0,request_id,request_title,request_text_edit_aware,requester_received_pizza
0,t3_l25d7,Request Colorado Springs Help Us Please,Hi I am in need of food for my 4 children we a...,False
1,t3_rcb83,"[Request] California, No cash and I could use ...",I spent the last money I had on gas today. Im ...,False
2,t3_lpu5j,"[Request] Hungry couple in Dundee, Scotland wo...",My girlfriend decided it would be a good idea ...,False
3,t3_mxvj3,"[Request] In Canada (Ontario), just got home f...","It's cold, I'n hungry, and to be completely ho...",False
4,t3_1i6486,[Request] Old friend coming to visit. Would LO...,hey guys:\n I love this sub. I think it's grea...,False


## Test set

In [57]:
import json

# Read the test records. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open('./input/test.json', encoding='utf-8') as fin:
    testjson = json.load(fin)

In [58]:
print('UID:\t', testjson[0]['request_id'], '\n')
print('Title:\t', testjson[0]['request_title'], '\n')
print('Text:\t', testjson[0]['request_text_edit_aware'], '\n')
#print('Tag:\t', testjson[0]['requester_received_pizza'], end='\n')

UID:	 t3_i8iy4 

Title:	 [request] pregger gf 95 degree house and no food.. promise to pay it forward! Northern Colorado 

Text:	 Hey all! It's about 95 degrees here and our kitchen is pretty much empty save for some bread and cereal.  My girlfriend/fiance is 8 1/2 months pregnant and we could use a good meal.  We promise to pay it forward when we get money! Thanks so much in advance! 



## Convert json to dataframe

In [59]:
import pandas as pd

# `pd.io.json.json_normalize` has been deprecated since pandas 1.0 in favour
# of the top-level `pd.json_normalize`. Prefer the public name and fall back
# to the old location only on older pandas releases.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    json_normalize = pd.io.json.json_normalize

# Flatten the list of JSON records into a table, one row per request.
df = json_normalize(testjson)
# The test set has no label column, so keep only the id and text fields.
df_test = df[['request_id', 'request_title',
               'request_text_edit_aware']]
df_test.head()

Unnamed: 0,request_id,request_title,request_text_edit_aware
0,t3_i8iy4,[request] pregger gf 95 degree house and no fo...,Hey all! It's about 95 degrees here and our ki...
1,t3_1mfqi0,"[Request] Lost my job day after labour day, st...",I didn't know a place like this exists! \n\nI ...
2,t3_lclka,(Request) pizza for my kids please?,Hi Reddit. Im a single dad having a really rou...
3,t3_1jdgdj,[Request] Just moved to a new state(Waltham MA...,Hi I just moved to Waltham MA from my home sta...
4,t3_t2qt4,"[Request] Two girls in between paychecks, we'v...",We're just sitting here near indianapolis on o...


## Now we have df_train & df_test

In [84]:
from sklearn.model_selection import train_test_split 

# Hold out 20% of the labelled data for validation. The seed is fixed so
# the split, and every number derived from it, is the same on each
# Restart & Run All.
train, val = train_test_split(df_train, test_size=0.2, random_state=42)
print('train.shape:', train.shape)
print('val.shape:', val.shape)

train.shape: (3232, 4)
val.shape: (808, 4)


# Vectorize the train, val and test set

    fit_transform: train
    transform:     val & test

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the vectorizer and replace its analyzer entirely with
# preprocess_text(), so each document is turned into the list of lemmas
# that function returns.
# Note: the vectorizer is just an 'empty' object now.
count_vect = CountVectorizer(analyzer=preprocess_text)

# When we use `CountVectorizer.fit_transform`,
# we essentially create the dictionary and
# vectorize our input text at the same time.
train_set = count_vect.fit_transform(train['request_text_edit_aware'])   # sparse matrix
train_tags = train['requester_received_pizza']

# When vectorizing the validation data, we use `CountVectorizer.transform()`
# so the vocabulary fitted on the training split is reused unchanged.
val_set = count_vect.transform(val['request_text_edit_aware'])
val_tags = val['requester_received_pizza']

# When vectorizing the test data, we use `CountVectorizer.transform()`.
test_set = count_vect.transform(df_test['request_text_edit_aware'])

In [87]:
print('train_set.shape:', train_set.shape)
print('train_tags.shape:', train_tags.shape)
print()
print('val_set.shape:', val_set.shape)
print('val_tags.shape:', val_tags.shape)
print()
print('test_set.shape:', test_set.shape)

train_set.shape: (3232, 9570)
train_tags.shape: (3232,)

val_set.shape: (808, 9570)
val_tags.shape: (808,)

test_set.shape: (1631, 11075)


# Use Naive Bayes classifier in sklearn

In [67]:
## Training

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB() 

# To train the classifier, simply call .fit() with the vectorized
# training split and its labels.
clf.fit(train_set, train_tags) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [68]:
## Prediction

# To predict our tags (i.e. whether requesters get their pizza),
# we feed the vectorized `val_set` (the held-out validation split)
# to .predict()
predictions_valid = clf.predict(val_set)

In [69]:
## Evaluation

from sklearn.metrics import accuracy_score

# The validation labels are stored in `val_tags`; the previously used name
# `valid_tags` is never defined and raised NameError on a fresh kernel.
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order.
print('Pizza reception accuracy = {}'.format(
        accuracy_score(val_tags, predictions_valid) * 100)
     )

Pizza reception accuracy = 73.76237623762376


# Now let's use the full training data set to re-vectorize and retrain the classifier

In [73]:
## Vectorizing

# NOTE: this cell rebinds `count_vect` and `test_set`. Cells above that
# read those names (e.g. the shape printout) will see these new objects
# if they are re-run after this point.
count_vect = CountVectorizer(analyzer=preprocess_text)

# Fit the vocabulary on every labelled request, not just the training split.
full_train_set = count_vect.fit_transform(df_train['request_text_edit_aware'])   # train+val
full_tags = df_train['requester_received_pizza']

# Note: We have to re-vectorize the test set because the vectorizer
#       fitted on the full training set is a different one, and the
#       test matrix must be built with that same vectorizer.
test_set = count_vect.transform(df_test['request_text_edit_aware'])

In [74]:
## Training

# Reset the classifier
clf = MultinomialNB() 
clf.fit(full_train_set, full_tags) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [75]:
## Prediction

predictions = clf.predict(test_set)

**Note:** Since we don't have the `requester_received_pizza` field in test data, we can't measure accuracy. But we can do some exploration as shown below.

## In the training data, roughly 24.6% of requests received a pizza

In [76]:
# Count labelled requests and how many of them received a pizza, then
# report the share as a percentage.
n_requests = len(df_train)
n_received = sum(df_train['requester_received_pizza'])
success_rate = n_received / n_requests * 100

summary_template = ('Of {} requests, only {} gets their pizzas,'
                    ' {}% success rate...')
print(summary_template.format(n_requests, n_received, success_rate))

Of 4040 requests, only 994 gets their pizzas, 24.603960396039604% success rate...


## Our classifier predicts only a ~3% success rate on the test set

In [77]:
# Count test requests and how many the classifier predicts will receive
# a pizza, then report the share as a percentage.
n_predicted = len(predictions)
n_predicted_received = sum(predictions)
success_rate = n_predicted_received / n_predicted * 100

summary_template = ('Of {} requests, only {} gets their pizzas,'
                    ' {}% success rate...')
print(summary_template.format(n_predicted, n_predicted_received, success_rate))

Of 1631 requests, only 51 gets their pizzas, 3.126916002452483% success rate...


# Submission

In [78]:
df_sample_submission = pd.read_csv('./input/sampleSubmission.csv')
df_sample_submission.head()

Unnamed: 0,request_id,requester_received_pizza
0,t3_i8iy4,0
1,t3_1mfqi0,0
2,t3_lclka,0
3,t3_1jdgdj,0
4,t3_t2qt4,0


In [79]:
# `df_test` still carries the `request_id` column, so each id can be
# paired with its prediction.
submission_columns = {
    'request_id': list(df_test['request_id']),
    'requester_received_pizza': list(predictions),
}
df_output = pd.DataFrame(submission_columns)

# Cast the boolean predictions to integers.
df_output['requester_received_pizza'] = df_output['requester_received_pizza'].astype(int)
df_output.head()

Unnamed: 0,request_id,requester_received_pizza
0,t3_i8iy4,0
1,t3_1mfqi0,0
2,t3_lclka,0
3,t3_1jdgdj,0
4,t3_t2qt4,0


In [80]:
# Create the csv file. `index=False` stops pandas from writing the row index
# as an extra unnamed first column, so the file contains exactly the two
# columns `request_id` and `requester_received_pizza`.
df_output.to_csv('./output/basic-nlp-submission.csv', index=False)