In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Reading json file to understand its structure
import json
import pandas as pd
from pandas.io.json import json_normalize

# Converting Train data into a dataframe
with open('train.json') as train_file:
    dict_train = json.load(train_file)
    df_train = pd.io.json.json_normalize(dict_train)

print df_train.head()

# Extracting Required fields
df_train = df_train[['request_id', 'request_title', 
                     'request_text_edit_aware', 
                     'requester_received_pizza']]

df_train['requester_received_pizza'] = df_train['requester_received_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))

print df_train['requester_received_pizza'].head()
print '--------------------------------------------------------------------------------------'

# Converting Test data into a dataframe
with open('test.json') as test_file:
    dict_test = json.load(test_file)
    df_test = pd.io.json.json_normalize(dict_test)

# Extracting Required fields
df_test = df_test[['request_id', 'request_title', 
               'request_text_edit_aware']]

#print df_test.head()
print '--------------------------------------------------------------------------------------'

  giver_username_if_known  number_of_downvotes_of_request_at_retrieval  \
0                     N/A                                            0   
1                     N/A                                            2   
2                     N/A                                            0   
3                     N/A                                            0   
4                     N/A                                            6   

   number_of_upvotes_of_request_at_retrieval post_was_edited request_id  \
0                                          1           False   t3_l25d7   
1                                          5           False   t3_rcb83   
2                                          3           False   t3_lpu5j   
3                                          1            True   t3_mxvj3   
4                                          6           False  t3_1i6486   

   request_number_of_comments_at_retrieval  \
0                                        0   
1           

In [2]:
# Text Processing
from bs4 import BeautifulSoup  # HTML to text
from nltk.corpus import stopwords  # String cleaning
import nltk.data  # To load sentence tokenizer
import re

def preprocess_text( text ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    # print ("Length of the Review-",len(text))
    #review_text = BeautifulSoup(text,"html.parser").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", text) 
    #
    # 3. Tokenization : Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    words_M = [w for w in words if not w in stops]   
    #
    # 6. WordNetLemmatizer should better be used with POS tagging
    wl = nltk.WordNetLemmatizer()
    words_L = [wl.lemmatize(word) for word in words_M]
    #
    # 7. PorterStemmer(), LancasterStemmer(), SnowballStemmer()
    # stemming can often create non-existent words, whereas lemmas are actual words.
    ps = nltk.PorterStemmer()
    words_S = [ps.stem(word) for word in words_L]
    # 8. Join the words back into one string separated by space, 
    # and return the result.
    #print '--------'
    #print text
    #print '--------'
    #print " ".join( words_M )
    return( " ".join( words_S ))    

In [3]:
# Splitting Training data into training and validation data
from sklearn.model_selection import train_test_split 
df_train_t, df_train_v = train_test_split(df_train, test_size=0.2)

df_train.describe()

Unnamed: 0,requester_received_pizza
count,4040.0
mean,0.24604
std,0.430755
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the vectorizer with custom analyzer
count_vect = CountVectorizer(analyzer=preprocess_text)

# CounterVectorizer.fit_transform - Create the dictionary and vectorizes input text
df_train_set = count_vect.fit_transform(df_train['request_text_edit_aware'])
df_train_tags = df_train['requester_received_pizza']

#print count_vect.vocabulary_

# CounterVectorizer.fit_transform - Create the dictionary and vectorizes input text
df_train_t_set = count_vect.fit_transform(df_train_t['request_text_edit_aware'])
df_train_t_tags = df_train_t['requester_received_pizza']

# Using CountVectorizer.transform() on Validation data
df_train_v_set = count_vect.transform(df_train_v['request_text_edit_aware'])
df_train_v_tags = df_train_v['requester_received_pizza']

# Using CountVectorizer.transform() on Test data
df_test_t_set = count_vect.transform(df_test['request_text_edit_aware'])
#df_test_t_tags = df_test['requester_received_pizza']

In [5]:
# Applying Naive Bayes classifier on Validation Dataset to Predict results
clf = MultinomialNB() 

clf.fit(df_train_t_set, df_train_t_tags) 

predictions_valid = clf.predict(df_train_v_set)

print('Pizza reception accuracy = {}'.format(accuracy_score(predictions_valid, df_train_v_tags) * 100))

Pizza reception accuracy = 75.0


In [6]:
# Applying Naive Bayes classifier on Full Dataset to Predict results
clf = MultinomialNB() 
clf.fit(df_train_set, df_train_tags) 

predictions = clf.predict(df_test_t_set)

In [7]:
# Uploading results to Kaggle
df_output = pd.DataFrame({'request_id': list(df_test['request_id']), 
                          'requester_received_pizza': list(predictions)})

df_output['requester_received_pizza'] = df_output['requester_received_pizza']

print df_output.shape
print df_output.head()

df_output.to_csv('submission.csv', index = False)

(1631, 2)
  request_id  requester_received_pizza
0   t3_i8iy4                         0
1  t3_1mfqi0                         0
2   t3_lclka                         0
3  t3_1jdgdj                         0
4   t3_t2qt4                         0


In [8]:
print 'Complete'

Complete
