## Random Acts Of Pizza
### Applying Topic Modelling On Combined Data

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [36]:
from pylab import rcParams
rcParams['figure.figsize'] = 20,10

In [80]:
data_train = pd.read_json('../../Dataset/Random Acts Of Pizza/train.json')
data_test = pd.read_json('../../Dataset/Random Acts Of Pizza/test.json')
y = data_train.pop('requester_received_pizza')
request_id = data_test['request_id']

In [38]:
not_present = []
for i in data_train.columns:
    if i not in data_test.columns:
        not_present.append(i)
data_train.drop(labels=not_present,axis=1,inplace=True)

In [103]:
## Combining the training and testing data

data = pd.concat([data_train,data_test])
data.shape

(5671, 31)

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5671 entries, 0 to 1630
Data columns (total 17 columns):
giver_username_if_known                               5671 non-null object
request_id                                            5671 non-null object
request_text_edit_aware                               5671 non-null object
request_title                                         5671 non-null object
requester_account_age_in_days_at_request              5671 non-null float64
requester_days_since_first_post_on_raop_at_request    5671 non-null float64
requester_number_of_comments_at_request               5671 non-null int64
requester_number_of_comments_in_raop_at_request       5671 non-null int64
requester_number_of_posts_at_request                  5671 non-null int64
requester_number_of_posts_on_raop_at_request          5671 non-null int64
requester_number_of_subreddits_at_request             5671 non-null int64
requester_subreddits_at_request                       5671 non-null obj

In [41]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20


def applyLDA(data_samples):
    
    # Use tf-idf features for NMF.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    
    # Fit the NMF model
    print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
    lda.fit(tf)
    return lda.transform(tf)

In [42]:
def applyNMF(data_samples):
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(data_samples)
    print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
    return nmf.transform(tfidf)

In [43]:
parent_data = data.copy()

In [44]:
from nltk.corpus import stopwords

In [45]:
## Removing stopwords from the comments

data['request_text_edit_aware'] = map(lambda x: ' '.join([i for i in x.lower().split(' ') if i not in stopwords.words('english')]),\
    data['request_text_edit_aware'])

In [46]:
## Applying Non Negative Matrix Factorisation on Request Posts

topics = applyNMF(data['request_text_edit_aware'])
print(topics.shape)

Extracting tf-idf features for NMF...
Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...
(5671, 10)


In [47]:
t = np.asarray([np.argmax(row) for row in topics])

In [48]:
data['topics'] = t

In [49]:
## Changing the "giver_username" column to 0/1

data.giver_username_if_known = data.giver_username_if_known.map({'N/A':0})
data.giver_username_if_known.fillna(1,inplace=True)

In [50]:
data.giver_username_if_known.unique()

array([ 0.,  1.])

In [51]:
## Applying Non Negative Matrix Factorisation on Request Title

title_topics = applyNMF(parent_data['request_title'])

Extracting tf-idf features for NMF...
Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...


In [52]:
title_topics_max = np.asarray([np.argmax(i) for i in title_topics])

In [53]:
data['request_title'] = title_topics_max

In [54]:
## Adding the length of the request as the feature

data['request_length'] = [len(x) for x in parent_data['request_text_edit_aware']]

In [55]:
subreddits = map(lambda x: ' '.join(x),parent_data['requester_subreddits_at_request'])

In [56]:
## Applying NMF on Subreddits

subreddits_topics = applyNMF(subreddits)

Extracting tf-idf features for NMF...
Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...


In [57]:
t = np.asarray([np.argmax(row) for row in subreddits_topics])

In [58]:
data['subreddit_topics'] = t

In [59]:
count = dict()
for i in range(parent_data.shape[0]):
    if parent_data.iloc[i,-7] == True:
        for subreddit in parent_data.iloc[i,-6]:
            try:
                count[subreddit] += 1
            except:
                count[subreddit] = 1

In [60]:
count_series = pd.Series(count.values(),index = count.keys())
count_series.sort_values(ascending=False,inplace=True)

In [61]:
subreddits = set(count_series[count_series > 300].index)

In [62]:
## Counting the important subreddit and checking their presence

data['subreddit_count'] = map(lambda x: len(set(x).intersection(subreddits)),\
                              parent_data['requester_subreddits_at_request'])

In [63]:
data.drop(labels=[i for i in data.columns if data[i].dtype == 'object'],axis=1,inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5671 entries, 0 to 1630
Data columns (total 17 columns):
giver_username_if_known                               5671 non-null float64
request_title                                         5671 non-null int64
requester_account_age_in_days_at_request              5671 non-null float64
requester_days_since_first_post_on_raop_at_request    5671 non-null float64
requester_number_of_comments_at_request               5671 non-null int64
requester_number_of_comments_in_raop_at_request       5671 non-null int64
requester_number_of_posts_at_request                  5671 non-null int64
requester_number_of_posts_on_raop_at_request          5671 non-null int64
requester_number_of_subreddits_at_request             5671 non-null int64
requester_upvotes_minus_downvotes_at_request          5671 non-null int64
requester_upvotes_plus_downvotes_at_request           5671 non-null int64
unix_timestamp_of_request                             5671 non-null int64

In [72]:
X = data.iloc[0:y.shape[0],:]
print X.shape[0] == data_train.shape[0]

True


In [94]:
test = data.iloc[y.shape[0]:,:]
print test.shape[0] == data_test.shape[0]

True


In [81]:
y = y.map({False:0,True:1})
y.head()

0    0
1    0
2    0
3    0
4    0
Name: requester_received_pizza, dtype: int64

In [92]:
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import roc_auc_score,accuracy_score

In [83]:
rfc = RandomForestClassifier(n_estimators=100)
xgb = XGBClassifier()

In [84]:
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=7)

In [85]:
print X_train.shape,y_train.shape

(2828, 17) (2828,)


In [89]:
rfc.fit(X_train,y_train)
xgb.set_params(max_depth=10,n_estimators=200)
xgb.fit(X_train,y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [90]:
test_rfc = rfc.predict_proba(X_test)
test_xgb  = xgb.predict_proba(X_test)
print test_rfc.shape,test_xgb.shape

(1212, 2) (1212, 2)


In [93]:
max_score = 0
xgb_weight,rfc_weight=0,0
for i in np.arange(0.9,0.1,-0.1):
    score = roc_auc_score(y_test,(i*test_xgb+(1-i)*test_rfc)[:,1])
    if score > max_score:
        max_score = score
        xgb_weight = i
        rfc_weight = 1-i
print 'Maximum score: {}'.format(max_score)
print 'XGB Weight: {}'.format(xgb_weight)
print 'RFC Weight: {}'.format(rfc_weight)

Maximum score: 0.763074619379
XGB Weight: 0.2
RFC Weight: 0.8


In [95]:
test_rfc = rfc.predict_proba(test)
test_xgb  = xgb.predict_proba(test)

In [99]:
testPred = np.asarray([np.argmax(row) for row in ((xgb_weight*test_xgb+(rfc_weight)*test_rfc))])
print testPred.shape

(1631,)


In [100]:
import csv as csv

In [102]:
fp = open("submission.csv",'wb')
p = csv.writer(fp)
p.writerow(['request_id','requester_received_pizza'])
for i in range(len(testPred)):
    p.writerow([request_id[i],testPred[i]])