## Random Acts Of Pizza
### Applying Topic Modelling On Combined Data

In [1]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Populating the interactive namespace from numpy and matplotlib


In [2]:
from pylab import rcParams
rcParams['figure.figsize'] = 20,10

In [3]:
data_train = pd.read_json('../../Dataset/Random Acts Of Pizza/train.json')
data_test = pd.read_json('../../Dataset/Random Acts Of Pizza/test.json')

y = data_train.pop('requester_received_pizza')
request_id = data_test['request_id']

In [4]:
not_present = []
for i in data_train.columns:
    if i not in data_test.columns:
        not_present.append(i)
data_train.drop(labels=not_present,axis=1,inplace=True)

In [5]:
## Combining the training and testing data

data = pd.concat([data_train,data_test])
data_copy = data.copy()
data.shape

(5671, 17)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5671 entries, 0 to 1630
Data columns (total 17 columns):
giver_username_if_known                               5671 non-null object
request_id                                            5671 non-null object
request_text_edit_aware                               5671 non-null object
request_title                                         5671 non-null object
requester_account_age_in_days_at_request              5671 non-null float64
requester_days_since_first_post_on_raop_at_request    5671 non-null float64
requester_number_of_comments_at_request               5671 non-null int64
requester_number_of_comments_in_raop_at_request       5671 non-null int64
requester_number_of_posts_at_request                  5671 non-null int64
requester_number_of_posts_on_raop_at_request          5671 non-null int64
requester_number_of_subreddits_at_request             5671 non-null int64
requester_subreddits_at_request                       5671 non-null obj

In [7]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

n_samples = 2000
n_features = 1000
n_topics = 20
n_top_words = 20


def applyLDA(data_samples):
    
    # Use tf-idf features for NMF.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=1.0, min_df=1, max_features=n_features,
                                stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    
    # Fit the NMF model
    print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
    lda.fit(tf)
    return lda.transform(tf)

In [8]:
def applyNMF(data_samples):
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0,min_df=1,stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(data_samples)
    print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
    return nmf.transform(tfidf)

In [9]:
parent_data = data.copy()

In [10]:
## Applying Non Negative Matrix Factorisation on Request Posts

topics = applyNMF(data['request_text_edit_aware'])
print(topics.shape)

Extracting tf-idf features for NMF...
Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...
(5671, 20)


In [11]:
for i in range(topics.shape[1]):
    data['request_text_topic_{}'.format(i)] = topics[:,i]
data.head()

Unnamed: 0,giver_username_if_known,request_id,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,requester_days_since_first_post_on_raop_at_request,requester_number_of_comments_at_request,requester_number_of_comments_in_raop_at_request,requester_number_of_posts_at_request,requester_number_of_posts_on_raop_at_request,...,request_text_topic_10,request_text_topic_11,request_text_topic_12,request_text_topic_13,request_text_topic_14,request_text_topic_15,request_text_topic_16,request_text_topic_17,request_text_topic_18,request_text_topic_19
0,,t3_l25d7,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.010389,0.0,0.0,0.0,0.013289,0.0,0.0
1,,t3_rcb83,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",501.1111,0.0,0,0,15,0,...,0.0,0.0,0.0,0.0,0.135888,0.0,0.0,0.0,0.0,0.024505
2,,t3_lpu5j,My girlfriend decided it would be a good idea ...,"[Request] Hungry couple in Dundee, Scotland wo...",0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,t3_mxvj3,"It's cold, I'n hungry, and to be completely ho...","[Request] In Canada (Ontario), just got home f...",6.518438,0.0,36,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009386
4,,t3_1i6486,hey guys:\n I love this sub. I think it's grea...,[Request] Old friend coming to visit. Would LO...,162.063252,101.606505,140,2,14,0,...,0.0,0.0,0.0,0.034902,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
## Changing the "giver_username" column to 0/1

data.giver_username_if_known = data.giver_username_if_known.map({'N/A':0})
data.giver_username_if_known.fillna(1,inplace=True)

In [13]:
data.giver_username_if_known.unique()

array([ 0.,  1.])

In [14]:
## Adding the length of the request as the feature

data['request_length'] = [len(x) for x in parent_data['request_text_edit_aware']]

In [15]:
subreddits = map(lambda x: ' '.join(x),parent_data['requester_subreddits_at_request'])

In [16]:
## Applying NMF on Subreddits

subreddits_topics = applyNMF(subreddits)

Extracting tf-idf features for NMF...
Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...


In [17]:
for i in range(subreddits_topics.shape[1]):
    data['subreddit_topics_{}'.format(i)] = subreddits_topics[:,i]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5671 entries, 0 to 1630
Data columns (total 58 columns):
giver_username_if_known                               5671 non-null float64
request_id                                            5671 non-null object
request_text_edit_aware                               5671 non-null object
request_title                                         5671 non-null object
requester_account_age_in_days_at_request              5671 non-null float64
requester_days_since_first_post_on_raop_at_request    5671 non-null float64
requester_number_of_comments_at_request               5671 non-null int64
requester_number_of_comments_in_raop_at_request       5671 non-null int64
requester_number_of_posts_at_request                  5671 non-null int64
requester_number_of_posts_on_raop_at_request          5671 non-null int64
requester_number_of_subreddits_at_request             5671 non-null int64
requester_subreddits_at_request                       5671 non-null ob

In [18]:
count = dict()
for i in range(parent_data.shape[0]):
    if parent_data.iloc[i,-7] == True:
        for subreddit in parent_data.iloc[i,-6]:
            try:
                count[subreddit] += 1
            except:
                count[subreddit] = 1

In [19]:
count_series = pd.Series(count.values(),index = count.keys())
count_series.sort_values(ascending=False,inplace=True)

In [20]:
subreddits = set(count_series[count_series > 300].index)

In [21]:
## Counting the important subreddit and checking their presence

data['subreddit_count'] = map(lambda x: len(set(x).intersection(subreddits)),\
                              parent_data['requester_subreddits_at_request'])

In [22]:
data.drop(labels=[i for i in data.columns if data[i].dtype == 'object'],axis=1,inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5671 entries, 0 to 1630
Data columns (total 54 columns):
giver_username_if_known                               5671 non-null float64
requester_account_age_in_days_at_request              5671 non-null float64
requester_days_since_first_post_on_raop_at_request    5671 non-null float64
requester_number_of_comments_at_request               5671 non-null int64
requester_number_of_comments_in_raop_at_request       5671 non-null int64
requester_number_of_posts_at_request                  5671 non-null int64
requester_number_of_posts_on_raop_at_request          5671 non-null int64
requester_number_of_subreddits_at_request             5671 non-null int64
requester_upvotes_minus_downvotes_at_request          5671 non-null int64
requester_upvotes_plus_downvotes_at_request           5671 non-null int64
unix_timestamp_of_request                             5671 non-null int64
unix_timestamp_of_request_utc                         5671 non-null int64

In [23]:
X = data.iloc[0:y.shape[0],:]
print X.shape[0] == data_train.shape[0]

True


In [24]:
test = data.iloc[y.shape[0]:,:]
print test.shape[0] == data_test.shape[0]

True


In [25]:
y = y.map({False:0,True:1})
y.head()

0    0
1    0
2    0
3    0
4    0
Name: requester_received_pizza, dtype: int64

In [27]:
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import roc_auc_score,accuracy_score

In [29]:
rfc = RandomForestClassifier(n_estimators=100)
xgb = XGBClassifier()

In [30]:
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=7)

In [31]:
print X_train.shape,y_train.shape

(2828, 54) (2828,)


In [32]:
rfc.fit(X_train,y_train)
xgb.set_params(max_depth=10,n_estimators=200)
xgb.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [33]:
test_rfc = rfc.predict_proba(X_test)
test_xgb  = xgb.predict_proba(X_test)
print test_rfc.shape

(1212, 2)


In [32]:
max_score = 0
xgb_weight,rfc_weight=0,0
for i in np.arange(0.9,0.1,-0.1):
    score = roc_auc_score(y_test,(i*test_xgb+(1-i)*test_rfc)[:,1])
    if score > max_score:
        max_score = score
        xgb_weight = i
        rfc_weight = 1-i
print 'Maximum score: {}'.format(max_score)
print 'XGB Weight: {}'.format(xgb_weight)
print 'RFC Weight: {}'.format(rfc_weight)

Maximum score: 0.779914876667
XGB Weight: 0.3
RFC Weight: 0.7


In [33]:
test_rfc = rfc.predict_proba(test)
test_xgb  = xgb.predict_proba(test)

In [34]:
testPred = np.asarray([row[1] for row in ((xgb_weight*test_xgb+(rfc_weight)*test_rfc))])
print testPred.shape

(1631,)


In [35]:
import csv as csv

In [36]:
fp = open("submission.csv",'wb')
p = csv.writer(fp)
p.writerow(['request_id','requester_received_pizza'])
for i in range(len(testPred)):
    p.writerow([request_id[i],testPred[i]])

Submitting with probabilities of 1 rather than classes can make a huge difference on the leaderboard score as the penalisatiion by error function will decrease. =)