In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error

### Step 1: Read the train, test and sample submission datasets

In [2]:
train      = pd.read_csv("../data/train.csv")
test       = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/Sample_Submission_Tm9Lura.csv")

In [3]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
# Combine train and test to do encoding of categorical variables

In [5]:
frames = [train, test]
input = pd.concat(frames)

print input.shape
input.head()

(783667, 12)


Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
0,0-17,A,F,0,10,3,,,P00069042,8370,2,1000001
1,0-17,A,F,0,10,1,6.0,14.0,P00248942,15200,2,1000001
2,0-17,A,F,0,10,12,,,P00087842,1422,2,1000001
3,0-17,A,F,0,10,12,14.0,,P00085442,1057,2,1000001
4,55+,C,M,0,16,8,,,P00285442,7969,4+,1000002


In [6]:
input.dtypes

Age                            object
City_Category                  object
Gender                         object
Marital_Status                  int64
Occupation                      int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Product_ID                     object
Purchase                      float64
Stay_In_Current_City_Years     object
User_ID                         int64
dtype: object

In [7]:
#Replace missing values with -999

input.fillna(999, inplace=True)

In [8]:
input.head()

Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
0,0-17,A,F,0,10,3,999,999,P00069042,8370,2,1000001
1,0-17,A,F,0,10,1,6,14,P00248942,15200,2,1000001
2,0-17,A,F,0,10,12,999,999,P00087842,1422,2,1000001
3,0-17,A,F,0,10,12,14,999,P00085442,1057,2,1000001
4,55+,C,M,0,16,8,999,999,P00285442,7969,4+,1000002


In [9]:
#Create target column
target = input.Purchase

In [10]:
target = np.array(target)

In [11]:
#Drop purchase from input
input.drop(["Purchase"], axis=1, inplace=True)

In [12]:
print input.columns, input.dtypes

Index([u'Age', u'City_Category', u'Gender', u'Marital_Status', u'Occupation',
       u'Product_Category_1', u'Product_Category_2', u'Product_Category_3',
       u'Product_ID', u'Stay_In_Current_City_Years', u'User_ID'],
      dtype='object') Age                            object
City_Category                  object
Gender                         object
Marital_Status                  int64
Occupation                      int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Product_ID                     object
Stay_In_Current_City_Years     object
User_ID                         int64
dtype: object


In [13]:
#Convert all the columns to string 
input = input.applymap(str)
input.dtypes

Age                           object
City_Category                 object
Gender                        object
Marital_Status                object
Occupation                    object
Product_Category_1            object
Product_Category_2            object
Product_Category_3            object
Product_ID                    object
Stay_In_Current_City_Years    object
User_ID                       object
dtype: object

In [14]:
# Have a copy of the pandas dataframe. Will be useful later on
input_pd = input.copy()

In [15]:
#Convert categorical to numeric using LabelEncoder

input = np.array(input)

for i in range(input.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(input[:,i]))
    input[:, i] = lbl.transform(input[:, i])

In [16]:
input = input.astype(int)

### Model 5: Stacking

In [17]:
# Split dataset into two. First level models to create meta features to feed into a second level model

In [18]:
first_stage_rows = np.random.randint(train.shape[0], size = np.int(train.shape[0]/2))

In [19]:
train_np   = input[:train.shape[0], :]
target_np  = target[:train.shape[0]]
train_fs   = train_np[first_stage_rows, :]
target_fs  = target_np[first_stage_rows]
train_ss   = train_np[-first_stage_rows, :]
target_ss  = target_np[-first_stage_rows]

In [20]:
print train_fs.shape, target_fs.shape, train_ss.shape, target_ss.shape

(275034, 11) (275034,) (275034, 11) (275034,)


In [21]:
train_fs

array([[   3,    0,    0, ..., 2242,    3, 5501],
       [   2,    1,    1, ..., 2612,    2, 3938],
       [   3,    2,    0, ...,   49,    0, 2579],
       ..., 
       [   1,    2,    1, ..., 2344,    2,  852],
       [   2,    1,    1, ..., 1355,    2, 3030],
       [   1,    2,    1, ..., 1606,    1,  224]])

In [22]:
train_ss

array([[   1,    1,    0, ..., 2491,    2, 1086],
       [   2,    2,    1, ..., 1887,    2, 2574],
       [   0,    2,    1, ...,  370,    1, 3997],
       ..., 
       [   3,    1,    1, ..., 1788,    1, 5731],
       [   2,    2,    1, ...,  251,    3, 3619],
       [   1,    0,    1, ...,  172,    0,  512]])

In [23]:
xgtrain = xgb.DMatrix(train_fs, label=target_fs)
watchlist = [(xgtrain, 'train')]

# Model 1: 6/3000

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 6
params["nthread"] = 6
#params["gamma"] = 1
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 3000

model_1 = xgb.train(plst, xgtrain, num_rounds)

# Model 2: 8/1420

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 8
params["nthread"] = 6
#params["gamma"] = 1
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 1420

model_2 = xgb.train(plst, xgtrain, num_rounds)

# Model 3: 10/1200

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 10
params["nthread"] = 6
#params["gamma"] = 1
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 1200

model_3 = xgb.train(plst, xgtrain, num_rounds)

# Model 4: 12/800

params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 12
params["nthread"] = 6
#params["gamma"] = 1
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 800

model_4 = xgb.train(plst, xgtrain, num_rounds)

In [24]:
# This set of models will be ExtraTrees

# Model 5: 8/1450

model_5 = ExtraTreesRegressor(n_estimators=1450, 
                              max_depth=8,
                              min_samples_split=10, 
                              min_samples_leaf=10, 
                              oob_score=True, 
                              n_jobs=6, 
                              random_state=123, 
                              verbose=1, 
                              bootstrap=True)
model_5.fit(train_fs, target_fs)

# Model 6: 6/3000

model_6 = ExtraTreesRegressor(n_estimators=3000, 
                              max_depth=6,
                              min_samples_split=10, 
                              min_samples_leaf=10, 
                              oob_score=True, 
                              n_jobs=6, 
                              random_state=123, 
                              verbose=1, 
                              bootstrap=True)
model_6.fit(train_fs, target_fs)

# Model 7: 12/800

model_7 = ExtraTreesRegressor(n_estimators=800, 
                              max_depth=12,
                              min_samples_split=10, 
                              min_samples_leaf=10, 
                              oob_score=True, 
                              n_jobs=6, 
                              random_state=123, 
                              verbose=1, 
                              bootstrap=True)
model_7.fit(train_fs, target_fs)

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   22.9s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:   52.4s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done 1450 out of 1450 | elapsed:  2.9min finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    3.9s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   18.2s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:   42.3s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  2.9min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:  3.9min
[Parallel(n_jobs=6)]: Done 3000 out of 3000 | elapsed:  4.7min finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   26.0

ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=12,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=10,
          min_samples_split=10, min_weight_fraction_leaf=0.0,
          n_estimators=800, n_jobs=6, oob_score=True, random_state=123,
          verbose=1, warm_start=False)

In [25]:
# This set of models will be RandomForest

# Model 8: 6/3000
model_8 = RandomForestRegressor(n_estimators=3000, max_depth=6, oob_score=True, n_jobs=6, random_state=123, min_samples_split=10, min_samples_leaf=10)
model_8.fit(train_fs, target_fs)

# Model 9: 8/1500
model_9 = RandomForestRegressor(n_estimators=1500, max_depth=8, oob_score=True, n_jobs=6, random_state=123, min_samples_split=10, min_samples_leaf=10)
model_9.fit(train_fs, target_fs)

# Model 10: 12/800
model_10 = RandomForestRegressor(n_estimators=800, max_depth=12, oob_score=True, n_jobs=6, random_state=123, min_samples_split=10, min_samples_leaf=10)
model_10.fit(train_fs, target_fs)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=10,
           min_samples_split=10, min_weight_fraction_leaf=0.0,
           n_estimators=800, n_jobs=6, oob_score=True, random_state=123,
           verbose=0, warm_start=False)

In [26]:
model_1_predict = model_1.predict(xgb.DMatrix(train_ss))
model_2_predict = model_2.predict(xgb.DMatrix(train_ss))
model_3_predict = model_3.predict(xgb.DMatrix(train_ss))
model_4_predict = model_4.predict(xgb.DMatrix(train_ss))
model_5_predict = model_5.predict(train_ss)
model_6_predict = model_6.predict(train_ss)
model_7_predict = model_7.predict(train_ss)
model_8_predict = model_8.predict(train_ss)
model_9_predict = model_9.predict(train_ss)
model_10_predict = model_10.predict(train_ss)

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.8s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    2.0s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    3.8s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 1450 out of 1450 | elapsed:    6.9s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.5s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    1.3s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    2.5s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    4.1s
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:    6.1s
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:    8.9s
[Parallel(n_jobs=6)]: Done 3000 out of 3000 | elapsed:   13.2s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.3s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    1.2

In [27]:
train_ss_w_meta = np.concatenate((train_ss, np.vstack((model_1_predict, model_2_predict, model_3_predict, 
                                                       model_4_predict, model_5_predict,
              model_6_predict, model_7_predict, model_8_predict, model_9_predict, model_10_predict)).T), axis=1)

In [28]:
# Second stage model with meta features

In [29]:
kfolds = KFold(train_ss_w_meta.shape[0], n_folds=5)

In [32]:
params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 8
params["nthread"] = 6
#params["gamma"] = 1
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 1400

In [34]:
for train_index, validation_index in kfolds:
    
    train_X, validation_X = train_ss_w_meta[train_index, :], train_ss_w_meta[validation_index, :]
    train_y, validation_y = target_ss[train_index], target_ss[validation_index]
    
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    watchlist = [(xgtrain, 'train')]
    model_cv_xgboost = xgb.train(plst, xgtrain, num_rounds)
    model_cv_predict = model_cv_xgboost.predict(xgb.DMatrix(validation_X))
    print np.sqrt(mean_squared_error(validation_y, model_cv_predict))

2033.81508009
2021.79506615
2022.45656832
2016.65021891
2037.1127153


In [40]:
# RMSE is around 2050.

In [41]:
# Training second stage model on all the second stage data now

In [35]:
xgtrain = xgb.DMatrix(train_ss_w_meta, label=target_ss)
watchlist = [(xgtrain, 'train')]
model_ss_xgboost = xgb.train(plst, xgtrain, num_rounds)

### Creating final prediction on test dataset

In [36]:
model_1_predict = model_1.predict(xgb.DMatrix(input[train.shape[0]:, :]))
model_2_predict = model_2.predict(xgb.DMatrix(input[train.shape[0]:, :]))
model_3_predict = model_3.predict(xgb.DMatrix(input[train.shape[0]:, :]))
model_4_predict = model_4.predict(xgb.DMatrix(input[train.shape[0]:, :]))
model_5_predict = model_5.predict(input[train.shape[0]:, :])
model_6_predict = model_6.predict(input[train.shape[0]:, :])
model_7_predict = model_7.predict(input[train.shape[0]:, :])
model_8_predict = model_8.predict(input[train.shape[0]:, :])
model_9_predict = model_9.predict(input[train.shape[0]:, :])
model_10_predict = model_10.predict(input[train.shape[0]:, :])

test_ss_w_meta = np.concatenate((input[train.shape[0]:, :], np.vstack((model_1_predict, model_2_predict, model_3_predict, 
                                                       model_4_predict, model_5_predict,
              model_6_predict, model_7_predict, model_8_predict, model_9_predict, model_10_predict)).T), axis=1)

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.5s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    1.3s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    2.4s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    3.9s
[Parallel(n_jobs=6)]: Done 1450 out of 1450 | elapsed:    4.5s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.4s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    0.9s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    1.7s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    2.8s
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:    4.1s
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 3000 out of 3000 | elapsed:    7.6s finished
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.9

In [37]:
model_ss_predict = model_ss_xgboost.predict(xgb.DMatrix(test_ss_w_meta))

In [38]:
np.max(model_ss_predict), np.min(model_ss_predict)

(24907.703, -348.80347)

In [39]:
submission.Purchase = model_ss_predict

In [52]:
submission.to_csv("../submission/submit_23.csv", index=False)

array([-348.8034668], dtype=float32)