In [89]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error

### Step 1: Read the train, test and sample submission datasets

In [2]:
# Read the training rows, the test rows and the submission template, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train.csv",
        "../data/test.csv",
        "../data/Sample_Submission_Tm9Lura.csv",
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Combine train and test to do encoding of categorical variables

In [3]:
# Stack train on top of test so both are encoded together by the later cells.
# Training rows come first, which is what the later `[:train.shape[0]]` slices rely on.
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept here because the following cells refer to it.
frames = [train, test]
input = pd.concat(frames)

# A parenthesised single-argument print runs on Python 3 and prints the same
# text on Python 2.
print(input.shape)
input.head()

(783667, 12)


Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
0,0-17,A,F,0,10,3,,,P00069042,8370,2,1000001
1,0-17,A,F,0,10,1,6.0,14.0,P00248942,15200,2,1000001
2,0-17,A,F,0,10,12,,,P00087842,1422,2,1000001
3,0-17,A,F,0,10,12,14.0,,P00085442,1057,2,1000001
4,55+,C,M,0,16,8,,,P00285442,7969,4+,1000002


In [4]:
# Inspect the column dtypes of the stacked train+test frame.
input.dtypes

Age                            object
City_Category                  object
Gender                         object
Marital_Status                  int64
Occupation                      int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Product_ID                     object
Purchase                      float64
Stay_In_Current_City_Years     object
User_ID                         int64
dtype: object

In [5]:
# Replace every missing value with the sentinel 999 (positive 999, not -999), in place.
# Purchase is still a column of `input` at this point, so any missing Purchase
# values are filled with 999 as well.

input.fillna(999, inplace=True)

In [6]:
# Preview the frame again to confirm the missing values were replaced.
input.head()

Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
0,0-17,A,F,0,10,3,999,999,P00069042,8370,2,1000001
1,0-17,A,F,0,10,1,6,14,P00248942,15200,2,1000001
2,0-17,A,F,0,10,12,999,999,P00087842,1422,2,1000001
3,0-17,A,F,0,10,12,14,999,P00085442,1057,2,1000001
4,55+,C,M,0,16,8,999,999,P00285442,7969,4+,1000002


In [7]:
# Take the Purchase column of the stacked frame as the regression target.
target = input["Purchase"]

In [8]:
# Convert the target to a NumPy array; later cells slice it by position.
target = np.array(target)

In [9]:
# Remove the Purchase column from the feature frame, in place. The target was
# already extracted from it, so the remaining columns are features only.
input.drop(["Purchase"], axis=1, inplace=True)

In [10]:
# Show the remaining column names and their dtypes on one line.
# %-formatting produces the same space-separated text as the Python 2 print
# statement while also being valid Python 3.
print("%s %s" % (input.columns, input.dtypes))

Index([u'Age', u'City_Category', u'Gender', u'Marital_Status', u'Occupation',
       u'Product_Category_1', u'Product_Category_2', u'Product_Category_3',
       u'Product_ID', u'Stay_In_Current_City_Years', u'User_ID'],
      dtype='object') Age                            object
City_Category                  object
Gender                         object
Marital_Status                  int64
Occupation                      int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Product_ID                     object
Stay_In_Current_City_Years     object
User_ID                         int64
dtype: object


In [11]:
# Convert every cell to its string form so each column can be label-encoded as
# a categorical. applymap calls str() element by element, so float values keep
# their decimal part in the text (e.g. 6.0 becomes "6.0").
input = input.applymap(str)
# Display the dtypes to confirm every column is now `object`.
input.dtypes

Age                           object
City_Category                 object
Gender                        object
Marital_Status                object
Occupation                    object
Product_Category_1            object
Product_Category_2            object
Product_Category_3            object
Product_ID                    object
Stay_In_Current_City_Years    object
User_ID                       object
dtype: object

In [12]:
# Keep a copy of the all-string pandas DataFrame before `input` is replaced by
# a NumPy array in the next step. Will be useful later on.
input_pd = input.copy()

In [14]:
# Label-encode each column: every distinct string in a column is mapped to an
# integer code, and the codes are written back over that column.

input = np.array(input)
n_columns = input.shape[1]

for col in range(n_columns):
    # A fresh encoder per column, fitted on that column's values only.
    lbl = preprocessing.LabelEncoder().fit(list(input[:, col]))
    input[:, col] = lbl.transform(input[:, col])

In [18]:
# Cast the label-encoded matrix to an integer dtype.
input = input.astype(int)

### First Model: `xgboost`

In [None]:
# xgboost settings for the first model: depth-6 trees, 3000 boosting rounds.
params = {
    "min_child_weight": 10,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 0.8,
    "silent": 1,
    "max_depth": 6,
    "nthread": 6,
    # "gamma": 1,
    "objective": "reg:linear",
    "eta": 0.1,
    "base_score": 1800,
    "eval_metric": "rmse",
    "seed": 0,
}

# xgb.train is given the parameters as a list of (name, value) pairs.
plst = list(params.items())
num_rounds = 3000

In [None]:
# Fit on the training rows only: the first len(train) rows of the stacked matrix.
train_rows = slice(0, train.shape[0])
xgtrain = xgb.DMatrix(input[train_rows, :], label=target[train_rows])
# `watchlist` is built here but not handed to xgb.train below.
watchlist = [(xgtrain, 'train')]
model_1_xgboost = xgb.train(plst, xgtrain, num_rounds)

In [None]:
# Score the test rows: everything after the first len(train) rows of the stacked matrix.
model_1_predict = model_1_xgboost.predict(xgb.DMatrix(input[train.shape[0]:,:]))
# Negative predictions are overwritten with the constant 25.
model_1_predict[model_1_predict<0] = 25
# Item assignment always writes the "Purchase" column. Attribute assignment
# (submission.Purchase = ...) only does so when that column already exists;
# otherwise it just sets a Python attribute that to_csv never writes out.
submission["Purchase"] = model_1_predict
submission.to_csv("../submission/submit_13.csv", index=False)

# 12 has been best so far. Avg of 4,5, 7 is 8. 0.75 times 8 and 0.25 times 11
# 13 has 2493. 6/3000 model
# 14: 2476 0.8 12 and 0.2 13
# 15:3400 
# 16: 2511 8/1450
# 17: 2513 8/1750
# 18: 2477 0.75 of 14 and 0.25 of 16
# 19: 2480 0.6 of 14 and 0.4 of 16
# 20: 2900. ET 
# 21: stacking: 2628
# 22: 21 and 14 (0.25, 0.75) : 2480

### Second Model

In [None]:
# Feed the integer label codes to a TF-IDF transformer as if they were counts.
encoded_int = input.astype(int)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(encoded_int)

In [None]:
# Display the transformer output to inspect its type and shape.
tfidf

In [None]:
# Materialise the TF-IDF result as a dense NumPy array; later cells slice it by row.
input_tfidf = tfidf.toarray()

In [None]:
# xgboost settings for the second model: depth-12 trees, 1800 boosting rounds.
params = {
    "min_child_weight": 10,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 0.8,
    "silent": 1,
    "max_depth": 12,
    "nthread": 6,
    # "gamma": 1,
    "objective": "reg:linear",
    "eta": 0.1,
    "base_score": 1800,
    "eval_metric": "rmse",
    "seed": 0,
}

# xgb.train is given the parameters as a list of (name, value) pairs.
plst = list(params.items())
num_rounds = 1800

In [None]:
# Fit on the TF-IDF features of the training rows only.
train_rows = slice(0, train.shape[0])
xgtrain = xgb.DMatrix(input_tfidf[train_rows, :], label=target[train_rows])
# `watchlist` is built here but not handed to xgb.train below.
watchlist = [(xgtrain, 'train')]
# The booster is stored under the same name as the first model and replaces it.
model_1_xgboost = xgb.train(plst, xgtrain, num_rounds)

In [None]:
# Score the test rows of the TF-IDF matrix: everything after the first len(train) rows.
model_1_predict = model_1_xgboost.predict(xgb.DMatrix(input_tfidf[train.shape[0]:,:]))
# Negative predictions are overwritten with the constant 25.
model_1_predict[model_1_predict<0] = 25
# Item assignment always writes the "Purchase" column. Attribute assignment
# (submission.Purchase = ...) only does so when that column already exists;
# otherwise it just sets a Python attribute that to_csv never writes out.
submission["Purchase"] = model_1_predict
submission.to_csv("../submission/submit_15.csv", index=False)

### Third model

In [None]:
# Place the label-encoded columns and their TF-IDF columns side by side.
input_w_tfidf = np.hstack((input, input_tfidf))

In [None]:
# Third model: xgboost on the label-encoded columns plus their TF-IDF columns,
# with depth-8 trees and 1750 boosting rounds.
params = {
    "min_child_weight": 10,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 0.8,
    "silent": 1,
    "max_depth": 8,
    "nthread": 6,
    # "gamma": 1,
    "objective": "reg:linear",
    "eta": 0.1,
    "base_score": 1800,
    "eval_metric": "rmse",
    "seed": 0,
}

# xgb.train is given the parameters as a list of (name, value) pairs.
plst = list(params.items())
num_rounds = 1750

# Fit on the training rows only: the first len(train) rows of the stacked matrix.
xgtrain = xgb.DMatrix(input_w_tfidf[:train.shape[0],:], label=target[:train.shape[0]])
# `watchlist` is built here but not handed to xgb.train below.
watchlist = [(xgtrain, 'train')]
model_1_xgboost = xgb.train(plst, xgtrain, num_rounds)

# Score the remaining (test) rows; negative predictions are overwritten with 25.
model_1_predict = model_1_xgboost.predict(xgb.DMatrix(input_w_tfidf[train.shape[0]:,:]))
model_1_predict[model_1_predict<0] = 25
# Item assignment always writes the "Purchase" column. Attribute assignment
# (submission.Purchase = ...) only does so when that column already exists;
# otherwise it just sets a Python attribute that to_csv never writes out.
submission["Purchase"] = model_1_predict
submission.to_csv("../submission/submit_17.csv", index=False)

In [None]:
# Compare the full training shape with the shape of the Marital_Status == 0 subset.
# %-formatting produces the same space-separated text as the Python 2 print
# statement while also being valid Python 3.
print("%s %s" % (train.shape, train[train.Marital_Status==0].shape))

### Fourth Model

In [None]:
# Fourth model: extra-trees regressor on the label-encoded + TF-IDF features.
model_4_et = ExtraTreesRegressor(
    n_estimators=1450,
    max_depth=8,
    min_samples_split=10,
    min_samples_leaf=10,
    bootstrap=True,
    oob_score=True,
    n_jobs=6,
    random_state=123,
    verbose=1,
)
# Fit on the training rows only: the first len(train) rows of the stacked matrix.
train_rows = slice(0, train.shape[0])
model_4_et.fit(input_w_tfidf[train_rows, :], target[train_rows])

In [None]:
# Predict for the test rows: everything after the first len(train) rows of the stacked matrix.
model_4_predict = model_4_et.predict(input_w_tfidf[train.shape[0]:,:])

In [None]:
# The negative-prediction override used for the xgboost models is disabled here.
#model_4_predict[model_4_predict<0] = 25
# Item assignment always writes the "Purchase" column. Attribute assignment
# (submission.Purchase = ...) only does so when that column already exists;
# otherwise it just sets a Python attribute that to_csv never writes out.
submission["Purchase"] = model_4_predict
submission.to_csv("../submission/submit_20.csv", index=False)

In [None]:
# This gets 2900 on LB