In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the training data, the test data and the sample-submission template.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
submission = pd.read_csv('Sample_Submission_Tm9Lura.csv')

# Copy the raw identifiers into the submission now: the preprocessing cells
# further down overwrite both ID columns of `test` with encoded values.
for id_col in ('User_ID', 'Product_ID'):
    submission[id_col] = test[id_col]

### Check data

In [3]:
# Preview the first five raw training rows.
train.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
# Column dtypes and non-null counts, used to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [6]:
# Number of distinct users in the training set.
train['User_ID'].nunique(dropna=False)

5891

In [7]:
# Number of distinct products in the training set.
train['Product_ID'].nunique(dropna=False)

3631

In [8]:
# Number of distinct products in the test set.
test['Product_ID'].nunique(dropna=False)

3491

In [9]:
# Distinct Age values (stored as bracket labels, not numbers).
pd.unique(train['Age'])

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [10]:
# Distinct Occupation codes.
pd.unique(train['Occupation'])

array([10, 16, 15,  7, 20,  9,  1, 12, 17,  0,  3,  4, 11,  8, 19,  2, 18,
        5, 14, 13,  6])

In [11]:
# Distinct City_Category labels.
pd.unique(train['City_Category'])

array(['A', 'C', 'B'], dtype=object)

In [12]:
# Distinct Stay_In_Current_City_Years labels (strings, including '4+').
pd.unique(train['Stay_In_Current_City_Years'])

array(['2', '4+', '3', '1', '0'], dtype=object)

In [13]:
# Distinct Product_Category_1 codes.
pd.unique(train['Product_Category_1'])

array([ 3,  1, 12,  8,  5,  4,  2,  6, 14, 11, 13, 15,  7, 16, 18, 10, 17,
        9, 20, 19])

In [14]:
# Distinct Product_Category_2 codes; NaN marks rows without a second category.
pd.unique(train['Product_Category_2'])

array([nan,  6., 14.,  2.,  8., 15., 16., 11.,  5.,  3.,  4., 12.,  9.,
       10., 17., 13.,  7., 18.])

In [15]:
# Distinct Product_Category_3 codes; NaN marks rows without a third category.
pd.unique(train['Product_Category_3'])

array([nan, 14., 17.,  5.,  4., 16., 15.,  8.,  9., 13.,  6., 12.,  3.,
       18., 11., 10.])

### Preprocess Input Data

In [16]:
# User_ID preprocessing: drop the constant 1,000,000 offset (e.g. 1000002 -> 2),
# then map the remaining IDs to consecutive integer codes.
train['User_ID'] -= 1000000
test['User_ID'] -= 1000000

# The encoder learns its classes from train only and is reused for test, so
# both frames share one ID -> code mapping.
enc = LabelEncoder()
enc.fit(train['User_ID'])
train['User_ID'] = enc.transform(train['User_ID'])
test['User_ID'] = enc.transform(test['User_ID'])

In [17]:
# Product_ID preprocessing: strip the 'P00' prefix (e.g. P00069042 -> 69042),
# then standardize the numeric part.
train['Product_ID'] = train['Product_ID'].str.replace('P00', '')
test['Product_ID'] = test['Product_ID'].str.replace('P00', '')

# StandardScaler expects a 2-D numeric array. Series.reshape is deprecated in
# pandas (and removed in later releases), so reshape the underlying ndarray
# instead, and cast the digit strings to float explicitly rather than relying
# on the scaler to coerce an object column. ravel() turns the (n, 1) result
# back into a 1-D column. The scaler is fitted on train only and reused for
# test so both frames share the same mean/std.
scaler = StandardScaler()
train['Product_ID'] = scaler.fit_transform(
    train['Product_ID'].astype(float).values.reshape(-1, 1)).ravel()
test['Product_ID'] = scaler.transform(
    test['Product_ID'].astype(float).values.reshape(-1, 1)).ravel()

# LabelEncoder is intentionally not used for Product_ID: test contains IDs
# that are absent from train, and LabelEncoder.transform raises on labels it
# did not see during fit.

  
  import sys


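Note: the test set contains Product_ID values that never appear in the training set, so a LabelEncoder fitted on train cannot transform them. The numeric part of the ID is standardized instead.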

In [18]:
# Distinct training Product_ID values after preprocessing.
train['Product_ID'].nunique(dropna=False)

3631

In [19]:
# Distinct test Product_ID values after preprocessing.
test['Product_ID'].nunique(dropna=False)

3491

In [20]:
# Columns that are label-encoded (string categories).
cat_col = [
    'Gender',
    'City_Category',
]

# Columns that are standardized. Age and Stay_In_Current_City_Years are mapped
# to numbers first; User_ID, Product_ID and Marital_Status are in neither list.
num_col = [
    'Age',
    'Occupation',
    'Stay_In_Current_City_Years',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]

In [21]:
# Impute missing values with 0.
# In train only Product_Category_2/3 have nulls (see train.info()), and their
# observed codes start at 2 and 3, so 0 does not collide with a real category.
# NOTE(review): test is assumed to have the same missing-value pattern - confirm.
train, test = train.fillna(0), test.fillna(0)

In [22]:
# Modify age column: replace each age-bracket label with one representative
# numeric age so the column can be scaled like any other number.
age_to_years = {
    '0-17': 15,
    '18-25': 21,
    '26-35': 30,
    '36-45': 40,
    '46-50': 48,
    '51-55': 53,
    '55+': 60,
}

# Series.map yields NaN for any label that is missing from the dict.
train['Age'] = train['Age'].map(age_to_years)
test['Age'] = test['Age'].map(age_to_years)

In [23]:
# Modify Stay_In_Current_City_Years: convert the string labels to integers,
# with the open-ended '4+' bucket collapsed to 4.
stay_to_years = {'0': 0, '1': 1, '2': 2, '3': 3, '4+': 4}

# Series.map yields NaN for any label that is missing from the dict.
train['Stay_In_Current_City_Years'] = (
    train['Stay_In_Current_City_Years'].map(stay_to_years))
test['Stay_In_Current_City_Years'] = (
    test['Stay_In_Current_City_Years'].map(stay_to_years))

In [24]:
# Encode categorical columns as integer codes.
# A single encoder object is refitted per column on the training values and
# then applied to the matching test column, so both frames share one mapping.
encoder = LabelEncoder()

for cat_name in cat_col:
    encoder.fit(train[cat_name])
    train[cat_name] = encoder.transform(train[cat_name])
    test[cat_name] = encoder.transform(test[cat_name])

In [25]:
# Scale numerical columns to zero mean / unit variance.
# The scaler is refitted per column on the training values and then applied
# to the matching test column, so test is scaled with train's statistics.
scaler = StandardScaler()

for col in num_col:
    # StandardScaler needs a 2-D array. Series.reshape is deprecated in pandas
    # (and removed in later releases), so reshape the underlying ndarray and
    # flatten the (n, 1) result back to a 1-D column with ravel().
    train[col] = scaler.fit_transform(
        train[col].values.astype(float).reshape(-1, 1)).ravel()
    test[col] = scaler.transform(
        test[col].values.astype(float).reshape(-1, 1)).ravel()

  
  import sys


In [26]:
# Preview the first five rows after preprocessing.
train.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,-1.028774,0,-1.741228,0.294864,0,0.109801,0,-0.610809,-1.083654,-0.614641,8370
1,0,0.722139,0,-1.741228,0.294864,0,0.109801,0,-1.118912,-0.118323,1.625106,15200
2,0,-0.845799,0,-1.741228,0.294864,0,0.109801,0,1.675656,-1.083654,-0.614641,1422
3,0,-0.869157,0,-1.741228,0.294864,0,0.109801,0,1.675656,1.168785,-0.614641,1057
4,1,1.077382,1,2.332202,1.214734,2,1.660861,0,0.659449,-1.083654,-0.614641,7969


### Start training algorithms

In [27]:
# Target stays a one-column DataFrame; features are every other column.
y = train[['Purchase']]
X = train.drop('Purchase', axis=1)
X_test = test

# Hold out a shuffled 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, shuffle=True, random_state=42, test_size=0.20)

Other models tried (the RMSE each reached is noted in the trailing comment):

lin_reg = LinearRegression() # rmse = 4609.92

tree_reg = DecisionTreeRegressor(random_state=0) #rmse = 3786.33

forest_reg = RandomForestRegressor(n_estimators=25, random_state=0) # rmse = 2786.273

ada_reg = AdaBoostRegressor(n_estimators=25, random_state=0) # rmse = 3855.36

gradient_reg = GradientBoostingRegressor(n_estimators=40,  learning_rate=1.0, random_state=0) # rmse=2829.88(40, 1.0)

In [28]:
# Validating the model: fit XGBoost on the training split and report the RMSE
# on the held-out validation split.

xgb_reg = XGBRegressor(learning_rate=1.0, max_depth=6, min_child_weight=40, seed=0)

xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
# mean_squared_error is documented as (y_true, y_pred). MSE is symmetric, so
# the value is unchanged; the order just matches the API.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# %-formatting prints "<model> <rmse>" on both Python 2 and Python 3; the
# former `print xgb_reg, rmse` statement is a SyntaxError on Python 3.
print('%s %s' % (xgb_reg, rmse))

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=1.0, max_delta_step=0,
       max_depth=6, min_child_weight=40, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1) 2591.851957367946


In [29]:
# Training using entire data to improve accuracy: refit the validated model on
# all labelled rows (train + validation splits), then predict the test set.
predict = xgb_reg.fit(X, y).predict(X_test)

# NOTE(review): the output path is the same file the sample-submission
# template was read from, so the template is overwritten on every run.
submission['Purchase'] = predict
submission.to_csv('Sample_Submission_Tm9Lura.csv', index=False)

Public LB Score: 2574.95