In [1]:
import numpy as np
import pandas as pd 
from collections import defaultdict
from pprint import pprint
import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
 
matplotlib.rcParams['figure.figsize'] = (8, 6)

import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Merge
from keras.layers import Convolution1D
from keras.layers import Reshape
from keras.layers import Flatten
seed = 0
np.random.seed(seed)

import time
import operator

Using TensorFlow backend.


In [2]:
demographic_cols = ['customerID', 'Gender', 'State']

notuse = ['transactionDate']

product_col = ['100105505.0', '108000537.0', '108000568.0', '108000707.0',
       '108001125.0', '108001127.0', '108003448.0', '108003451.0',
       '108004423.0', '108004880.0', '108004977.0', '108005676.0',
       '108005681.0', '108008866.0', '108010225.0', '108015124.0',
       '108015133.0', '108017242.0', '108017245.0', '108018729.0',
       '108018730.0', '108018925.0', '108020429.0', '108030063.0',
       '108030113.0', '108032048.0', '108037499.0', '108037504.0',
       '108037568.0', '108043278.0', '108100137.0', '108100143.0',
       '108100145.0', '108100183.0', '108100193.0', '108100243.0',
       '108100253.0', '108100267.0', '108100278.0', '108100288.0',
       '108100290.0', '108100294.0', '108100296.0', '108100297.0',
       '108100298.0', '108100302.0', '108100306.0', '108100308.0',
       '108100319.0', '108100328.0', '108100330.0', '108100331.0',
       '108100334.0', '108100335.0', '108100340.0', '108100352.0',
       '108100361.0', '108100362.0', '108100382.0', '300028156.0',
       '300070102.0', '300088564.0', '300111517.0', '300143077.0',
       '300144423.0', '300150379.0', '300157247.0', '300177372.0',
       '300317078.0', '300326493.0', '300347917.0', '300361324.0',
       '300374464.0', '300481673.0', '300481740.0', '300481741.0',
       '300497842.0', '300528552.0', '300616595.0', '300647581.0',
       '300676075.0', '300680079.0', '300685217.0', '300776409.0',
       '300776410.0', '300776411.0', '300781593.0', '300785147.0',
       '300785148.0', '300785150.0', '300808923.0', '300840018.0',
       '300942704.0', '300974316.0', '300974360.0', '500096181.0',
       '1000010411.0', '1000058092.0', '1000336252.0', '1000609658.0']

In [3]:
df_train = pd.read_csv('cleaned_data/DataMulticlass_6_withpast2.csv')

In [4]:
df_test = pd.read_csv('cleaned_data/TestSet_withpast3.csv')

In [5]:
state_dict = {'MADHY PRADESH':'MADHYA PRADESH', 'TAMILNADU':'TAMIL NADU', 'MADHYA  PRADESH':'MADHYA PRADESH', 'HARAYANA':'HARYANA',
             'Jharkhand':'JHARKHAND','Tamilnadu':'TAMIL NADU','Tamil Nadu':'TAMIL NADU','Madhya Pradesh':'MADHYA PRADESH',
             'REST OF WEST BENGAL':'WEST BENGAL', 'west bengal':'WEST BENGAL','Uttar Pradesh':'UTTAR PRADESH', 'Delhi':'DELHI',
             'Bhopal':'BHOPAL','CHHATISGARH':'CHHATTISGARH','CHATTISGARH':'CHHATTISGARH', 'jharkhand':'JHARKHAND','Chandigarh':'CHANDIGARH',
             'UTTAR PRADESH WEST': 'UTTAR PRADESH','ODISHA':'ORISSA','MAHARASTRA':'MAHARASHTRA','madhya pradesh':'MADHYA PRADESH',
             'KARNATAK':'KARNATAKA','JAMMU and KASHMIR':'JAMMU AND KASHMIR','JAMMU KASHMIR':'JAMMU AND KASHMIR','Rajasthan':'RAJASTHAN',
             'east singhbhum':'JHARKHAND', 'ORRISA':'ORISSA','Andhra Pradesh':'ANDHRA PRADESH', 'UTTARANCHAL':'UTTARAKHAND',
             'Uttar pradesh':'UTTAR PRADESH','Maharashtra':'MAHARASHTRA','MP':'MADHYA PRADESH', 'UTTAR PRADESH EAST':'UTTAR PRADESH',
             'Punjab':'PUNJAB','maharashtra':'MAHARASHTRA','Karnataka':'KARNATAKA','M.P.':'MADHYA PRADESH','DAMAN':'DAMAN AND DIU',
             'HUBLI':'KARNATAKA','Tamil nadu':'TAMIL NADU','GUJRAT':'GUJARAT', 'Mp':'MADHYA PRADESH','Madhya pradesh':'MADHYA PRADESH',
             'West Bengal':'WEST BENGAL','Gujarat':'GUJARAT','UP':'UTTAR PRADESH','Chennai':'CHENNAI', 'm.p.':'MADHYA PRADESH',
             'kerala':'KERALA'}

df_train.replace({"State": state_dict}, inplace=True)
df_test.replace({"State": state_dict}, inplace=True)

In [6]:
df_test = df_test[df_test['State'].isin(df_train['State'])]

In [7]:
# These column are categories feature, I'll transform them using get_dummy
dummy_col = ['Gender','State']
dummy_col_select = ['Gender','State']

In [8]:
limit = int(0.01 * len(df_train.index))
use_dummy_col = {}

for col in dummy_col_select:
    trainlist = df_train[col].value_counts()
    use_dummy_col[col] = []
    for i,item in enumerate(trainlist):
        if item > limit:
            use_dummy_col[col].append(df_train[col].value_counts().index[i])
            
def get_dummy(df):
    for col in dummy_col_select:
        for item in df[col].unique(): 
            if item not in use_dummy_col[col]:
                row_index = df[col] == item
                df.loc[row_index,col] = np.nan
    return pd.get_dummies(df, prefix=dummy_col, columns = dummy_col)
    
df_train = get_dummy(df_train)
df_test = get_dummy(df_test)

In [9]:
product_col_5 = [col for col in df_train.columns if '.0_5' in col]
product_col_4 = [col for col in df_train.columns if '.0_4' in col]
product_col_3 = [col for col in df_train.columns if '.0_3' in col]
product_col_2 = [col for col in df_train.columns if '.0_2' in col]
product_col_1 = [col for col in df_train.columns if '.0_1' in col]


df_train['tot5'] = df_train[product_col_5].sum(axis=1)
df_test['tot5'] = df_test[product_col_5].sum(axis=1)

for col in product_col[2:]:
    df_train[col+'_past'] = (df_train[col+'_5']+df_train[col+'_4']+df_train[col+'_3']+df_train[col+'_2']+df_train[col+'_1'])/5
    df_test[col+'_past'] = (df_test[col+'_5']+df_test[col+'_4']+df_test[col+'_3']+df_test[col+'_2']+df_test[col+'_1'])/5
    
for pro in product_col[2:]:
    df_train[pro+'_past'] = df_train[pro+'_past']*(1-df_train[pro+'_5'])
    df_test[pro+'_past'] = df_test[pro+'_past']*(1-df_test[pro+'_5'])

In [10]:
col1 = product_col_1 + product_col_2 + product_col_3 + product_col_4 + product_col_5
col2 = list(df_train.drop(['target','customerID']+col1, 1).columns.values)

x_train1 = df_train[col1].as_matrix()
x_test1 = df_test[col1].as_matrix()

x_train1 = np.reshape(x_train1, (len(x_train1), 5, 100))
x_test1 = np.reshape(x_test1, (len(x_test1), 5, 100))

x_train2 = df_train[col2].as_matrix()
x_test2 = df_test[col2].as_matrix()

y_train = pd.get_dummies(df_train['target'].astype(int)).as_matrix()

In [12]:
id_preds1 = defaultdict(list)
ids = df_test['customerID'].values
  
### product-wise
model1 = Sequential()
model1.add(Convolution1D(60, 100, border_mode='same', input_shape=(5, 100),activation = 'relu'))
model1.add(Flatten())

# time-wise
model2 = Sequential()
model2.add(Convolution1D(30, 5, border_mode='same', input_shape=(100, 5),activation = 'relu'))
model2.add(Flatten())

# domegraphic-wise
model3 = Sequential()
model3.add(Dense(150, input_dim=len(col2), init='uniform', activation='relu'))

merged = Merge([model1,model2,model3], mode='concat')

final_model = Sequential()
final_model.add(merged)
final_model.add(Dense(40, init='uniform', activation='softmax'))
final_model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['categorical_accuracy'])
print(final_model.summary())

history = final_model.fit([x_train1, x_train1.transpose((0, 2, 1)), x_train2], y_train, nb_epoch=100, batch_size=100, verbose = 2)
        
p_test = final_model.predict([x_test1, x_test1.transpose((0, 2, 1)), x_test2])
        
for id, p in zip(ids, p_test):
    #id_preds[id] = list(p)
    id_preds1[id] = [0,0] + list(p)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
convolution1d_1 (Convolution1D)  (None, 5, 60)         600060                                       
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 300)           0                                            
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 100, 30)       780                                          
____________________________________________________________________________________________________
flatten_2 (Flatten)              (None, 3000)          0                                            
___________________________________________________________________________________________

In [19]:
def runXGB(train_X, train_y, reg =100, colsample_bytree=0.9, max_depth= 6, eta=0.1, min_child_weight=2, subsample=0.9, num_rounds=150):
    param = {}
    param['objective'] = 'multi:softprob'
    param['seed'] = 0
    param['silent'] = 0
    param['eval_metric'] = "mlogloss"
    param['booster'] = 'gbtree'
    param['num_class'] = 100
    param['reg_lambda'] = reg
    param['colsample_bytree'] = colsample_bytree
    param['max_depth'] = max_depth 
    param['eta'] = eta
    param['min_child_weight'] = min_child_weight
    param['subsample'] = subsample
    num_rounds = num_rounds

    progress = dict()
    plst = list(param.items())
    
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    watchlist  = [(xgtrain,'train')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, evals_result=progress)
    
    #xgb.cv(param, xgtrain, num_rounds, nfold=3,
    #   metrics={'mlogloss'}, seed = 0,
    #   callbacks=[xgb.callback.print_evaluation(show_stdv=True)])
    
    return (model, progress)

In [20]:
cols = list(df_train.drop(['target','customerID'], 1).columns.values)
#cols= selected_col

id_preds2 = defaultdict(list)
ids = df_test['customerID'].values

# predict model 
y_train = df_train['target']
x_train = df_train[cols]
    
(clf, progress) = runXGB(x_train, y_train, reg =50, eta=0.05,min_child_weight=10, num_rounds=194)
          
x_test = df_test[cols]
x_test = x_test.fillna(0) 
        
d_test = xgb.DMatrix(x_test)
p_test = clf.predict(d_test)
        
for id, p in zip(ids, p_test):
    #id_preds[id] = list(p)
    id_preds2[id] = [0,0] + list(p)

[0]	train-mlogloss:4.12696
[1]	train-mlogloss:3.85133
[2]	train-mlogloss:3.64575
[3]	train-mlogloss:3.47763
[4]	train-mlogloss:3.33147
[5]	train-mlogloss:3.20673
[6]	train-mlogloss:3.0961
[7]	train-mlogloss:2.99172
[8]	train-mlogloss:2.90158
[9]	train-mlogloss:2.81699
[10]	train-mlogloss:2.74004
[11]	train-mlogloss:2.6717
[12]	train-mlogloss:2.60716
[13]	train-mlogloss:2.54324
[14]	train-mlogloss:2.48767
[15]	train-mlogloss:2.43213
[16]	train-mlogloss:2.37978
[17]	train-mlogloss:2.33
[18]	train-mlogloss:2.28591
[19]	train-mlogloss:2.24297
[20]	train-mlogloss:2.20456
[21]	train-mlogloss:2.16554
[22]	train-mlogloss:2.12888
[23]	train-mlogloss:2.09405
[24]	train-mlogloss:2.06138
[25]	train-mlogloss:2.03228
[26]	train-mlogloss:2.00317
[27]	train-mlogloss:1.97605
[28]	train-mlogloss:1.95043
[29]	train-mlogloss:1.92473
[30]	train-mlogloss:1.89974
[31]	train-mlogloss:1.87708
[32]	train-mlogloss:1.85505
[33]	train-mlogloss:1.83341
[34]	train-mlogloss:1.81317
[35]	train-mlogloss:1.79342
[36]	tr

In [21]:
product_list = df_test[product_col_5].sum(axis=0)/(df_test[product_col_5].sum(axis=0).sum())

id_preds5 = {}
for row in df_test.values:
    id = row[0]
    id_preds5[id] = [0,0]+ list(product_list)

In [22]:
fractionKeras = 0.3
fractionXGB = 0.7
fractionRanking = 0.1
id_preds_combined = {}

for uid, p in id_preds1.items():
    id_preds_combined[uid] = fractionKeras*np.asarray(id_preds1[uid]) + fractionXGB*np.asarray(id_preds2[uid]) + fractionRanking*np.asarray(id_preds5[uid])
    
id_preds = id_preds_combined

ValueError: operands could not be broadcast together with shapes (42,) (102,) 

In [23]:
sample = pd.read_csv('sampleSubmission.csv')