In [1]:

import numpy as np
import pandas as pd 
from collections import defaultdict
from pprint import pprint
import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
 
matplotlib.rcParams['figure.figsize'] = (8, 6)

import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
seed = 0
np.random.seed(seed)

Using TensorFlow backend.


In [2]:
demographic_cols = ['customerID', 'Gender', 'State']

notuse = ['transactionDate']

product_col = ['100105505.0', '108000537.0', '108000568.0', '108000707.0',
       '108001125.0', '108001127.0', '108003448.0', '108003451.0',
       '108004423.0', '108004880.0', '108004977.0', '108005676.0',
       '108005681.0', '108008866.0', '108010225.0', '108015124.0',
       '108015133.0', '108017242.0', '108017245.0', '108018729.0',
       '108018730.0', '108018925.0', '108020429.0', '108030063.0',
       '108030113.0', '108032048.0', '108037499.0', '108037504.0',
       '108037568.0', '108043278.0', '108100137.0', '108100143.0',
       '108100145.0', '108100183.0', '108100193.0', '108100243.0',
       '108100253.0', '108100267.0', '108100278.0', '108100288.0',
       '108100290.0', '108100294.0', '108100296.0', '108100297.0',
       '108100298.0', '108100302.0', '108100306.0', '108100308.0',
       '108100319.0', '108100328.0', '108100330.0', '108100331.0',
       '108100334.0', '108100335.0', '108100340.0', '108100352.0',
       '108100361.0', '108100362.0', '108100382.0', '300028156.0',
       '300070102.0', '300088564.0', '300111517.0', '300143077.0',
       '300144423.0', '300150379.0', '300157247.0', '300177372.0',
       '300317078.0', '300326493.0', '300347917.0', '300361324.0',
       '300374464.0', '300481673.0', '300481740.0', '300481741.0',
       '300497842.0', '300528552.0', '300616595.0', '300647581.0',
       '300676075.0', '300680079.0', '300685217.0', '300776409.0',
       '300776410.0', '300776411.0', '300781593.0', '300785147.0',
       '300785148.0', '300785150.0', '300808923.0', '300840018.0',
       '300942704.0', '300974316.0', '300974360.0', '500096181.0',
       '1000010411.0', '1000058092.0', '1000336252.0', '1000609658.0']

In [3]:
df_train = pd.read_csv('cleaned_data/DataMulticlass_6_withpast2.csv')

In [4]:
df_test = pd.read_csv('cleaned_data/TestSet_withpast3.csv')

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
state_dict = {'MADHY PRADESH':'MADHYA PRADESH', 'TAMILNADU':'TAMIL NADU', 'MADHYA  PRADESH':'MADHYA PRADESH', 'HARAYANA':'HARYANA',
             'Jharkhand':'JHARKHAND','Tamilnadu':'TAMIL NADU','Tamil Nadu':'TAMIL NADU','Madhya Pradesh':'MADHYA PRADESH',
             'REST OF WEST BENGAL':'WEST BENGAL', 'west bengal':'WEST BENGAL','Uttar Pradesh':'UTTAR PRADESH', 'Delhi':'DELHI',
             'Bhopal':'BHOPAL','CHHATISGARH':'CHHATTISGARH','CHATTISGARH':'CHHATTISGARH', 'jharkhand':'JHARKHAND','Chandigarh':'CHANDIGARH',
             'UTTAR PRADESH WEST': 'UTTAR PRADESH','ODISHA':'ORISSA','MAHARASTRA':'MAHARASHTRA','madhya pradesh':'MADHYA PRADESH',
             'KARNATAK':'KARNATAKA','JAMMU and KASHMIR':'JAMMU AND KASHMIR','JAMMU KASHMIR':'JAMMU AND KASHMIR','Rajasthan':'RAJASTHAN',
             'east singhbhum':'JHARKHAND', 'ORRISA':'ORISSA','Andhra Pradesh':'ANDHRA PRADESH', 'UTTARANCHAL':'UTTARAKHAND',
             'Uttar pradesh':'UTTAR PRADESH','Maharashtra':'MAHARASHTRA','MP':'MADHYA PRADESH', 'UTTAR PRADESH EAST':'UTTAR PRADESH',
             'Punjab':'PUNJAB','maharashtra':'MAHARASHTRA','Karnataka':'KARNATAKA','M.P.':'MADHYA PRADESH','DAMAN':'DAMAN AND DIU',
             'HUBLI':'KARNATAKA','Tamil nadu':'TAMIL NADU','GUJRAT':'GUJARAT', 'Mp':'MADHYA PRADESH','Madhya pradesh':'MADHYA PRADESH',
             'West Bengal':'WEST BENGAL','Gujarat':'GUJARAT','UP':'UTTAR PRADESH','Chennai':'CHENNAI', 'm.p.':'MADHYA PRADESH',
             'kerala':'KERALA'}

In [7]:
df_train.replace({"State": state_dict}, inplace=True)
df_test.replace({"State": state_dict}, inplace=True)

In [8]:
df_test = df_test[df_test['State'].isin(df_train['State'])]

In [9]:
# These column are categories feature, I'll transform them using get_dummy
dummy_col = ['Gender','State']
dummy_col_select = ['Gender','State']

In [10]:
limit = int(0.01 * len(df_train.index))
use_dummy_col = {}

for col in dummy_col_select:
    trainlist = df_train[col].value_counts()
    use_dummy_col[col] = []
    for i,item in enumerate(trainlist):
        if item > limit:
            use_dummy_col[col].append(df_train[col].value_counts().index[i])

In [11]:
def get_dummy(df):
    for col in dummy_col_select:
        for item in df[col].unique(): 
            if item not in use_dummy_col[col]:
                row_index = df[col] == item
                df.loc[row_index,col] = np.nan
    return pd.get_dummies(df, prefix=dummy_col, columns = dummy_col)
    
df_train = get_dummy(df_train)
df_test = get_dummy(df_test)

In [12]:
product_col_5 = [col for col in df_train.columns if '.0_5' in col]
product_col_4 = [col for col in df_train.columns if '.0_4' in col]
product_col_3 = [col for col in df_train.columns if '.0_3' in col]
product_col_2 = [col for col in df_train.columns if '.0_2' in col]
product_col_1 = [col for col in df_train.columns if '.0_1' in col]


In [13]:
df_train['tot5'] = df_train[product_col_5].sum(axis=1)
df_test['tot5'] = df_test[product_col_5].sum(axis=1)

In [14]:
for col in product_col[2:]:
    df_train[col+'_past'] = (df_train[col+'_5']+df_train[col+'_4']+df_train[col+'_3']+df_train[col+'_2']+df_train[col+'_1'])/5
    df_test[col+'_past'] = (df_test[col+'_5']+df_test[col+'_4']+df_test[col+'_3']+df_test[col+'_2']+df_test[col+'_1'])/5

In [15]:
for pro in product_col[2:]:
    df_train[pro+'_past'] = df_train[pro+'_past']*(1-df_train[pro+'_5'])
    df_test[pro+'_past'] = df_test[pro+'_past']*(1-df_test[pro+'_5'])

In [16]:
for col in product_col[2:]:
    for month in range(2,6):
        df_train[col+'_'+str(month)+'_diff'] = df_train[col+'_'+str(month)] - df_train[col+'_'+str(month-1)]
        df_test[col+'_'+str(month)+'_diff'] = df_test[col+'_'+str(month)] - df_test[col+'_'+str(month-1)]
        df_train[col+'_'+str(month)+'_add'] = df_train[col+'_'+str(month)+'_diff'].apply(lambda x: max(x,0))
        df_test[col+'_'+str(month)+'_add'] = df_test[col+'_'+str(month)+'_diff'].apply(lambda x: max(x,0))

In [17]:
product_col_5_diff = [col for col in df_train.columns if '5_diff' in col]
product_col_4_diff = [col for col in df_train.columns if '4_diff' in col]
product_col_3_diff = [col for col in df_train.columns if '3_diff' in col]
product_col_2_diff = [col for col in df_train.columns if '2_diff' in col]

product_col_5_add = [col for col in df_train.columns if '5_add' in col]
product_col_4_add = [col for col in df_train.columns if '4_add' in col]
product_col_3_add = [col for col in df_train.columns if '3_add' in col]
product_col_2_add = [col for col in df_train.columns if '2_add' in col]

product_col_all_diff = [col for col in df_train.columns if '_diff' in col]
product_col_all_add = [col for col in df_train.columns if '_add' in col]

In [18]:
df_train['tot5_add'] = df_train[product_col_5_add].sum(axis=1)
df_test['tot5_add'] = df_test[product_col_5_add].sum(axis=1)
df_train['tot4_add'] = df_train[product_col_4_add].sum(axis=1)
df_test['tot4_add'] = df_test[product_col_4_add].sum(axis=1)
df_train['tot3_add'] = df_train[product_col_3_add].sum(axis=1)
df_test['tot3_add'] = df_test[product_col_3_add].sum(axis=1)
df_train['tot2_add'] = df_train[product_col_2_add].sum(axis=1)
df_test['tot2_add'] = df_test[product_col_2_add].sum(axis=1)

In [19]:
cols = list(df_train.drop(['target','customerID']+product_col_all_diff+product_col_all_add, 1).columns.values)

id_preds = defaultdict(list)
ids = df_test['customerID'].values

# predict model 
y_train = pd.get_dummies(df_train['target'].astype(int))
x_train = df_train[cols]
    
# create model
model = Sequential()
model.add(Dense(150, input_dim=len(cols), init='uniform', activation='relu'))
model.add(Dense(40, init='uniform', activation='softmax'))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['categorical_accuracy'])

#model.fit(x_train.as_matrix(), y_train.as_matrix(), validation_split=0.2, nb_epoch=150, batch_size=10)
model.fit(x_train.as_matrix(), y_train.as_matrix(), nb_epoch=10, batch_size=10)

x_test = df_test[cols]
x_test = x_test.fillna(0) 
        
p_test = model.predict(x_test.as_matrix())
        
for id, p in zip(ids, p_test):
    #id_preds[id] = list(p)
    id_preds[id] = [0,0] + list(p)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
product_list = df_train[product_col_5].sum(axis=0)/(df_train[product_col_5].sum(axis=0).sum())

id_preds2 = {}
for row in df_test.values:
    id = row[0]
    id_preds2[id] = [0,0]+ list(product_list)

In [21]:
fraction = 0.9
id_preds_combined = {}

for uid, p in id_preds.items():
    id_preds_combined[uid] = fraction*np.asarray(id_preds[uid]) + (1-fraction)*np.asarray(id_preds2[uid])
    
id_preds = id_preds_combined

ValueError: operands could not be broadcast together with shapes (42,) (102,) 

In [22]:
# check if customer already have each product or not. 
already_active = {}
for row in df_train.values:
    row = list(row)
    id = row.pop(0)
    active = [c[0] for c in zip(df_train.columns[1:], row) if c[1] > 0]
    already_active[id] = active

# add 7 products(that user don't have yet), higher probability first -> train_pred   
train_preds = {}
for id, p in id_preds.items():
    
    # Here be dragons
    preds = [i[0] for i in sorted([i for i in zip(df_train.columns[1:], p) if i[0] not in already_active[id]],
                                  key=lambda i:i [1], 
                                  reverse=True)[:7]]
    train_preds[id] = preds
    
test_preds = []
for row in sample.values:
    id = row[0]
    p = train_preds[id]
    test_preds.append(' '.join(p))

KeyError: 'BBID_20453837'