In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

from keras.models import Model
from keras.layers import *
from keras.callbacks import *
from keras.regularizers import l2
from keras.optimizers import *
from keras.utils import to_categorical
import datetime
from sklearn.preprocessing import LabelEncoder
import gc
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from keras import backend as K
from sklearn.model_selection import KFold


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['bgu-dl-assignmnt2-features-extraction', 'elo-merchant-category-recommendation']


# Define global functions

In [2]:
# define function that calaculate RMSE
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Read data

In [3]:
train_set = pd.read_csv("../input/bgu-dl-assignmnt2-features-extraction/train_set.csv")
test_set = pd.read_csv("../input/bgu-dl-assignmnt2-features-extraction/test_set.csv")
target = pd.read_csv("../input/bgu-dl-assignmnt2-features-extraction/target.csv", header=None)

print("shape of train : ",train_set.shape)
print("shape of test : ",test_set.shape)
print("shape of target : ",target.shape)

shape of train :  (201917, 143)
shape of test :  (123623, 143)
shape of target :  (201917, 1)


# Preprocessing

In [4]:
cat_col = ['feature_1','feature_2', 'feature_3', 'merchant_group_id', 'merchant_category_id', 'subsector_id', 'category_1',
          'most_recent_sales_range', 'most_recent_purchases_range', 'category_4', 'city_id', 'state_id', 'category_2']
numeric_col = train_set.columns[~train_set.columns.isin(np.append(cat_col, ['card_id', 'first_active_month']))]
used_col = np.concatenate((cat_col, numeric_col), axis=0)

In [5]:
def preprocess(trx_data):
    for cat_col_name in cat_col:
        lbl = LabelEncoder()
        lbl.fit(trx_data[cat_col_name].unique().astype('str'))
        trx_data[cat_col_name] = lbl.transform(trx_data[cat_col_name].astype('str'))
    
    for numeric_col_name in numeric_col:
        trx_data[numeric_col_name] = pd.to_numeric(trx_data[numeric_col_name])
        min_val = trx_data[numeric_col_name].min()
        max_val = trx_data[numeric_col_name].max()
        if min_val == max_val:
            trx_data[numeric_col_name] = 0
            print(numeric_col_name)
        else:
            trx_data[numeric_col_name] = (max_val - trx_data[numeric_col_name]) / (max_val - min_val)

    return trx_data

# remove nan values from data set
train_set_no_nan = train_set.fillna(-20)
test_set_no_nan = test_set.fillna(-20)

train_set = preprocess(train_set_no_nan)
test_set = preprocess(test_set_no_nan)

# split the given train set to train and test set

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train_set, target, test_size=0.2, random_state=24)

print('X_train shape', X_train.shape)
print('X_test shape', X_test.shape)

X_train shape (161533, 143)
X_test shape (40384, 143)


In [7]:
X_train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,...,purchase_amount_count_mean,purchase_amount_count_std,purchase_amount_sum_mean,purchase_amount_sum_std,purchase_amount_mean_mean,purchase_amount_mean_std,purchase_amount_min_mean,purchase_amount_min_std,purchase_amount_max_mean,purchase_amount_max_std,purchase_amount_std_mean,purchase_amount_std_std,installments_count_mean,installments_count_std,installments_sum_mean,installments_sum_std,installments_mean_mean,installments_mean_std,installments_min_mean,installments_min_std,installments_max_mean,installments_max_std,installments_std_mean,installments_std_std,category_1_purchase_amount_mean,category_1_purchase_amount_min,category_1_purchase_amount_max,category_1_purchase_amount_std,installments_purchase_amount_mean,installments_purchase_amount_min,installments_purchase_amount_max,installments_purchase_amount_std,city_id_purchase_amount_mean,city_id_purchase_amount_min,city_id_purchase_amount_max,city_id_purchase_amount_std,category_1_installments_mean,category_1_installments_min,category_1_installments_max,category_1_installments_std
165743,2017-06-01,C_ID_f9f784f2e2,1,0,0,0.142857,0.545455,0.892732,4474,209,25,0.999569,1.0,0,0,0,0.999975,0.999989,0.0,0.999973,0.999984,0.0,0.999969,0.999981,0.0,1,243,24,1,0.982353,0.733333,0.612903,0.322581,0.432432,0.428571,0.999832,0.999997,0.999997,0.999997,0.999762,...,0.891594,0.969174,0.649447,0.970931,0.997793,0.999516,0.999251,0.999906,0.993452,0.99586,0.724968,0.752204,0.891594,0.969174,0.869091,0.98223,0.945508,0.997504,0.897438,0.891245,0.974685,0.993096,0.824478,0.918312,0.699159,0.699159,0.699159,1.0,0.698988,0.69919,0.80001,0.731649,0.699213,0.699657,0.800062,0.702611,0.450964,0.341156,0.622168,1.0
119952,2017-11-01,C_ID_c36d60df2c,0,0,0,0.142857,0.090909,0.95972,15444,209,25,0.999353,0.999946,0,0,0,0.999975,0.999989,0.0,0.999973,0.999985,0.0,0.999969,0.999981,0.0,1,0,6,4,0.986275,0.766667,0.66129,0.322581,0.432432,0.5,0.999832,0.999997,0.999997,0.999997,1.0,...,0.910714,0.974388,0.639327,0.979657,0.996431,0.999058,0.999381,0.99993,0.988658,0.993723,0.723861,0.752077,0.910714,0.974388,0.907197,0.980044,0.953179,0.998627,0.884615,0.891245,0.989903,0.99784,0.830142,0.920067,0.680688,0.695386,0.665997,0.502565,0.67081,0.69581,0.737911,0.694129,0.690409,0.698681,0.778252,0.689832,0.408203,0.340156,0.563943,0.500784
89339,2017-09-01,C_ID_42699e8b0c,3,0,0,0.142857,0.272727,0.933012,10743,248,20,1.0,1.0,1,0,0,0.999975,0.999991,0.0,0.999973,0.999989,0.0,0.99997,0.99999,0.0,1,0,0,0,0.984967,0.755556,0.629032,0.322581,0.432432,0.452381,0.999832,0.999997,0.999997,0.999997,0.999761,...,0.97561,0.990287,0.618216,0.992298,0.996619,0.9959,0.999415,0.99985,0.987805,0.977645,0.722544,0.746483,0.97561,0.990287,0.951515,0.979448,0.944342,0.993159,0.846154,1.0,0.982876,0.988903,0.826805,0.914651,0.665093,0.674533,0.655654,0.5137,0.665093,0.674533,0.771386,0.72041,0.665093,0.674533,0.771386,0.690267,0.335938,0.40625,0.426523,0.345697
25589,2014-05-01,C_ID_750614dd76,2,2,1,0.571429,0.636364,0.399299,21447,257,19,0.998921,1.0,0,0,0,0.999975,0.999989,0.0,0.999973,0.999986,0.0,0.999969,0.99998,0.0,1,243,24,1,0.985621,0.755556,0.645161,0.290323,0.405405,0.47619,0.999832,0.999997,0.999997,0.999997,0.999764,...,0.993706,0.996185,0.613434,0.995642,0.99748,0.996513,0.997659,0.996654,0.997987,0.997362,0.72552,0.751617,0.993706,0.996185,0.988636,1.0,0.980121,1.0,0.923077,1.0,0.998385,1.0,0.832916,0.921105,0.699836,0.699836,0.699836,1.0,0.699836,0.699836,0.800719,1.0,0.699836,0.699836,0.800719,1.0,0.479167,0.375,0.641577,1.0
162026,2017-10-01,C_ID_18d2ec30fc,0,1,0,0.142857,0.181818,0.946147,16189,164,36,1.0,1.0,1,0,1,0.999976,0.99999,0.0,0.999973,0.999989,0.0,0.99997,0.999989,0.0,1,0,0,0,0.984314,0.744444,0.629032,0.322581,0.432432,0.452381,0.999827,0.999996,0.999995,0.999997,0.999673,...,0.924216,0.966563,0.595646,0.948842,0.976048,0.988049,0.999415,0.999909,0.908095,0.937151,0.700027,0.738455,0.924216,0.966563,0.863636,0.946954,0.933175,0.991306,0.876923,0.902719,0.972213,0.992583,0.819674,0.916848,0.69295,0.695988,0.68991,0.527256,0.671745,0.698391,0.769721,0.715174,0.688621,0.700093,0.769721,0.686026,0.435755,0.34375,0.599749,0.530636


In [8]:
X_test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,...,purchase_amount_count_mean,purchase_amount_count_std,purchase_amount_sum_mean,purchase_amount_sum_std,purchase_amount_mean_mean,purchase_amount_mean_std,purchase_amount_min_mean,purchase_amount_min_std,purchase_amount_max_mean,purchase_amount_max_std,purchase_amount_std_mean,purchase_amount_std_std,installments_count_mean,installments_count_std,installments_sum_mean,installments_sum_std,installments_mean_mean,installments_mean_std,installments_min_mean,installments_min_std,installments_max_mean,installments_max_std,installments_std_mean,installments_std_std,category_1_purchase_amount_mean,category_1_purchase_amount_min,category_1_purchase_amount_max,category_1_purchase_amount_std,installments_purchase_amount_mean,installments_purchase_amount_min,installments_purchase_amount_max,installments_purchase_amount_std,city_id_purchase_amount_mean,city_id_purchase_amount_min,city_id_purchase_amount_max,city_id_purchase_amount_std,category_1_installments_mean,category_1_installments_min,category_1_installments_max,category_1_installments_std
110727,2017-12-01,C_ID_3d38c16da9,2,2,1,0.142857,0.0,0.972855,7357,257,19,0.999838,1.0,0,1,1,0.999975,0.99999,0.0,0.999973,0.999987,0.0,0.999969,0.999981,0.0,1,132,23,3,0.986275,0.766667,0.66129,0.322581,0.432432,0.5,0.999832,0.999997,0.999997,0.999997,1.0,...,0.991289,0.996627,0.614055,0.996934,0.997433,0.999819,0.998963,0.999471,0.997611,0.99969,0.725356,0.752757,0.991289,0.996627,0.97475,0.998726,0.94399,0.994482,0.948715,0.874497,0.983845,1.0,0.818855,0.919937,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
108935,2017-02-01,C_ID_baa868fe6e,2,2,1,0.142857,0.909091,0.840193,7098,217,37,1.0,1.0,1,0,0,0.999975,0.999989,0.0,0.999973,0.999985,0.0,0.99997,0.999981,0.0,1,0,0,0,0.973203,0.688889,0.580645,0.258065,0.351351,0.404762,0.999834,0.999997,0.999997,0.999997,0.999764,...,0.963828,0.976614,0.624458,0.976784,0.99582,0.990204,0.995008,0.990115,0.996013,0.994189,0.725768,0.752749,0.963828,0.976614,0.939394,0.97915,0.937674,0.992505,0.840231,0.887113,0.981616,0.995523,0.825165,0.918694,0.698779,0.698779,0.698779,1.0,0.69814,0.699259,0.798452,0.730602,0.698779,0.698779,0.800017,1.0,0.453125,0.34375,0.623656,1.0
22893,2017-09-01,C_ID_2ce1c19fa6,2,0,1,0.142857,0.272727,0.933012,4047,234,29,0.999892,0.999946,0,2,2,0.999975,0.99999,0.0,0.999973,0.999985,0.0,0.999969,0.99998,0.0,0,78,24,1,0.98366,0.755556,0.645161,0.290323,0.405405,0.47619,0.999832,0.999997,0.999997,0.999997,0.999765,...,0.960087,0.984418,0.626218,0.985436,0.998873,0.999671,0.999532,0.999929,0.998154,0.999245,0.725912,0.752876,0.960087,0.984418,0.988636,1.0,0.980121,1.0,0.923077,1.0,0.998385,1.0,0.832916,0.921105,0.699388,0.699388,0.699388,1.0,0.699388,0.699388,0.800421,1.0,0.699393,0.699403,0.800414,0.702984,0.479167,0.375,0.641577,1.0
79140,2016-04-01,C_ID_03a8c5345e,1,1,0,0.285714,0.727273,0.706217,26610,209,25,0.999784,0.999837,0,1,1,0.999975,0.999989,0.0,0.999973,0.999985,0.0,0.999969,0.999981,0.0,0,0,11,3,0.982353,0.733333,0.629032,0.322581,0.405405,0.452381,0.999832,0.999997,0.999997,0.999997,0.999764,...,0.970144,0.986712,0.620028,0.989805,0.996196,0.998248,0.99894,0.999606,0.990591,0.993408,0.723552,0.751211,0.970144,0.986712,0.988636,1.0,0.980121,1.0,0.923077,1.0,0.998385,1.0,0.832916,0.921105,0.698436,0.698436,0.698436,1.0,0.698436,0.698436,0.799789,1.0,0.697819,0.698738,0.798769,0.701759,0.479167,0.375,0.641577,1.0
12651,2017-06-01,C_ID_a51a3996c2,2,1,1,0.142857,0.545455,0.892732,25179,209,25,1.0,1.0,0,2,2,0.999975,0.999989,0.0,0.999973,0.999986,0.0,0.99997,0.999983,0.0,0,169,24,1,0.984314,0.755556,0.645161,0.322581,0.405405,0.47619,0.999832,0.999997,0.999997,0.999997,0.999765,...,0.93358,0.980641,0.635844,0.981719,0.998741,0.999079,0.999642,0.999815,0.998013,0.998364,0.725932,0.752784,0.93358,0.980641,0.988636,1.0,0.980121,1.0,0.923077,1.0,0.998385,1.0,0.832916,0.921105,0.699174,0.699174,0.699174,1.0,0.699174,0.699174,0.800279,1.0,0.699121,0.69947,0.80001,0.702528,0.479167,0.375,0.641577,1.0


# Classic ML model to form a benchmark

In [9]:
lgb_model = lgb.LGBMRegressor()
lgb_model.fit(X_train[used_col], y_train.iloc[:,0])

lgb_pred_test = lgb_model.predict(X_test[used_col])
lgb_pred_train = lgb_model.predict(X_train[used_col])

print('test RMSE:', mean_squared_error(y_test, lgb_pred_test) ** 0.5)
print('train RMSE:', mean_squared_error(y_train, lgb_pred_train) ** 0.5)

test RMSE: 3.594209364193137
train RMSE: 3.3525398377373525


# Create embedding to categorical feature

In [10]:
embedding_col = ['feature_1', 'feature_2', 'feature_3', 'year', 'month']
other_col = [x for x in used_col if x not in embedding_col]

In [11]:
f1_unique_val = len(train_set['feature_1'].unique())
f2_unique_val = len(train_set['feature_2'].unique())
f3_unique_val = len(train_set['feature_3'].unique())
year_unique_val = len(train_set['year'].unique())
month_unique_val = len(train_set['month'].unique())

In [12]:
f1_inp = Input(shape=(1,),dtype='int64')
f2_inp = Input(shape=(1,),dtype='int64')
f3_inp = Input(shape=(1,),dtype='int64')
year_inp = Input(shape=(1,),dtype='int64')
month_inp = Input(shape=(1,),dtype='int64')

f1_emb = Embedding(f1_unique_val,2,input_length=1, embeddings_regularizer=l2(1e-6))(f1_inp)
f2_emb = Embedding(f2_unique_val,1,input_length=1, embeddings_regularizer=l2(1e-6))(f2_inp)
f3_emb = Embedding(f3_unique_val,1,input_length=1, embeddings_regularizer=l2(1e-6))(f3_inp)
year_emb = Embedding(year_unique_val,3,input_length=1, embeddings_regularizer=l2(1e-6))(year_inp)
month_emb = Embedding(month_unique_val,4,input_length=1, embeddings_regularizer=l2(1e-6))(month_inp)

# Predicting the target using only the categorical features embeddings

In [None]:
x = concatenate([f1_emb,f2_emb,f3_emb,year_emb,month_emb])
x = Flatten()(x)
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.4)(x)
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.7)(x)
x = Dense(1, activation='sigmoid')(x) #activation='linear'
emb_model = Model([f1_inp,f2_inp,f3_inp,year_inp,month_inp],x)
#emb_model.compile(loss='mse',optimizer='adam')

emb_model.compile(optimizer="RMSProp", loss=root_mean_squared_error)

print(emb_model.summary())

emb_model.fit([X_train[col] for col in embedding_col], y_train, epochs=5)

emb_pred_test = emb_model.predict([X_test[col] for col in embedding_col])
emb_pred_train = emb_model.predict([X_train[col] for col in embedding_col])

print('test RMSE', mean_squared_error(y_test, emb_pred_test) ** 0.5)
print('train RMSE', mean_squared_error(y_train, emb_pred_train) ** 0.5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

# Use the embeddings we got as a “feature extractor” for a LGBMRegressor model

In [None]:
emb_output = emb_model.layers[11].output

feature_model = Model(emb_model.input, emb_output)

feature_model.compile(optimizer = "RMSProp", loss = root_mean_squared_error)
print(feature_model.summary())

featurs = feature_model.predict([X_train[col] for col in embedding_col])
features_test = feature_model.predict([X_test[col] for col in embedding_col])

lgb_model = lgb.LGBMRegressor()
lgb_model.fit(featurs, y_train.values)

lgb_pred_test = lgb_model.predict(features_test)
lgb_pred_train = lgb_model.predict(featurs)

print('test RMSE', mean_squared_error(y_test, lgb_pred_test) ** 0.5)
print('train RMSE', mean_squared_error(y_train, lgb_pred_train) ** 0.5)

# Add the rest of the features 

In [None]:
# define continuous input
continuous_input = Input(shape=(len(other_col),))

# define categorical input                         
f1_emb = Reshape((2,))(f1_emb)
f2_emb = Reshape((1,))(f2_emb)
f3_emb = Reshape((1,))(f3_emb)
year_emb = Reshape((3,))(year_emb)
month_emb = Reshape((4,))(month_emb)
                         
#split train set to train and validation set
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=6)
                         
# define function to create input to model
def get_input(data):
    inp = [data[other_col], data['feature_1'], data['feature_2'], data['feature_3'], data['year'], data['month']]
    return inp

In [None]:
x = concatenate([continuous_input,f1_emb,f2_emb,f3_emb,year_emb,month_emb])
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.4)(x)
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.7)(x)
x = Dense(1, activation='linear')(x)
emb_cont_model = Model([continuous_input,f1_inp,f2_inp,f3_inp,year_inp,month_inp],x)

rmsprop_opt = RMSprop(lr=0.005)
emb_cont_model.compile(optimizer = rmsprop_opt, loss = root_mean_squared_error)

print(emb_cont_model.summary())

'''def set_callbacks(description='run1',patience=15,tb_base_logdir='./logs/'):
    cp = ModelCheckpoint('best_model_weights_{}.h5'.format(description),save_best_only=True)
    es = EarlyStopping(patience=patience,monitor='val_loss')
    rlop = ReduceLROnPlateau(patience=5)   
    tb = TensorBoard(log_dir='{}{}'.format(tb_base_logdir,description))
    cb = [cp,es,tb,rlop]
    return cb'''

history = emb_cont_model.fit(get_input(X_train_val), y_train_val, epochs=5, batch_size=16, 
          validation_data=(get_input(X_val), y_val)) #callbacks=set_callbacks()

emb_cont_pred_test = emb_cont_model.predict(get_input(X_test))
emb_cont_pred_train = emb_cont_model.predict(get_input(X_train_val))

print('test RMSE', mean_squared_error(y_test, emb_cont_pred_test) ** 0.5)
print('train RMSE', mean_squared_error(y_train_val, emb_cont_pred_train) ** 0.5)