In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

from keras.models import Model
from keras.layers import *
from keras.callbacks import *
from keras.regularizers import l2
from keras.optimizers import *
from keras.utils import to_categorical
import datetime
from sklearn.preprocessing import LabelEncoder
import gc
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from keras import backend as K
from sklearn.model_selection import KFold


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['train.csv', 'merchants.csv', 'sample_submission.csv', 'test.csv', 'historical_transactions.csv', 'Data_Dictionary.xlsx', 'new_merchant_transactions.csv']


# Define global functions

In [2]:
# define function to reduce memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. Integer columns are tried against int8, int16, int32, int64;
    float columns against float16, float32 and finally float64.

    Range checks are inclusive, so a column whose min/max sits exactly on a
    dtype limit (e.g. 127 for int8) still gets the narrower dtype.

    Note: the float branch only checks the value *range*; casting to float16 or
    float32 can still round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this same object.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # If min/max are NaN (empty column) no candidate matches and the
            # column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback, which also covers all-NaN / infinite ranges
            # because every comparison against NaN or inf below fails.
            target_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target_type = candidate
                    break
            df[col] = df[col].astype(target_type)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# define a function that calculates the RMSE
def root_mean_squared_error(y_true, y_pred):
    """Keras-backend RMSE: square root of the mean squared difference.

    Both arguments are combined through the backend ops ``K.square``,
    ``K.mean`` and ``K.sqrt``, so the function can be passed as ``loss=``
    when compiling a model.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_error)
    return K.sqrt(mean_squared)

# Read data

In [3]:
# Card-level tables: one row per card_id, with first_active_month parsed as a date.
train_set, test_set = (
    pd.read_csv(csv_path, parse_dates=["first_active_month"])
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

# Transaction logs: one row per purchase, with purchase_date parsed as a datetime.
# merchants.csv is not loaded in this notebook.
history_trx, new_trx = (
    pd.read_csv(csv_path, parse_dates=['purchase_date'])
    for csv_path in ("../input/historical_transactions.csv",
                     "../input/new_merchant_transactions.csv")
)

# Report the size of every loaded table.
for label, frame in (
    ("shape of train : ", train_set),
    ("shape of test : ", test_set),
    ("shape of history_trx : ", history_trx),
    ("shape of new_trx : ", new_trx),
):
    print(label, frame.shape)

shape of train :  (201917, 6)
shape of test :  (123623, 5)
shape of history_trx :  (29112361, 14)
shape of new_trx :  (1963031, 14)


# Feature extraction - collect information per card to form a card_id profile

In [4]:
# Add 'year', 'month' and 'elapsed_time' features to both card-level frames.
for df in [train_set, test_set]:
    df['year'] = df['first_active_month'].dt.year
    df['month'] = df['first_active_month'].dt.month
    # Days between first_active_month and 2018-02-01 (noted by the author as the
    # max date in the train set). The subtraction is done on datetime64 values so
    # the result is a proper timedelta column; going through Python `date`
    # objects produces an object-dtype column on which `.dt.days` relies on
    # dtype inference.
    # NOTE(review): assumes first_active_month carries no time-of-day part, so
    # the day counts match the previous date-based computation - confirm.
    df['elapsed_time'] = (pd.Timestamp('2018-02-01') - df['first_active_month']).dt.days

# Names of the numeric (continuous) feature columns; agg_data_trx appends to this list.
numeric_col = ['elapsed_time']

# Split the train set into features and target.
target = train_set['target']
del train_set['target']

In [5]:
# define an aggregation function that collects information from the transaction features to create a profile for each card_id
def agg_data_trx(trx_data, col_name, reference_date=None):
    """Aggregate a transaction table into one feature row per ``card_id``.

    Parameters
    ----------
    trx_data : pandas.DataFrame
        Transaction rows. The code reads the columns card_id, authorized_flag,
        purchase_date (datetime), month_lag, merchant_id, merchant_category_id,
        state_id, city_id, subsector_id, purchase_amount and installments.
    col_name : str
        Prefix put in front of every generated feature name.
    reference_date : datetime-like, optional
        Date from which ``month_diff`` is measured. Defaults to the current
        date/time (the previous behaviour), which makes the ``month_diff``
        feature depend on when the notebook is run; pass a fixed date for
        reproducible features.

    Returns
    -------
    pandas.DataFrame
        One row per card_id with ``<col_name>_trx_count`` followed by the
        ``<col_name>_<column>_<statistic>`` aggregates.

    Side effects
    ------------
    * ``trx_data`` is modified: authorized_flag becomes 1/0, purchase_month
      and month_diff are added, numeric columns are downcast by
      ``reduce_mem_usage`` and purchase_date is replaced by float seconds
      since the epoch.
    * The generated feature names are appended to the module-level
      ``numeric_col`` list.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()

    # The 1/0 keys keep a frame whose flag was already encoded from turning into NaN.
    trx_data['authorized_flag'] = trx_data['authorized_flag'].map({'Y': 1, 'N': 0, 1: 1, 0: 0})

    trx_data['purchase_month'] = trx_data['purchase_date'].dt.month

    # Whole 30-day periods between the reference date and the purchase, shifted by month_lag.
    trx_data['month_diff'] = ((reference_date - trx_data['purchase_date']).dt.days) // 30
    trx_data['month_diff'] += trx_data['month_lag']

    # NOTE: this downcast happens before aggregation, so sums/means below are
    # computed on the reduced-precision columns.
    trx_data = reduce_mem_usage(trx_data)

    # Nanoseconds since the epoch -> float seconds. The column is replaced
    # outright (not written through `.loc[:, ...]`) so that its dtype becomes
    # float regardless of how the installed pandas handles in-place `.loc`
    # assignment. `.values` makes the assignment positional.
    trx_data['purchase_date'] = (pd.DatetimeIndex(trx_data['purchase_date']).astype(np.int64) * 1e-9).values

    agg_func = {
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_date': [np.ptp, 'min', 'max'],
        'month_lag': ['mean', 'max', 'min', 'std'],
        'month_diff': ['mean']
    }

    agg_data = trx_data.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, statistic) MultiIndex into '<prefix>_<column>_<statistic>'.
    agg_data.columns = [col_name + '_' + '_'.join(col).strip() for col in agg_data.columns.values]
    agg_data.reset_index(inplace=True)

    # Number of transactions per card; merged first so it leads the feature columns.
    df = (trx_data.groupby('card_id').size().reset_index(name=col_name + '_trx_count'))

    agg_data = pd.merge(df, agg_data, on='card_id', how='left')

    # Register every generated feature as numeric for the later scaling step.
    agg_numeric_col = [col for col in agg_data.columns if col not in ['card_id']]
    numeric_col.extend(agg_numeric_col)

    return agg_data

# Split the historical log by authorisation flag. `history_trx` is rebound here:
# from this point on it holds only the non-authorised ('N') rows, so the
# 'history_*' features describe non-authorised transactions and the 'auto_*'
# features describe authorised ones.
authorized_trx = history_trx[history_trx['authorized_flag'] == 'Y']
history_trx = history_trx[history_trx['authorized_flag'] == 'N']

# One aggregated profile per card_id for each transaction source.
history_trx_per_card = agg_data_trx(history_trx, 'history')
authorized_trx_per_card = agg_data_trx(authorized_trx, 'auto')
new_trx_per_card = agg_data_trx(new_trx, 'new')

# Left-join every profile onto the train and test frames; cards without
# transactions in a given source end up with NaN in that source's features.
for card_profile in (history_trx_per_card, authorized_trx_per_card, new_trx_per_card):
    train_set = train_set.merge(card_profile, on='card_id', how='left')
    test_set = test_set.merge(card_profile, on='card_id', how='left')

# Drop the per-card frames (and the loop reference to the last one) to free memory.
del card_profile, history_trx_per_card, new_trx_per_card, authorized_trx_per_card
gc.collect()

# Columns fed to the models: everything except the identifier and the raw date.
used_col = [col for col in train_set.columns if col not in ('card_id', 'first_active_month')]

Mem. usage decreased to 158.42 Mb (51.5% reduction)
Mem. usage decreased to 1623.26 Mb (52.9% reduction)
Mem. usage decreased to 104.84 Mb (56.2% reduction)


In [6]:
# Preview the merged frame: base card features plus the history_/auto_/new_ aggregates.
train_set.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,history_installments_sum,history_installments_mean,history_installments_max,history_installments_min,history_installments_std,history_purchase_date_ptp,history_purchase_date_min,history_purchase_date_max,history_month_lag_mean,history_month_lag_max,history_month_lag_min,history_month_lag_std,history_month_diff_mean,auto_trx_count,auto_merchant_id_nunique,auto_merchant_category_id_nunique,auto_state_id_nunique,auto_city_id_nunique,auto_subsector_id_nunique,auto_purchase_amount_sum,auto_purchase_amount_mean,auto_purchase_amount_max,auto_purchase_amount_min,auto_purchase_amount_std,auto_installments_sum,auto_installments_mean,auto_installments_max,auto_installments_min,auto_installments_std,auto_purchase_date_ptp,auto_purchase_date_min,auto_purchase_date_max,auto_month_lag_mean,auto_month_lag_max,auto_month_lag_min,auto_month_lag_std,auto_month_diff_mean,new_trx_count,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean
0,2017-06-01,C_ID_92a2005557,5,2,1,2017,6,245,13.0,12.0,10.0,1.0,2.0,7.0,-8.571723,-0.659363,-0.431922,-0.737892,0.098851,4.0,0.307692,1.0,0.0,0.480384,14254523.0,1500131000.0,1514385000.0,-4.461538,-2.0,-7.0,1.664101,9.846154,247,93,41,3,7,21,-157.375,-0.637207,2.257812,-0.739258,0.216553,0,0.0,0,0,0.0,20977987.0,1498573000.0,1519551000.0,-3.882591,0,-8,2.429155,9.825911,23.0,23.0,14.0,1.0,3.0,10.0,-13.242188,-0.575684,-0.296143,-0.724609,0.135742,0.0,0.0,0.0,0.0,0.0,4742309.0,1520259000.0,1525001000.0,1.478261,2.0,1.0,0.510754,9.695652
1,2017-01-01,C_ID_3d0044924f,4,1,0,2017,1,396,11.0,9.0,9.0,2.0,2.0,9.0,-1.122886,-0.102081,1.942838,-0.740897,0.785906,42.0,3.818182,10.0,1.0,3.487641,25890841.0,1488576000.0,1514467000.0,-4.454545,-1.0,-10.0,2.696799,10.636364,339,141,57,3,9,24,-208.875,-0.616211,4.628906,-0.742188,0.355469,501,1.477876,10,-1,1.350634,33717687.0,1483720000.0,1517438000.0,-5.050147,0,-12,3.836969,10.876106,6.0,6.0,5.0,1.0,1.0,4.0,-4.355469,-0.726074,-0.70166,-0.739258,0.014381,6.0,1.0,1.0,1.0,0.0,4887632.0,1517505000.0,1522393000.0,1.5,2.0,1.0,0.547723,10.833333
2,2016-08-01,C_ID_d639edf6cd,2,2,0,2016,8,549,2.0,1.0,1.0,1.0,1.0,1.0,-1.338967,-0.669484,-0.637515,-0.701453,0.045211,0.0,0.0,0.0,0.0,0.0,4922885.0,1487878000.0,1492801000.0,-11.0,-10.0,-12.0,1.414214,10.0,41,13,8,2,5,7,-27.828125,-0.678711,-0.145874,-0.72998,0.089233,0,0.0,0,0,0.0,35635623.0,1484123000.0,1519759000.0,-8.487805,0,-13,3.893083,9.853659,1.0,1.0,1.0,1.0,1.0,1.0,-0.700195,-0.700195,-0.700195,-0.700195,,0.0,0.0,0.0,0.0,,0.0,1524937000.0,1524937000.0,2.0,2.0,2.0,,9.0
3,2017-09-01,C_ID_186d6a6901,4,3,0,2017,9,153,,,,,,,,,,,,,,,,,,,,,,,,,77,50,25,5,7,13,-49.5,-0.642578,1.445312,-0.740723,0.261475,84,1.090909,3,-1,0.588974,13375339.0,1506443000.0,1519818000.0,-2.831169,0,-5,1.802065,9.792208,7.0,7.0,6.0,2.0,2.0,5.0,-4.65625,-0.665039,-0.566895,-0.734375,0.065918,5.0,0.714286,1.0,-1.0,0.755929,3625505.0,1520424000.0,1524049000.0,1.714286,2.0,1.0,0.48795,10.0
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,2017,11,92,5.0,2.0,2.0,2.0,2.0,2.0,20.352808,4.070562,7.193041,-0.512945,4.18495,38.0,7.6,12.0,1.0,6.024948,3274330.0,1516485000.0,1519759000.0,-0.4,0.0,-1.0,0.547723,9.4,128,65,26,6,6,17,-69.0625,-0.539551,6.992188,-0.746094,0.737305,144,1.125,12,1,1.003929,9405641.0,1510445000.0,1519850000.0,-1.320312,0,-3,1.02668,9.773438,36.0,36.0,17.0,5.0,5.0,10.0,-19.921875,-0.553711,0.450928,-0.739258,0.223877,35.0,0.972222,2.0,-1.0,0.376913,4949682.0,1519992000.0,1524941000.0,1.555556,2.0,1.0,0.503953,9.833333


# Write train and test sets for future use

In [None]:
# Persist the engineered (not yet scaled/encoded) frames and the target to the working directory.
for frame, csv_path in (
    (train_set, 'train_set.csv'),
    (test_set, 'test_set.csv'),
    (target, 'target.csv'),
):
    frame.to_csv(csv_path, index=False)

# Preprocessing

In [7]:
# Everything that is neither an identifier/raw date nor a registered numeric
# feature is treated as categorical. The exclusion set is built once instead of
# concatenating the lists again for every column; column order is preserved
# because the comprehension still walks train_set.columns.
non_categorical = set(numeric_col) | {'card_id', 'first_active_month'}
cat_col = [col for col in train_set.columns if col not in non_categorical]

# Defensive filter: make sure the identifier and raw date never count as numeric features.
numeric_col = [col for col in numeric_col if col not in ('card_id', 'first_active_month')]

In [8]:
def preprocess(trx_data):
    """Label-encode the categorical columns and rescale the numeric ones.

    Works on the module-level lists ``cat_col`` and ``numeric_col``:

    * every column in ``cat_col`` is converted to strings and replaced by the
      integer codes of a ``LabelEncoder`` fitted on that same column;
    * every column in ``numeric_col`` is coerced with ``pd.to_numeric`` and
      mapped to ``(max - value) / (max - min)`` using the min/max of the frame
      passed in. The scale is therefore inverted: the column minimum becomes 1
      and the maximum becomes 0.
    * a numeric column with a single distinct value is set to 0 and its name
      is printed.

    The frame is modified in place and also returned.
    """
    for name in cat_col:
        encoder = LabelEncoder()
        encoder.fit(trx_data[name].unique().astype('str'))
        trx_data[name] = encoder.transform(trx_data[name].astype('str'))

    for name in numeric_col:
        values = pd.to_numeric(trx_data[name])
        trx_data[name] = values
        low = values.min()
        high = values.max()
        if low != high:
            trx_data[name] = (high - values) / (high - low)
            continue
        # Constant column: nothing to scale, report it.
        trx_data[name] = 0
        print(name)

    return trx_data

# Replace every missing value with the sentinel -20, then encode/scale.
# preprocess is called once per frame, so the train and test frames are each
# encoded and scaled from their own values.
train_set_no_nan, test_set_no_nan = (
    frame.fillna(-20) for frame in (train_set, test_set)
)

train_set = preprocess(train_set_no_nan)
test_set = preprocess(test_set_no_nan)

# Split the given train set into a train set and a held-out test set

In [9]:
# Hold out 20% of the labelled data; X_test/y_test below refer to this hold-out
# part of train_set, not to the competition test_set.
X_train, X_test, y_train, y_test = train_test_split(
    train_set,
    target,
    test_size=0.2,
    random_state=24,
)

for label, split in (('X_train shape', X_train), ('X_test shape', X_test)):
    print(label, split.shape)

X_train shape (161533, 80)
X_test shape (40384, 80)


In [10]:
# Preview the training split after label encoding and scaling.
X_train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,history_installments_sum,history_installments_mean,history_installments_max,history_installments_min,history_installments_std,history_purchase_date_ptp,history_purchase_date_min,history_purchase_date_max,history_month_lag_mean,history_month_lag_max,history_month_lag_min,history_month_lag_std,history_month_diff_mean,auto_trx_count,auto_merchant_id_nunique,auto_merchant_category_id_nunique,auto_state_id_nunique,auto_city_id_nunique,auto_subsector_id_nunique,auto_purchase_amount_sum,auto_purchase_amount_mean,auto_purchase_amount_max,auto_purchase_amount_min,auto_purchase_amount_std,auto_installments_sum,auto_installments_mean,auto_installments_max,auto_installments_min,auto_installments_std,auto_purchase_date_ptp,auto_purchase_date_min,auto_purchase_date_max,auto_month_lag_mean,auto_month_lag_max,auto_month_lag_min,auto_month_lag_std,auto_month_diff_mean,new_trx_count,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean
165743,2017-06-01,C_ID_f9f784f2e2,1,0,0,6,8,0.892732,0.982353,0.733333,0.612903,0.322581,0.432432,0.428571,0.999832,0.999997,0.999997,0.999997,0.999762,0.984151,0.978691,0.977429,0.979392,0.971186,0.756066,0.012135,0.006264,0.257143,0.15,0.35,0.27327,0.289116,0.908876,0.800505,0.585106,0.947368,0.878788,0.454545,0.693359,0.997559,0.994141,0.999512,0.997559,0.823068,0.964863,0.98999,1.0,0.995245,0.387759,0.55173,0.000438,0.257898,0.0,0.583333,0.640733,0.932981,0.751938,0.751938,0.533333,0.382353,0.520833,0.386364,0.651855,0.699219,0.799805,0.699707,0.702148,0.967773,0.614005,0.97841,0.34375,0.901261,0.066801,0.003361,9.2e-05,0.030303,0.0,0.045455,0.01037,0.293536
119952,2017-11-01,C_ID_c36d60df2c,0,0,0,6,2,0.95972,0.986275,0.766667,0.66129,0.322581,0.432432,0.5,0.999832,0.999997,0.999997,0.999997,1.0,0.989599,0.979392,0.979392,0.979392,1.0,0.999999,0.005455,0.005456,0.15,0.15,0.15,1.0,0.285714,0.966864,0.89899,0.744681,0.842105,0.939394,0.575758,0.674805,0.995605,0.992676,0.999512,0.996094,0.942633,0.968887,0.996997,1.0,0.998176,0.774618,0.110719,0.006458,0.103455,0.0,0.166667,0.864439,0.937388,0.620155,0.620155,0.316667,0.264706,0.479167,0.25,0.645996,0.692383,0.738281,0.699707,0.684082,0.939453,0.607323,0.970559,0.34375,0.894379,0.040831,0.003376,1.6e-05,0.029781,0.0,0.045455,0.010788,0.292512
89339,2017-09-01,C_ID_42699e8b0c,3,0,0,6,11,0.933012,0.984967,0.755556,0.629032,0.322581,0.432432,0.452381,0.999832,0.999997,0.999997,0.999997,0.999761,0.98316,0.975793,0.968597,0.979392,0.963759,0.854852,0.008744,0.005251,0.2,0.15,0.25,0.280634,0.301587,0.987771,0.989899,0.946809,0.947368,0.984848,0.878788,0.668945,0.995117,0.978516,1.0,0.987305,0.964976,0.959973,0.987988,0.846154,0.99066,0.617788,0.285263,7.7e-05,0.206358,0.0,0.333333,0.741084,0.951049,0.829457,0.829457,0.633333,0.352941,0.541667,0.5,0.589355,0.665039,0.770996,0.674316,0.689941,0.969727,0.533144,0.968597,0.40625,0.857929,0.771857,0.002342,0.001544,0.022727,0.0,0.045455,0.0,0.287599
25589,2014-05-01,C_ID_750614dd76,2,2,1,3,7,0.399299,0.985621,0.755556,0.645161,0.290323,0.405405,0.47619,0.999832,0.999997,0.999997,0.999997,0.999764,0.990094,0.980373,0.980373,0.980373,0.972494,0.437682,0.020489,0.006954,0.375,0.2,0.55,0.145334,0.297619,0.99211,0.959596,0.882979,0.947368,0.954545,0.69697,0.667969,0.998535,0.997559,1.0,0.99707,0.994565,0.987272,1.0,0.923077,1.0,0.218255,0.825823,0.072956,0.404251,0.090909,0.833333,0.545224,0.940559,0.837209,0.837209,0.65,0.382353,0.5625,0.522727,0.610352,0.699707,0.800781,0.699707,1.0,0.980469,0.633838,0.980373,0.375,1.0,0.999996,1.3e-05,1.4e-05,0.0,0.0,0.0,1.0,0.311346
162026,2017-10-01,C_ID_18d2ec30fc,0,1,0,6,1,0.946147,0.984314,0.744444,0.629032,0.322581,0.432432,0.452381,0.999828,0.999996,0.999995,0.999997,0.999673,0.98316,0.976938,0.974485,0.979392,0.968524,0.748491,0.006309,0.000256,0.0625,0.0,0.15,0.271786,0.285714,0.964497,0.888889,0.755319,0.842105,0.924242,0.575758,0.660156,0.97168,0.927734,0.999512,0.953613,0.894928,0.954299,0.98999,1.0,0.990902,0.691486,0.231007,0.028882,0.143375,0.0,0.25,0.811096,0.933946,0.736434,0.736434,0.466667,0.323529,0.458333,0.272727,0.617676,0.690918,0.77002,0.700195,0.687988,0.951172,0.594607,0.968597,0.40625,0.885822,0.191935,0.00315,0.000319,0.022727,0.0,0.045455,0.00909,0.287599


In [11]:
# Preview the hold-out split after label encoding and scaling.
X_test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,history_installments_sum,history_installments_mean,history_installments_max,history_installments_min,history_installments_std,history_purchase_date_ptp,history_purchase_date_min,history_purchase_date_max,history_month_lag_mean,history_month_lag_max,history_month_lag_min,history_month_lag_std,history_month_diff_mean,auto_trx_count,auto_merchant_id_nunique,auto_merchant_category_id_nunique,auto_state_id_nunique,auto_city_id_nunique,auto_subsector_id_nunique,auto_purchase_amount_sum,auto_purchase_amount_mean,auto_purchase_amount_max,auto_purchase_amount_min,auto_purchase_amount_std,auto_installments_sum,auto_installments_mean,auto_installments_max,auto_installments_min,auto_installments_std,auto_purchase_date_ptp,auto_purchase_date_min,auto_purchase_date_max,auto_month_lag_mean,auto_month_lag_max,auto_month_lag_min,auto_month_lag_std,auto_month_diff_mean,new_trx_count,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean
110727,2017-12-01,C_ID_3d38c16da9,2,2,1,6,3,0.972855,0.986275,0.766667,0.66129,0.322581,0.432432,0.5,0.999832,0.999997,0.999997,0.999997,1.0,0.988608,0.977429,0.977429,0.977429,1.0,0.999999,0.001434,0.001435,0.0,0.0,0.0,1.0,0.285714,0.997239,0.982323,0.946809,0.894737,0.954545,0.878788,0.666504,0.996582,0.998535,1.0,0.999023,0.987923,0.964801,0.996997,1.0,0.992975,0.914627,0.010962,0.064263,0.069602,0.0,0.083333,0.882842,0.948718,1.0,1.0,1.0,1.0,1.0,1.0,0.717285,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
108935,2017-02-01,C_ID_baa868fe6e,2,2,1,6,4,0.840193,0.973203,0.688889,0.580645,0.258065,0.351351,0.404762,0.999834,0.999997,0.999997,0.999997,0.999764,0.975731,0.979018,0.976447,0.979392,0.971228,0.265861,0.018375,0.000705,0.152381,0.0,0.5,0.239468,0.286848,0.953057,0.878788,0.765957,0.842105,0.939394,0.666667,0.680664,0.998535,0.990723,1.0,0.99707,0.892512,0.961594,0.991992,1.0,0.995027,0.160892,0.845104,0.029205,0.406884,0.0,0.916667,0.596967,0.926891,0.790698,0.790698,0.566667,0.382353,0.5625,0.431818,0.631836,0.69873,0.79834,0.699707,0.70166,0.973633,0.61553,0.977429,0.40625,0.897046,0.25878,0.00308,0.000483,0.012987,0.0,0.045455,0.010584,0.287599
22893,2017-09-01,C_ID_2ce1c19fa6,2,0,1,6,11,0.933012,0.98366,0.755556,0.645161,0.290323,0.405405,0.47619,0.999832,0.999997,0.999997,0.999997,0.999765,0.988113,0.979588,0.979392,0.980373,0.971879,0.761053,0.008998,0.003247,0.09,0.05,0.25,0.253612,0.285714,0.976726,0.909091,0.776596,1.0,0.969697,0.636364,0.67334,0.998535,0.998535,0.999512,0.999512,0.994565,0.987272,1.0,0.923077,1.0,0.641601,0.312906,0.051371,0.186814,0.0,0.333333,0.778807,0.93947,0.821705,0.821705,0.616667,0.382353,0.541667,0.477273,0.618164,0.699219,0.800293,0.699707,0.702637,0.980469,0.633838,0.980373,0.375,0.902665,0.935895,0.002672,0.002448,0.045455,0.045455,0.045455,0.034148,0.287599
79140,2016-04-01,C_ID_03a8c5345e,1,1,0,5,6,0.706217,0.982353,0.733333,0.629032,0.322581,0.405405,0.452381,0.999832,0.999997,0.999997,0.999997,0.999764,0.989599,0.980233,0.979392,0.980373,0.971974,0.238509,0.018741,0.000413,0.107143,0.0,0.5,0.193894,0.285714,0.957396,0.858586,0.648936,0.842105,0.909091,0.545455,0.676758,0.995117,0.992188,0.999512,0.994629,0.994565,0.987272,1.0,0.923077,1.0,0.027925,0.9874,0.018507,0.571495,0.0,1.0,0.387071,0.927273,0.79845,0.806202,0.566667,0.382353,0.541667,0.454545,0.627441,0.698242,0.798828,0.699219,0.70166,0.980469,0.633838,0.980373,0.375,0.902665,0.527713,0.002589,0.000934,0.022727,0.0,0.045455,0.007697,0.291557
12651,2017-06-01,C_ID_a51a3996c2,2,1,1,6,8,0.892732,0.984314,0.755556,0.645161,0.322581,0.405405,0.47619,0.999832,0.999997,0.999997,0.999997,0.999765,0.990094,0.980373,0.980373,0.980373,0.972494,0.915257,0.014731,0.012692,0.1375,0.1,0.15,0.297762,0.166667,0.975148,0.914141,0.776596,1.0,0.939394,0.606061,0.67334,0.998535,0.998047,0.999512,0.999023,0.994565,0.987272,1.0,0.923077,1.0,0.767201,0.565307,0.420992,0.093154,0.0,0.166667,0.861649,0.546746,0.790698,0.790698,0.583333,0.382353,0.541667,0.454545,0.632812,0.699219,0.798828,0.700195,0.70166,0.980469,0.633838,0.980373,0.375,0.902665,0.566937,0.01133,0.009813,0.019481,0.0,0.045455,0.008335,0.168865


# Classic ML model to form a benchmark

In [12]:
# Baseline: LightGBM with default hyper-parameters on all model columns.
lgb_model = lgb.LGBMRegressor().fit(X_train[used_col], y_train.values)

lgb_pred_test = lgb_model.predict(X_test[used_col])
lgb_pred_train = lgb_model.predict(X_train[used_col])

# RMSE on the hold-out split first, then on the data the model was fitted on.
for label, truth, predicted in (
    ('test RMSE:', y_test, lgb_pred_test),
    ('train RMSE:', y_train, lgb_pred_train),
):
    print(label, mean_squared_error(truth, predicted) ** 0.5)

test RMSE: 3.5927786175842464
train RMSE: 3.4121657257070446


# Create embeddings for the categorical features

In [13]:
# Number of distinct codes per categorical column in train_set; used as the
# vocabulary size of the matching Embedding layer.
(f1_unique_val,
 f2_unique_val,
 f3_unique_val,
 year_unique_val,
 month_unique_val) = (
    train_set[name].nunique(dropna=False)
    for name in ('feature_1', 'feature_2', 'feature_3', 'year', 'month')
)

In [14]:
# One scalar integer input per label-encoded categorical feature.
f1_inp, f2_inp, f3_inp, year_inp, month_inp = (
    Input(shape=(1,), dtype='int64') for _ in range(5)
)

# Each input gets its own lightly L2-regularised embedding table, described as
# (vocabulary size, embedding width, input tensor).
f1_emb, f2_emb, f3_emb, year_emb, month_emb = (
    Embedding(vocab_size, width, input_length=1, embeddings_regularizer=l2(1e-6))(inp)
    for vocab_size, width, inp in (
        (f1_unique_val, 2, f1_inp),
        (f2_unique_val, 1, f2_inp),
        (f3_unique_val, 1, f3_inp),
        (year_unique_val, 3, year_inp),
        (month_unique_val, 4, month_inp),
    )
)

# Predicting the target using only the categorical features embeddings

In [15]:
# Regression network fed only by the five categorical embeddings.
x = concatenate([f1_emb,f2_emb,f3_emb,year_emb,month_emb])
x = Flatten()(x)
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.4)(x)
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.7)(x)
# Linear output: the target is a real-valued score, and a sigmoid would confine
# every prediction to the interval (0, 1).
x = Dense(1, activation='linear')(x)
emb_model = Model([f1_inp,f2_inp,f3_inp,year_inp,month_inp],x)

emb_model.compile(optimizer="RMSProp", loss=root_mean_squared_error)

# summary() prints the table itself and returns None, so it is not wrapped in print().
emb_model.summary()

# The list of input columns follows cat_col; its order has to match the order of
# the model inputs (feature_1, feature_2, feature_3, year, month).
emb_model.fit([X_train[col] for col in cat_col], y_train, epochs=5)

emb_pred_test = emb_model.predict([X_test[col] for col in cat_col])
emb_pred_train = emb_model.predict([X_train[col] for col in cat_col])

print('test RMSE', mean_squared_error(y_test, emb_pred_test) ** 0.5)
print('train RMSE', mean_squared_error(y_train, emb_pred_train) ** 0.5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

# Use the embeddings we got as a “feature extractor” for a LGBMRegressor model

In [16]:
# Use the flattened, concatenated embeddings of emb_model as features.
# The layer is looked up by type rather than by its position in
# `emb_model.layers`, so the lookup does not silently pick another layer if the
# network is edited.
# NOTE(review): this is expected to be the same layer the previous
# `emb_model.layers[11]` referred to (5 inputs, 5 embeddings, concatenate, flatten) - confirm.
flatten_layer = next(layer for layer in emb_model.layers if isinstance(layer, Flatten))
emb_output = flatten_layer.output

feature_model = Model(emb_model.input, emb_output)

feature_model.compile(optimizer = "RMSProp", loss = root_mean_squared_error)
# summary() prints the table itself and returns None, so it is not wrapped in print().
feature_model.summary()

featurs = feature_model.predict([X_train[col] for col in cat_col])
features_test = feature_model.predict([X_test[col] for col in cat_col])

# Fit a default LightGBM regressor on the embedding features (rebinds lgb_model).
lgb_model = lgb.LGBMRegressor()
lgb_model.fit(featurs, y_train.values)

lgb_pred_test = lgb_model.predict(features_test)
lgb_pred_train = lgb_model.predict(featurs)

print('test RMSE', mean_squared_error(y_test, lgb_pred_test) ** 0.5)
print('train RMSE', mean_squared_error(y_train, lgb_pred_train) ** 0.5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

# Add the rest of the features 

In [17]:
# Continuous branch: one input whose width equals the number of numeric features.
continuous_input = Input(shape=(len(numeric_col),))

# Categorical branch: drop the length-1 sequence axis of every embedding so it
# can be concatenated with the 2-D continuous input. The *_emb names are rebound
# to the reshaped tensors.
f1_emb, f2_emb, f3_emb, year_emb, month_emb = (
    Reshape((width,))(embedded)
    for width, embedded in (
        (2, f1_emb),
        (1, f2_emb),
        (1, f3_emb),
        (3, year_emb),
        (4, month_emb),
    )
)

# Carve a validation set (20%) out of the training split.
X_train_val, X_val, y_train_val, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=6
)
                         
# define function to create input to model
def get_input(data):
    """Build the list of model inputs from a feature frame.

    The first element is the block of numeric columns (``numeric_col``); it is
    followed by the five categorical columns in the order feature_1,
    feature_2, feature_3, year, month.
    """
    categorical_names = ('feature_1', 'feature_2', 'feature_3', 'year', 'month')
    return [data[numeric_col]] + [data[name] for name in categorical_names]

In [None]:
# Combined network: the numeric feature block concatenated with the five
# reshaped categorical embeddings, followed by a small dense stack and a linear
# regression output. The embedding tensors come from the Embedding layers
# created earlier in the notebook, so those layers are shared with emb_model.
x = concatenate([continuous_input,f1_emb,f2_emb,f3_emb,year_emb,month_emb])
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.4)(x)
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.7)(x)
x = Dense(1, activation='linear')(x)
emb_cont_model = Model([continuous_input,f1_inp,f2_inp,f3_inp,year_inp,month_inp],x)

rmsprop_opt = RMSprop(lr=0.005)
emb_cont_model.compile(optimizer = rmsprop_opt, loss = root_mean_squared_error)

# summary() prints the table itself and returns None, so it is not wrapped in print().
emb_cont_model.summary()

def set_callbacks(description='run1',patience=15,tb_base_logdir='./logs/'):
    """Assemble the Keras callbacks for one training run.

    Parameters
    ----------
    description : str
        Run label; used in the checkpoint file name and as the TensorBoard
        sub-directory.
    patience : int
        Patience passed to ``EarlyStopping`` (monitoring ``val_loss``).
    tb_base_logdir : str
        Prefix that ``description`` is appended to for the TensorBoard log dir.

    Returns
    -------
    list
        ``[ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau]``.
        The checkpoint keeps only the best result; the learning-rate
        reduction uses a fixed patience of 5.
    """
    checkpoint_path = 'best_model_weights_{}.h5'.format(description)
    tensorboard_dir = '{}{}'.format(tb_base_logdir, description)

    checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)
    early_stopping = EarlyStopping(patience=patience, monitor='val_loss')
    reduce_lr = ReduceLROnPlateau(patience=5)
    tensorboard = TensorBoard(log_dir=tensorboard_dir)

    return [checkpoint, early_stopping, tensorboard, reduce_lr]

# Train on the train/validation split with the default callbacks.
train_val_inputs = get_input(X_train_val)
history = emb_cont_model.fit(
    train_val_inputs,
    y_train_val,
    epochs=5,
    batch_size=16,
    validation_data=(get_input(X_val), y_val),
    callbacks=set_callbacks(),
)

# Predictions come from the model as it stands after the last epoch.
emb_cont_pred_test = emb_cont_model.predict(get_input(X_test))
emb_cont_pred_train = emb_cont_model.predict(train_val_inputs)

print('test RMSE', mean_squared_error(y_test, emb_cont_pred_test) ** 0.5)
print('train RMSE', mean_squared_error(y_train_val, emb_cont_pred_train) ** 0.5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I