In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

from keras.models import Model
from keras.layers import *
from keras.callbacks import *
from keras.regularizers import l2
from keras.optimizers import *
from keras.utils import to_categorical
import datetime
from sklearn.preprocessing import LabelEncoder
import gc
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from keras import backend as K

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['train.csv', 'merchants.csv', 'sample_submission.csv', 'test.csv', 'historical_transactions.csv', 'Data_Dictionary.xlsx', 'new_merchant_transactions.csv']


# Define global functions

In [2]:
# define function to reduce memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the smallest fitting dtype.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are narrowed to float16 or float32 when their range allows it, otherwise
    they are stored as float64. Columns of any other dtype are left untouched.

    NOTE(review): the float check only looks at the value range, so narrowing
    to float16/float32 can reduce precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a dtype's limit still fits it.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or all-NaN columns end up here.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that occupies zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# define function that calculates RMSE with Keras backend ops
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between ``y_pred`` and ``y_true``."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

# Read data

In [3]:
# Load the five input tables; the date columns are parsed while reading.
train_set = pd.read_csv("../input/train.csv", parse_dates=["first_active_month"])
test_set = pd.read_csv("../input/test.csv", parse_dates=["first_active_month"])
history_trx = pd.read_csv("../input/historical_transactions.csv", parse_dates=['purchase_date'])
new_trx = pd.read_csv("../input/new_merchant_transactions.csv", parse_dates=['purchase_date'])
merchants_set = pd.read_csv("../input/merchants.csv")

# Report (rows, columns) of every table that was loaded.
for table_name, table in [
    ("train", train_set),
    ("test", test_set),
    ("history_trx", history_trx),
    ("new_trx", new_trx),
    ("merchants", merchants_set),
]:
    print("shape of {} : ".format(table_name), table.shape)

shape of train :  (201917, 6)
shape of test :  (123623, 5)
shape of history_trx :  (29112361, 14)
shape of new_trx :  (1963031, 14)
shape of merchants :  (334696, 22)


# Feature extraction - collect information per card to form a card_id profile

In [4]:
# add 'year', 'month', and 'elapsed_time' features to the dataframe
# 'elapsed_time' is counted in days back from 2018-02-01 (1/2/2018 is the max date in train set)
elapsed_reference = pd.Timestamp('2018-02-01')
for df in [train_set, test_set]:
    df['year'] = df['first_active_month'].dt.year
    df['month'] = df['first_active_month'].dt.month
    # Subtract the datetime64 column directly: the result is a timedelta64
    # Series, so `.dt.days` is always available. Going through `.dt.date`
    # yields an object-dtype Series on which `.dt` can fail.
    df['elapsed_time'] = (elapsed_reference - df['first_active_month']).dt.days

# create set of columns name that is numeric
numeric_col = ['elapsed_time']

# split the train set to features and target
target = train_set.pop('target')

In [5]:
# define aggregation function that collects information from the transaction
# features to create a profile for each card_id
def agg_data_trx(trx_data, col_name, reference_date=None):
    """Aggregate a transaction table into one feature row per ``card_id``.

    NOTE: ``trx_data`` is modified in place: 'authorized_flag' is mapped to
    1/0, 'purchase_month' and 'month_diff' are added, the frame is passed
    through ``reduce_mem_usage`` and 'purchase_date' is replaced by a float
    (nanosecond integer times 1e-9). Call it once per freshly loaded frame.
    The produced feature names are also appended to the global ``numeric_col``.

    Parameters
    ----------
    trx_data : pandas.DataFrame
        Transactions containing the columns referenced below ('card_id',
        'authorized_flag', 'purchase_date', 'month_lag' and the aggregated
        columns).
    col_name : str
        Prefix for every produced feature column.
    reference_date : datetime-like, optional
        Date from which 'month_diff' is measured. Defaults to the current
        date and time, as before; pass a fixed date to make the feature
        reproducible between runs.

    Returns
    -------
    pandas.DataFrame
        One row per card: 'card_id', '<col_name>_trx_count' and one
        '<col_name>_<column>_<statistic>' column per aggregation.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()

    trx_data['authorized_flag'] = trx_data['authorized_flag'].map({'Y':1, 'N':0})

    trx_data['purchase_month'] = trx_data['purchase_date'].dt.month

    # Whole 30-day periods between the purchase and the reference date, shifted by month_lag.
    trx_data['month_diff'] = ((reference_date - trx_data['purchase_date']).dt.days)//30
    trx_data['month_diff'] += trx_data['month_lag']

    trx_data = reduce_mem_usage(trx_data)

    # Replace the column outright. `.loc[:, col] = values` tries to write into
    # the existing datetime64 column in newer pandas instead of swapping it
    # for the float values.
    trx_data['purchase_date'] = pd.DatetimeIndex(trx_data['purchase_date']).astype(np.int64) * 1e-9

    agg_func = {
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_date': [np.ptp, 'min', 'max'],
        'month_lag': ['mean', 'max', 'min', 'std'],
        'month_diff': ['mean']
    }

    agg_data = trx_data.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, statistic) MultiIndex into '<prefix>_<column>_<statistic>'.
    agg_data.columns = [col_name + '_' + '_'.join(col).strip() for col in agg_data.columns.values]
    agg_data.reset_index(inplace=True)

    trx_count = (trx_data.groupby('card_id').size().reset_index(name=col_name + '_trx_count'))

    agg_data = pd.merge(trx_count, agg_data, on='card_id', how='left')

    # Register the new features as numeric exactly once: a duplicated name
    # would make a later per-name loop process that column twice.
    agg_numeric_col = [col for col in agg_data.columns if col not in ['card_id']]
    numeric_col.extend(col for col in agg_numeric_col if col not in numeric_col)

    return agg_data

# build one feature row per card from each transaction table
history_trx_per_card = agg_data_trx(history_trx, 'history')
new_trx_per_card = agg_data_trx(new_trx, 'new')


# merge the new features for each card_id with the 3 basic features in train set and test set
train_set = pd.merge(train_set, history_trx_per_card, on='card_id', how='left')
test_set = pd.merge(test_set, history_trx_per_card, on='card_id', how='left')

train_set = pd.merge(train_set, new_trx_per_card, on='card_id', how='left')
test_set = pd.merge(test_set, new_trx_per_card, on='card_id', how='left')

# delete unnecessary dataframes to reduce memory usage
del history_trx_per_card
del new_trx_per_card
gc.collect()

# remove nan values from data set
# (each frame is filled from itself; the test frame was previously built from train_set)
train_set_no_nan = train_set.fillna(-1)
test_set_no_nan = test_set.fillna(-1)

# define the columns that are going to enter to the model
used_col = [col for col in train_set.columns if col not in ['card_id', 'first_active_month']]

Mem. usage decreased to 1610.30 Mb (54.7% reduction)
Mem. usage decreased to 104.84 Mb (56.2% reduction)


# Preprocessing

In [6]:
# Every column that is neither an identifier/date column nor registered as
# numeric is treated as categorical (order follows train_set.columns).
non_categorical = {'card_id', 'first_active_month'} | set(numeric_col)
cat_col = [col for col in train_set.columns if col not in non_categorical]

# Keep the numeric list free of the identifier/date columns as well.
numeric_col = [col for col in numeric_col if col not in ('card_id', 'first_active_month')]

In [7]:
def preprocess(trx_data):
    """Label-encode the categorical columns and rescale the numeric ones in place.

    Columns listed in the global ``cat_col`` are converted to strings and
    replaced by ``LabelEncoder`` codes. Columns listed in the global
    ``numeric_col`` are replaced by ``(max - value) / (max - min)``; a column
    whose min equals its max is set to 0 and its name is printed.

    NOTE(review): the scaling maps the column maximum to 0 and the minimum to
    1, and both the encoder and the min/max are computed from the frame that
    is passed in, so two frames are not put on a shared scale. Confirm that
    this is intended.

    Returns the same (modified) ``trx_data`` object.
    """
    for cat_col_name in cat_col:
        as_text = trx_data[cat_col_name].astype('str')
        trx_data[cat_col_name] = LabelEncoder().fit_transform(as_text)

    for numeric_col_name in numeric_col:
        values = pd.to_numeric(trx_data[numeric_col_name])
        lowest, highest = values.min(), values.max()
        if lowest == highest:
            # Constant column: nothing to scale.
            trx_data[numeric_col_name] = 0
            print(numeric_col_name)
            continue
        trx_data[numeric_col_name] = (highest - values) / (highest - lowest)

    return trx_data

# Encode and scale both NaN-free frames (train first, then test).
train_set, test_set = preprocess(train_set_no_nan), preprocess(test_set_no_nan)

In [8]:
# Preview the first rows of the preprocessed training frame
train_set.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,history_installments_sum,history_installments_mean,history_installments_max,history_installments_min,history_installments_std,history_purchase_date_ptp,history_purchase_date_min,history_purchase_date_max,history_month_lag_mean,history_month_lag_max,history_month_lag_min,history_month_lag_std,history_month_diff_mean,new_trx_count,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean
0,2017-06-01,C_ID_92a2005557,4,1,1,6,8,0.892732,0.91134,0.765743,0.574468,0.894737,0.909091,0.393939,0.99968,1.0,1.0,0.999744,1.0,0.994633,0.991154,0.998999,0.909091,0.999554,0.429034,0.513389,0.009176,0.313297,0.0,0.583333,0.669152,0.943195,0.781818,0.781818,0.634146,0.866667,0.862069,0.56,0.679688,0.990234,0.991211,0.993652,0.976562,0.994059,0.971926,0.999,0.923077,0.994637,0.112546,0.003195,8.6e-05,0.173913,0.0,0.333333,0.115021,0.534972
1,2017-01-01,C_ID_3d0044924f,3,0,0,6,0,0.82662,0.880412,0.644836,0.404255,0.894737,0.878788,0.30303,0.999688,1.0,0.999999,0.999846,1.0,0.772089,0.971953,0.98999,1.0,0.994533,0.07976,0.984417,0.071463,0.403338,0.0,0.916667,0.464546,0.867692,0.936364,0.936364,0.853659,0.866667,0.931034,0.8,0.630371,0.993652,0.996094,0.994629,0.979492,0.988119,0.943853,0.998,0.846154,0.994637,0.085351,0.005001,0.001797,0.166667,0.0,0.333333,0.093365,0.485507
2,2016-08-01,C_ID_d639edf6cd,1,1,0,5,10,0.759632,0.985911,0.969773,0.925532,0.947368,0.939394,0.818182,0.999658,1.0,1.0,0.999428,1.0,0.996284,0.991346,1.0,0.909091,1.0,0.027178,0.971646,0.003062,0.690634,0.0,1.0,0.459013,0.944544,0.981818,0.981818,0.95122,0.866667,0.931034,0.92,0.609863,0.993164,0.996094,0.993164,1.0,0.994059,0.971926,0.999,0.923077,1.0,1.0,0.000127,0.000128,0.0,0.0,0.0,1.0,0.565217
3,2017-09-01,C_ID_186d6a6901,3,2,0,6,11,0.933012,0.974227,0.876574,0.744681,0.789474,0.909091,0.636364,0.999661,1.0,1.0,0.999795,1.0,0.961602,0.97771,0.996997,1.0,0.997869,0.637469,0.263813,0.0013,0.226432,0.0,0.333333,0.755752,0.952048,0.927273,0.927273,0.829268,0.8,0.896552,0.76,0.631836,0.992188,0.994141,0.994629,0.978027,0.989109,0.951874,0.998,1.0,0.990584,0.321539,0.003087,0.00071,0.095238,0.0,0.333333,0.128379,0.52795
4,2017-11-01,C_ID_cdbd2c0db2,0,2,0,6,2,0.95972,0.954983,0.836272,0.734043,0.736842,0.924242,0.515152,0.999661,0.999999,0.999999,0.999974,0.999999,0.92114,0.974241,0.987988,0.818182,0.993136,0.746303,0.136902,0.000352,0.102174,0.0,0.166667,0.868486,0.951995,0.663636,0.663636,0.560976,0.6,0.793103,0.56,0.716797,0.990234,0.981445,0.994629,0.974609,0.959406,0.944633,0.997,1.0,0.992616,0.073739,0.00337,0.000125,0.148148,0.0,0.333333,0.119005,0.531401


# Split the given train set into train and test sets

In [9]:
# Hold out 20% of the labelled rows as a local test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(train_set, target, test_size=0.2, random_state=24)

In [10]:
# Report the (rows, columns) of the two splits
print('X_train shape', X_train.shape)
print('X_test shape', X_test.shape)

X_train shape (161533, 56)
X_test shape (40384, 56)


In [11]:
# Preview the first rows of the training split
X_train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,history_installments_sum,history_installments_mean,history_installments_max,history_installments_min,history_installments_std,history_purchase_date_ptp,history_purchase_date_min,history_purchase_date_max,history_month_lag_mean,history_month_lag_max,history_month_lag_min,history_month_lag_std,history_month_diff_mean,new_trx_count,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean
165743,2017-06-01,C_ID_f9f784f2e2,1,0,0,6,8,0.892732,0.918213,0.798489,0.574468,0.947368,0.878788,0.454545,0.999679,1.0,1.0,0.999544,1.0,0.874071,0.975929,0.98999,1.0,0.996176,0.387759,0.55173,0.000438,0.264798,0.0,0.583333,0.642373,0.943269,0.881818,0.881818,0.780488,0.866667,0.862069,0.68,0.651855,0.993164,0.995117,0.993652,0.978516,0.981188,0.941513,0.997,0.846154,0.993089,0.066801,0.003361,9.2e-05,0.222222,0.0,0.333333,0.125792,0.539855
119952,2017-11-01,C_ID_c36d60df2c,0,0,0,6,2,0.95972,0.97079,0.899244,0.744681,0.842105,0.939394,0.575758,0.999661,1.0,1.0,0.999528,1.0,0.960363,0.978846,0.996997,1.0,0.99854,0.774618,0.110719,0.006458,0.106003,0.0,0.166667,0.863084,0.959328,0.727273,0.727273,0.463415,0.6,0.793103,0.44,0.645996,0.983398,0.918945,0.993652,0.953613,0.952475,0.931268,0.989,0.846154,0.985507,0.040831,0.003376,1.6e-05,0.218391,0.0,0.333333,0.130854,0.532234
89339,2017-09-01,C_ID_42699e8b0c,3,0,0,6,11,0.933012,0.988316,0.989924,0.946809,0.947368,0.984848,0.878788,0.999656,1.0,0.999999,0.999846,1.0,0.970273,0.969471,0.987988,0.818182,0.990392,0.617788,0.285263,7.7e-05,0.217673,0.0,0.333333,0.744954,0.976496,0.972727,0.972727,0.926829,0.8,0.896552,0.88,0.589355,0.944824,0.959961,0.958008,0.961914,0.983168,0.817522,0.987,1.0,0.945342,0.771859,0.002342,0.001544,0.166667,0.0,0.333333,0.0,0.521739
25589,2014-05-01,C_ID_750614dd76,2,2,1,3,7,0.399299,0.99244,0.959698,0.882979,0.947368,0.954545,0.69697,0.999655,1.0,1.0,0.999913,1.0,0.996284,0.991346,1.0,0.909091,1.0,0.218255,0.825824,0.072956,0.424262,0.090909,0.833333,0.531786,0.942308,0.981818,0.981818,0.95122,0.866667,0.931034,0.92,0.610352,0.993652,0.996582,0.993652,1.0,0.994059,0.971926,0.999,0.923077,1.0,1.0,1.3e-05,1.4e-05,0.0,0.0,0.0,1.0,0.565217
162026,2017-10-01,C_ID_18d2ec30fc,0,1,0,6,1,0.946147,0.967698,0.889169,0.755319,0.842105,0.924242,0.575758,0.999644,0.999997,0.999998,0.999564,0.999998,0.922378,0.968039,0.98999,1.0,0.992489,0.675289,0.231007,0.011469,0.142854,0.0,0.25,0.812234,0.94391,0.863636,0.863636,0.682927,0.733333,0.758621,0.48,0.617676,0.981445,0.958008,0.994629,0.958496,0.964356,0.911769,0.987,1.0,0.976078,0.191936,0.00315,0.000319,0.166667,0.0,0.333333,0.110264,0.52795


In [12]:
# Preview the first rows of the held-out split
X_test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,history_installments_sum,history_installments_mean,history_installments_max,history_installments_min,history_installments_std,history_purchase_date_ptp,history_purchase_date_min,history_purchase_date_max,history_month_lag_mean,history_month_lag_max,history_month_lag_min,history_month_lag_std,history_month_diff_mean,new_trx_count,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean
110727,2017-12-01,C_ID_3d38c16da9,2,2,1,6,3,0.972855,0.997251,0.982368,0.946809,0.894737,0.954545,0.878788,0.999654,1.0,1.0,0.999949,1.0,0.990504,0.973846,0.996997,1.0,0.994291,0.914627,0.010962,0.064263,0.063122,0.0,0.083333,0.884154,0.946154,1.0,1.0,1.0,1.0,1.0,1.0,0.611816,1.0,1.0,1.0,1.0,0.99505,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
108935,2017-02-01,C_ID_baa868fe6e,2,2,1,6,4,0.840193,0.95189,0.876574,0.765957,0.842105,0.939394,0.666667,0.999669,1.0,1.0,0.999913,1.0,0.914533,0.973917,0.991992,1.0,0.996098,0.160892,0.845104,0.029205,0.38609,0.0,0.916667,0.597288,0.937161,0.927273,0.927273,0.829268,0.866667,0.931034,0.76,0.631836,0.992188,0.993652,0.993652,0.978027,0.987129,0.943853,0.996,1.0,0.988445,0.258781,0.00308,0.000483,0.095238,0.0,0.333333,0.128379,0.534161
22893,2017-09-01,C_ID_2ce1c19fa6,2,0,1,6,11,0.933012,0.978007,0.906801,0.765957,0.947368,0.954545,0.606061,0.99966,1.0,1.0,0.999846,1.0,0.994633,0.990589,0.998999,0.909091,0.99913,0.641601,0.312906,0.051371,0.185187,0.0,0.333333,0.778299,0.948718,0.963636,0.963636,0.902439,0.866667,0.896552,0.84,0.618164,0.993164,0.996094,0.993652,0.979492,0.994059,0.971926,0.999,0.923077,0.994637,0.935898,0.002672,0.002448,0.333333,0.333333,0.333333,0.414214,0.521739
79140,2016-04-01,C_ID_03a8c5345e,1,1,0,5,6,0.706217,0.960481,0.858942,0.648936,0.842105,0.909091,0.545455,0.999664,1.0,1.0,0.999718,1.0,0.995871,0.991239,0.998999,0.909091,0.999665,0.027905,0.9874,0.018485,0.552682,0.0,1.0,0.371254,0.938856,0.936364,0.945455,0.829268,0.866667,0.896552,0.8,0.627441,0.991699,0.994141,0.993164,0.978027,0.994059,0.971926,0.999,0.923077,0.994637,0.527715,0.002589,0.000934,0.166667,0.0,0.333333,0.093365,0.528986
12651,2017-06-01,C_ID_a51a3996c2,2,1,1,6,8,0.892732,0.976976,0.914358,0.776596,1.0,0.939394,0.606061,0.999661,1.0,1.0,0.999795,1.0,0.996284,0.991346,1.0,0.909091,1.0,0.767201,0.565307,0.420992,0.101342,0.0,0.166667,0.856389,0.556299,0.927273,0.927273,0.853659,0.866667,0.896552,0.8,0.632812,0.993164,0.994141,0.994629,0.978027,0.994059,0.971926,0.999,0.923077,0.994637,0.566939,0.01133,0.009813,0.142857,0.0,0.333333,0.101098,0.304348


# Classic ML model to form a benchmark

In [13]:
# Baseline: a LightGBM regressor with default settings on all model columns.
lgb_model = lgb.LGBMRegressor().fit(X_train[used_col], y_train.values)

lgb_pred = lgb_model.predict(X_test[used_col])

# RMSE on the held-out split
benchmark_mse = mean_squared_error(y_test, lgb_pred)
print('RMSE:', benchmark_mse ** 0.5)

RMSE: 3.6071903156789533


# Create embeddings for the categorical features

In [14]:
# Number of distinct values per categorical column (used as embedding vocabulary sizes)
f1_unique_val, f2_unique_val, f3_unique_val, year_unique_val, month_unique_val = [
    len(train_set[name].unique())
    for name in ['feature_1', 'feature_2', 'feature_3', 'year', 'month']
]

In [15]:
# One single-integer input per categorical feature
# (order: feature_1, feature_2, feature_3, year, month)
f1_inp, f2_inp, f3_inp, year_inp, month_inp = [
    Input(shape=(1,), dtype='int64') for _ in range(5)
]

# (vocabulary size, embedding width, input tensor) for each categorical feature
embedding_specs = [
    (f1_unique_val, 2, f1_inp),
    (f2_unique_val, 1, f2_inp),
    (f3_unique_val, 1, f3_inp),
    (year_unique_val, 3, year_inp),
    (month_unique_val, 4, month_inp),
]
f1_emb, f2_emb, f3_emb, year_emb, month_emb = [
    Embedding(vocab_size, width, input_length=1, embeddings_regularizer=l2(1e-6))(inp)
    for vocab_size, width, inp in embedding_specs
]

# Predicting the target using only the categorical features embeddings

In [16]:
# Regression network fed only by the categorical embeddings.
x = concatenate([f1_emb,f2_emb,f3_emb,year_emb,month_emb])
x = Flatten()(x)
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.4)(x)
x = BatchNormalization()(x)
x = Dense(10,activation='relu')(x)
x = Dense(10,activation='relu')(x)
x = Dropout(0.7)(x)
# Linear output: the network regresses an unbounded target, which a sigmoid
# (limited to the 0..1 range) cannot represent.
x = Dense(1, activation='linear')(x)
emb_model = Model([f1_inp,f2_inp,f3_inp,year_inp,month_inp],x)

emb_model.compile(optimizer="RMSProp", loss=root_mean_squared_error)

# summary() prints the table itself; wrapping it in print() adds a stray "None".
emb_model.summary()

# cat_col must list the columns in the same order as the model inputs
# (feature_1, feature_2, feature_3, year, month).
emb_model.fit([X_train[col] for col in cat_col], y_train, epochs=5)

emb_pred = emb_model.predict([X_test[col] for col in cat_col])

print('RMSE', mean_squared_error(y_test, emb_pred) ** 0.5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [17]:
# Preview the first rows of the training split again
X_train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,history_installments_sum,history_installments_mean,history_installments_max,history_installments_min,history_installments_std,history_purchase_date_ptp,history_purchase_date_min,history_purchase_date_max,history_month_lag_mean,history_month_lag_max,history_month_lag_min,history_month_lag_std,history_month_diff_mean,new_trx_count,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean
165743,2017-06-01,C_ID_f9f784f2e2,1,0,0,6,8,0.892732,0.918213,0.798489,0.574468,0.947368,0.878788,0.454545,0.999679,1.0,1.0,0.999544,1.0,0.874071,0.975929,0.98999,1.0,0.996176,0.387759,0.55173,0.000438,0.264798,0.0,0.583333,0.642373,0.943269,0.881818,0.881818,0.780488,0.866667,0.862069,0.68,0.651855,0.993164,0.995117,0.993652,0.978516,0.981188,0.941513,0.997,0.846154,0.993089,0.066801,0.003361,9.2e-05,0.222222,0.0,0.333333,0.125792,0.539855
119952,2017-11-01,C_ID_c36d60df2c,0,0,0,6,2,0.95972,0.97079,0.899244,0.744681,0.842105,0.939394,0.575758,0.999661,1.0,1.0,0.999528,1.0,0.960363,0.978846,0.996997,1.0,0.99854,0.774618,0.110719,0.006458,0.106003,0.0,0.166667,0.863084,0.959328,0.727273,0.727273,0.463415,0.6,0.793103,0.44,0.645996,0.983398,0.918945,0.993652,0.953613,0.952475,0.931268,0.989,0.846154,0.985507,0.040831,0.003376,1.6e-05,0.218391,0.0,0.333333,0.130854,0.532234
89339,2017-09-01,C_ID_42699e8b0c,3,0,0,6,11,0.933012,0.988316,0.989924,0.946809,0.947368,0.984848,0.878788,0.999656,1.0,0.999999,0.999846,1.0,0.970273,0.969471,0.987988,0.818182,0.990392,0.617788,0.285263,7.7e-05,0.217673,0.0,0.333333,0.744954,0.976496,0.972727,0.972727,0.926829,0.8,0.896552,0.88,0.589355,0.944824,0.959961,0.958008,0.961914,0.983168,0.817522,0.987,1.0,0.945342,0.771859,0.002342,0.001544,0.166667,0.0,0.333333,0.0,0.521739
25589,2014-05-01,C_ID_750614dd76,2,2,1,3,7,0.399299,0.99244,0.959698,0.882979,0.947368,0.954545,0.69697,0.999655,1.0,1.0,0.999913,1.0,0.996284,0.991346,1.0,0.909091,1.0,0.218255,0.825824,0.072956,0.424262,0.090909,0.833333,0.531786,0.942308,0.981818,0.981818,0.95122,0.866667,0.931034,0.92,0.610352,0.993652,0.996582,0.993652,1.0,0.994059,0.971926,0.999,0.923077,1.0,1.0,1.3e-05,1.4e-05,0.0,0.0,0.0,1.0,0.565217
162026,2017-10-01,C_ID_18d2ec30fc,0,1,0,6,1,0.946147,0.967698,0.889169,0.755319,0.842105,0.924242,0.575758,0.999644,0.999997,0.999998,0.999564,0.999998,0.922378,0.968039,0.98999,1.0,0.992489,0.675289,0.231007,0.011469,0.142854,0.0,0.25,0.812234,0.94391,0.863636,0.863636,0.682927,0.733333,0.758621,0.48,0.617676,0.981445,0.958008,0.994629,0.958496,0.964356,0.911769,0.987,1.0,0.976078,0.191936,0.00315,0.000319,0.166667,0.0,0.333333,0.110264,0.52795


# Use the embeddings we got as a “feature extractor” for a LGBMRegressor model

In [18]:
# Tap the output of the layer at index 11 of emb_model.
# NOTE(review): presumably the Flatten layer holding the concatenated
# embeddings - confirm against emb_model.summary().
emb_output = emb_model.layers[11].output

feature_model = Model(emb_model.input, emb_output)
feature_model.compile(optimizer = "RMSProp", loss = root_mean_squared_error)
print(feature_model.summary())

# Turn the categorical columns of both splits into the tapped layer's activations.
train_cat_inputs = [X_train[col] for col in cat_col]
test_cat_inputs = [X_test[col] for col in cat_col]
featurs = feature_model.predict(train_cat_inputs)
features_test = feature_model.predict(test_cat_inputs)

# Fit LightGBM on those activations and score it on the held-out split.
lgb_model = lgb.LGBMRegressor().fit(featurs, y_train.values)
lgb_pred = lgb_model.predict(features_test)

embedding_lgb_mse = mean_squared_error(y_test, lgb_pred)
print('RMSE', embedding_lgb_mse ** 0.5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [19]:
# Preview the first rows of the training split again
X_train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,history_trx_count,history_merchant_id_nunique,history_merchant_category_id_nunique,history_state_id_nunique,history_city_id_nunique,history_subsector_id_nunique,history_purchase_amount_sum,history_purchase_amount_mean,history_purchase_amount_max,history_purchase_amount_min,history_purchase_amount_std,history_installments_sum,history_installments_mean,history_installments_max,history_installments_min,history_installments_std,history_purchase_date_ptp,history_purchase_date_min,history_purchase_date_max,history_month_lag_mean,history_month_lag_max,history_month_lag_min,history_month_lag_std,history_month_diff_mean,new_trx_count,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean
165743,2017-06-01,C_ID_f9f784f2e2,1,0,0,6,8,0.892732,0.918213,0.798489,0.574468,0.947368,0.878788,0.454545,0.999679,1.0,1.0,0.999544,1.0,0.874071,0.975929,0.98999,1.0,0.996176,0.387759,0.55173,0.000438,0.264798,0.0,0.583333,0.642373,0.943269,0.881818,0.881818,0.780488,0.866667,0.862069,0.68,0.651855,0.993164,0.995117,0.993652,0.978516,0.981188,0.941513,0.997,0.846154,0.993089,0.066801,0.003361,9.2e-05,0.222222,0.0,0.333333,0.125792,0.539855
119952,2017-11-01,C_ID_c36d60df2c,0,0,0,6,2,0.95972,0.97079,0.899244,0.744681,0.842105,0.939394,0.575758,0.999661,1.0,1.0,0.999528,1.0,0.960363,0.978846,0.996997,1.0,0.99854,0.774618,0.110719,0.006458,0.106003,0.0,0.166667,0.863084,0.959328,0.727273,0.727273,0.463415,0.6,0.793103,0.44,0.645996,0.983398,0.918945,0.993652,0.953613,0.952475,0.931268,0.989,0.846154,0.985507,0.040831,0.003376,1.6e-05,0.218391,0.0,0.333333,0.130854,0.532234
89339,2017-09-01,C_ID_42699e8b0c,3,0,0,6,11,0.933012,0.988316,0.989924,0.946809,0.947368,0.984848,0.878788,0.999656,1.0,0.999999,0.999846,1.0,0.970273,0.969471,0.987988,0.818182,0.990392,0.617788,0.285263,7.7e-05,0.217673,0.0,0.333333,0.744954,0.976496,0.972727,0.972727,0.926829,0.8,0.896552,0.88,0.589355,0.944824,0.959961,0.958008,0.961914,0.983168,0.817522,0.987,1.0,0.945342,0.771859,0.002342,0.001544,0.166667,0.0,0.333333,0.0,0.521739
25589,2014-05-01,C_ID_750614dd76,2,2,1,3,7,0.399299,0.99244,0.959698,0.882979,0.947368,0.954545,0.69697,0.999655,1.0,1.0,0.999913,1.0,0.996284,0.991346,1.0,0.909091,1.0,0.218255,0.825824,0.072956,0.424262,0.090909,0.833333,0.531786,0.942308,0.981818,0.981818,0.95122,0.866667,0.931034,0.92,0.610352,0.993652,0.996582,0.993652,1.0,0.994059,0.971926,0.999,0.923077,1.0,1.0,1.3e-05,1.4e-05,0.0,0.0,0.0,1.0,0.565217
162026,2017-10-01,C_ID_18d2ec30fc,0,1,0,6,1,0.946147,0.967698,0.889169,0.755319,0.842105,0.924242,0.575758,0.999644,0.999997,0.999998,0.999564,0.999998,0.922378,0.968039,0.98999,1.0,0.992489,0.675289,0.231007,0.011469,0.142854,0.0,0.25,0.812234,0.94391,0.863636,0.863636,0.682927,0.733333,0.758621,0.48,0.617676,0.981445,0.958008,0.994629,0.958496,0.964356,0.911769,0.987,1.0,0.976078,0.191936,0.00315,0.000319,0.166667,0.0,0.333333,0.110264,0.52795


# Add the rest of the features 

In [20]:
# Input carrying all numeric features at once (one slot per numeric column)
continuous_input = Input(shape=(len(numeric_col),))

# Reshape each embedding output to a flat vector of its embedding width
f1_emb, f2_emb, f3_emb, year_emb, month_emb = [
    Reshape((width,))(embedding)
    for width, embedding in [
        (2, f1_emb),
        (1, f2_emb),
        (1, f3_emb),
        (3, year_emb),
        (4, month_emb),
    ]
]

# Carve a validation set (20%, seeded) out of the training split
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=6)
                         
# define function that builds the list of model inputs from a frame
def get_input(data):
    """Return ``[numeric block, feature_1, feature_2, feature_3, year, month]`` taken from ``data``.

    The numeric block is ``data`` restricted to the global ``numeric_col``;
    the remaining entries are the individual categorical columns.
    """
    categorical_names = ['feature_1', 'feature_2', 'feature_3', 'year', 'month']
    return [data[numeric_col]] + [data[name] for name in categorical_names]

In [25]:
# Network over the numeric features concatenated with the reshaped embeddings.
x = concatenate([continuous_input,f1_emb,f2_emb,f3_emb,year_emb,month_emb])
# Layers are created and applied in this exact order.
for layer in [
    BatchNormalization(),
    Dense(10,activation='relu'),
    Dense(10,activation='relu'),
    Dropout(0.4),
    BatchNormalization(),
    Dense(10,activation='relu'),
    Dense(10,activation='relu'),
    Dropout(0.7),
    Dense(1, activation='linear'),
]:
    x = layer(x)
emb_cont_model = Model([continuous_input,f1_inp,f2_inp,f3_inp,year_inp,month_inp],x)

# RMSprop with an explicit learning rate, optimising the RMSE loss.
rmsprop_opt = RMSprop(lr=0.005)
emb_cont_model.compile(optimizer = rmsprop_opt, loss = root_mean_squared_error)

print(emb_cont_model.summary())

def set_callbacks(description='run1',patience=15,tb_base_logdir='./logs/'):
    """Build the list of Keras callbacks used for a training run.

    Parameters
    ----------
    description : str
        Run label; used in the checkpoint file name and the TensorBoard log directory.
    patience : int
        Patience passed to the early-stopping callback (monitoring 'val_loss').
    tb_base_logdir : str
        Prefix of the TensorBoard log directory.

    Returns
    -------
    list
        ``[ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau]``.
    """
    checkpoint = ModelCheckpoint('best_model_weights_{}.h5'.format(description), save_best_only=True)
    early_stop = EarlyStopping(patience=patience, monitor='val_loss')
    reduce_lr = ReduceLROnPlateau(patience=5)
    tensorboard = TensorBoard(log_dir='{}{}'.format(tb_base_logdir, description))
    return [checkpoint, early_stop, tensorboard, reduce_lr]

# Train on the train/validation split, then score on the held-out split.
train_inputs = get_input(X_train_val)
val_inputs = get_input(X_val)
history = emb_cont_model.fit(
    train_inputs,
    y_train_val,
    epochs=5,
    batch_size=16,
    validation_data=(val_inputs, y_val),
    callbacks=set_callbacks(),
)

emb_cont_pred = emb_cont_model.predict(get_input(X_test))

emb_cont_mse = mean_squared_error(y_test, emb_cont_pred)
print('RMSE', emb_cont_mse ** 0.5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

# Write sample submission file

In [None]:
def write_sample_submission(model, features=None):
    """Predict on the global ``test_set`` and write the submission CSV.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted model used to produce the predictions.
    features : optional
        Input handed to ``model.predict``. Defaults to ``test_set[used_col]``,
        i.e. the model columns without 'card_id' / 'first_active_month'
        (the whole frame, identifiers included, was passed before). Models
        that expect a different input must pass it explicitly, e.g.
        ``get_input(test_set)`` for the multi-input Keras model.

    Writes 'sample_submmision.csv' (columns 'card_id' and 'target') to the
    current directory; returns nothing.
    """
    if features is None:
        features = test_set[used_col]
    # Flatten so that predictions shaped (n, 1) fit into a single column.
    y_pred_sample_submission = np.ravel(model.predict(features))
    sample_sub = pd.DataFrame()
    sample_sub['card_id'] = test_set['card_id']
    sample_sub['target'] = y_pred_sample_submission
    # NOTE(review): the file name is kept as is (spelling included) in case something reads it.
    sample_sub.to_csv('sample_submmision.csv', index=False)