# **Elo Merchant Category Recommendation**



---



In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn import model_selection, preprocessing, metrics
import datetime
import datetime as dt
from pandas import DataFrame
from math import sqrt
from tensorflow.keras.models import Model, load_model
import pickle
import warnings
warnings.filterwarnings("ignore")

In [None]:
def remove_files():
  '''
  This function is used to remove unnecessary files from the downloaded dataset.
  '''
  !rm -rf merchants.csv
  !rm -rf test.csv
  !rm -rf sample_submission.csv
  !rm -rf Data Dictionary.xlsx
  !rm -rf Data_Dictionary.xlsx

In [None]:
def download_dataset():
  '''
  Download the competition dataset from Kaggle and drop the files that are not needed.

  Steps, in order: pip-install the `kaggleDownloader` package through an IPython
  shell escape, call its `get_dataset()` helper, then call `remove_files()`.

  NOTE(review): `get_dataset()` is called without arguments, so how it picks the
  competition and obtains Kaggle credentials is not visible here - presumably it
  prompts interactively; confirm before running unattended.
  '''
  !pip install kaggleDownloader
  from kaggleDownloader import get_dataset
  get_dataset()
  remove_files()

In [None]:
#ref: https://www.kaggle.com/fabiendaniel/elo-world
def reduce_mem_usage(df, verbose=True):
    '''
    Downcast the numeric columns of `df` and return the same frame.

    Each int16/int32/int64/float16/float32/float64 column is cast to the
    narrowest dtype of its family whose limits lie strictly outside the
    column's min and max. Columns are overwritten on `df` itself. When
    `verbose` is true the resulting size and the percentage saved are printed.
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    mb = 1024**2

    start_mem = df.memory_usage().sum() / mb
    for name in df.columns:
        kind = df[name].dtypes
        if kind not in numerics:
            continue

        lo, hi = df[name].min(), df[name].max()
        if str(kind)[:3] == 'int':
            # First integer width that strictly contains [lo, hi]; when none
            # qualifies the column is left exactly as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lo > limits.min and hi < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            # Same search for floats, with float64 as the fallback.
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if lo > limits.min and hi < limits.max:
                    chosen = candidate
                    break
            df[name] = df[name].astype(chosen)

    end_mem = df.memory_usage().sum() / mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
def loading_datasets():
  '''
  Read the two transaction CSVs from the working directory and shrink them.

  Both files are read first, then each frame is passed through
  reduce_mem_usage. Returns (old_transactions, new_transactions).
  '''
  print('Loading datasets...')
  csv_names = ('historical_transactions.csv', 'new_merchant_transactions.csv')
  frames = [pd.read_csv(csv_name) for csv_name in csv_names]
  old_transactions, new_transactions = (reduce_mem_usage(frame) for frame in frames)

  for message in ('All dataset loaded successfully!', '='*55):
    print(message)
  return old_transactions, new_transactions

In [None]:
def data_preprocessing(card_details_train, old_transactions, new_transactions):
  '''
  Impute missing values and integer-encode the categorical columns of both
  transaction frames.

  For each transaction frame:
    * `category_2`, `category_3` and `merchant_id` have their missing values
      replaced by the column's most frequent value;
    * `category_1` and `authorized_flag` are mapped from 'Y'/'N' to 0/1;
    * `category_3` is mapped from 'A'/'B'/'C' to 0/1/2.

  Both transaction frames are modified (columns are reassigned on the objects
  passed in) and returned; `card_details_train` is returned untouched.

  Raises whatever `idxmax` raises when one of the imputed columns has no
  non-missing value at all.
  '''
  print('Data preprocessing...')

  # NOTE(review): the historical frame encodes N->0 / Y->1 while the new-merchant
  # frame encodes Y->0 / N->1. The mappings are kept exactly as they were because
  # the saved model may have been trained on them - confirm whether the inversion
  # is intentional before unifying.
  per_frame_flag_codes = (
      (old_transactions, {'N':0, 'Y':1}),
      (new_transactions, {'Y':0, 'N':1}),
  )

  for frame, flag_codes in per_frame_flag_codes:
    # Missing value imputation with the most frequent value. The result is
    # assigned back instead of using `frame[col].fillna(..., inplace=True)`:
    # that chained form operates on a temporary and does not update the frame
    # when pandas Copy-on-Write is active.
    for col in ('category_2', 'category_3', 'merchant_id'):
      most_frequent = frame[col].value_counts().idxmax()
      frame[col] = frame[col].fillna(most_frequent)

    # Categorical feature encoding (same assign-back pattern).
    frame['category_1'] = frame['category_1'].replace(flag_codes)
    frame['category_3'] = frame['category_3'].replace({'A':0, 'B':1, 'C':2})
    frame['authorized_flag'] = frame['authorized_flag'].replace(flag_codes)

  print('All data preprocessed!')
  print('='*55)
  return card_details_train, old_transactions, new_transactions

In [None]:
def fe_card_details_train(card_details_train, reference_date=None):
  '''
  Add calendar features derived from `first_active_month` to the card frame.

  Parameters:
    card_details_train: frame with a `first_active_month` column (anything
      `pd.to_datetime` accepts). It is modified in place and returned.
    reference_date: moment against which `elapsed_time` (in days) is measured.
      Defaults to the current date and time, which makes `elapsed_time` depend
      on when the function runs; pass a fixed value for reproducible features.

  Added columns, in this order: month, year, dayofweek, weekofyear, elapsed_time.
  '''
  if reference_date is None:
    reference_date = datetime.datetime.today()

  card_details_train['first_active_month'] = pd.to_datetime(card_details_train['first_active_month'])
  first_active = card_details_train['first_active_month']

  card_details_train['month'] = first_active.dt.month
  card_details_train['year'] = first_active.dt.year
  card_details_train['dayofweek'] = first_active.dt.dayofweek
  # `.dt.weekofyear` no longer exists in pandas 2.x; the ISO calendar week is its
  # documented replacement. Cast away the nullable UInt32 dtype it returns so the
  # column is a plain int (or float when dates are missing).
  iso_week = first_active.dt.isocalendar().week
  card_details_train['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
  card_details_train['elapsed_time'] = (reference_date - first_active).dt.days

  print('FE 1 completed!')
  return card_details_train

In [None]:
#ref: https://www.kaggle.com/chauhuynh/my-first-kernel-3-699/
def fe_old_transactions(old_transactions, reference_date=None):
  '''
  Build per-card aggregate features from the historical transactions.

  Parameters:
    old_transactions: transaction frame with at least card_id, purchase_date,
      month_lag, purchase_amount, installments, authorized_flag, category_1,
      category_2, category_3, subsector_id, merchant_id and
      merchant_category_id. Helper columns are added to it in place.
    reference_date: moment against which `month_diff` and
      `old_purchase_date_uptonow` are measured. Defaults to the current date
      and time, which makes those features depend on when the function runs;
      pass a fixed value for reproducible features.

  Returns a new frame with one row per card_id and columns named
  `old_<column>_<aggregation>`, plus old_purchase_date_diff,
  old_purchase_date_average and old_purchase_date_uptonow. The aggregation
  order below fixes the output column order.

  NOTE(review): `price` and `amount_month_ratio` divide by `installments` and
  `month_diff` without guarding against zero - confirm that is acceptable.
  '''
  if reference_date is None:
    reference_date = datetime.datetime.today()

  old_transactions['purchase_date'] = pd.to_datetime(old_transactions['purchase_date'])
  purchase_date = old_transactions['purchase_date']

  old_transactions['year'] = purchase_date.dt.year
  # `.dt.weekofyear` no longer exists in pandas 2.x; the ISO calendar week is its
  # documented replacement. Cast away the nullable UInt32 dtype it returns.
  iso_week = purchase_date.dt.isocalendar().week
  old_transactions['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
  old_transactions['month'] = purchase_date.dt.month
  old_transactions['dayofweek'] = purchase_date.dt.dayofweek
  old_transactions['day'] = purchase_date.dt.day
  old_transactions['weekday'] = purchase_date.dt.weekday
  old_transactions['weekend'] = (purchase_date.dt.weekday >= 5).astype(int)
  old_transactions['hour'] = purchase_date.dt.hour
  # Whole 30-day periods between the purchase and the reference date, shifted by month_lag.
  old_transactions['month_diff'] = ((reference_date - purchase_date).dt.days)//30
  old_transactions['month_diff'] += old_transactions['month_lag']
  old_transactions['duration'] = old_transactions['purchase_amount']*old_transactions['month_diff']
  old_transactions['amount_month_ratio'] = old_transactions['purchase_amount']/old_transactions['month_diff']
  old_transactions['price'] = old_transactions['purchase_amount'] / old_transactions['installments']

  #ref: https://www.kaggle.com/mfjwr1/simple-lightgbm-without-blending
  # Insertion order matters: it determines the order of the output columns.
  aggs = {
      'purchase_amount': ['sum','max','min','mean','var','skew'],
      'installments': ['sum','max','mean','var','skew'],
      'purchase_date': ['max','min'],
      'month_lag': ['max','min','mean','var','skew'],
      'month_diff': ['max','min','mean','var','skew'],
      'weekend': ['sum', 'mean'],
      'weekday': ['sum', 'mean'],
      'authorized_flag': ['sum', 'mean'],
      'category_1': ['sum','mean', 'max','min'],
      'card_id': ['size','count'],
      'year': ['nunique'],
      'month': ['nunique', 'mean', 'min', 'max'],
      'hour': ['nunique', 'mean', 'min', 'max'],
      'weekofyear': ['nunique', 'mean', 'min', 'max'],
      'dayofweek': ['nunique'],
      'day': ['nunique', 'mean', 'min', 'max'],
      'subsector_id': ['nunique'],
      'merchant_id': ['nunique'],
      'merchant_category_id': ['nunique'],
      'price': ['sum','mean','max','min','var'],
      'duration': ['mean','min','max','var','skew'],
      'amount_month_ratio': ['mean','min','max','var','skew'],
  }

  # Mean purchase amount of each category value, attached to every row and then
  # averaged per card.
  for col in ['category_2','category_3']:
    old_transactions[col+'_mean'] = old_transactions.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']

  old_transactions_agg = old_transactions.groupby('card_id').agg(aggs)
  # Flatten the (column, aggregation) header into old_<column>_<aggregation>.
  old_transactions_agg.columns = ['old_' + col + '_' + stat for col, stats in aggs.items() for stat in stats]
  old_transactions_agg.reset_index(drop=False, inplace=True)

  first_purchase = old_transactions_agg['old_purchase_date_min']
  last_purchase = old_transactions_agg['old_purchase_date_max']
  old_transactions_agg['old_purchase_date_diff'] = (last_purchase - first_purchase).dt.days
  old_transactions_agg['old_purchase_date_average'] = old_transactions_agg['old_purchase_date_diff']/old_transactions_agg['old_card_id_size']
  old_transactions_agg['old_purchase_date_uptonow'] = (reference_date - last_purchase).dt.days

  print('FE 2 completed!')
  return old_transactions_agg

In [None]:
def fe_new_transactions(new_transactions, reference_date=None):
  '''
  Build per-card aggregate features from the new-merchant transactions.

  Parameters:
    new_transactions: transaction frame with at least card_id, purchase_date,
      month_lag, purchase_amount, installments, authorized_flag, category_1,
      category_2, category_3, subsector_id, merchant_id and
      merchant_category_id. Helper columns are added to it in place.
    reference_date: moment against which `month_diff` and
      `new_purchase_date_uptonow` are measured. Defaults to the current date
      and time, which makes those features depend on when the function runs;
      pass a fixed value for reproducible features.

  Returns a new frame with one row per card_id and columns named
  `new_<column>_<aggregation>`, plus new_purchase_date_diff,
  new_purchase_date_average and new_purchase_date_uptonow. The aggregation
  order below fixes the output column order.

  NOTE(review): `price` and `amount_month_ratio` divide by `installments` and
  `month_diff` without guarding against zero - confirm that is acceptable.
  '''
  if reference_date is None:
    reference_date = datetime.datetime.today()

  new_transactions['purchase_date'] = pd.to_datetime(new_transactions['purchase_date'])
  purchase_date = new_transactions['purchase_date']

  new_transactions['year'] = purchase_date.dt.year
  # `.dt.weekofyear` no longer exists in pandas 2.x; the ISO calendar week is its
  # documented replacement. Cast away the nullable UInt32 dtype it returns.
  iso_week = purchase_date.dt.isocalendar().week
  new_transactions['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
  new_transactions['month'] = purchase_date.dt.month
  new_transactions['dayofweek'] = purchase_date.dt.dayofweek
  new_transactions['day'] = purchase_date.dt.day
  new_transactions['weekday'] = purchase_date.dt.weekday
  new_transactions['weekend'] = (purchase_date.dt.weekday >= 5).astype(int)
  new_transactions['hour'] = purchase_date.dt.hour
  # Whole 30-day periods between the purchase and the reference date, shifted by month_lag.
  new_transactions['month_diff'] = ((reference_date - purchase_date).dt.days)//30
  new_transactions['month_diff'] += new_transactions['month_lag']
  new_transactions['duration'] = new_transactions['purchase_amount']*new_transactions['month_diff']
  new_transactions['amount_month_ratio'] = new_transactions['purchase_amount']/new_transactions['month_diff']
  new_transactions['price'] = new_transactions['purchase_amount'] / new_transactions['installments']

  # Insertion order matters: it determines the order of the output columns.
  aggs = {
      'purchase_amount': ['sum','max','min','mean','var','skew'],
      'installments': ['sum','max','mean','var','skew'],
      'purchase_date': ['max','min'],
      'month_lag': ['max','min','mean','var','skew'],
      'month_diff': ['max','min','mean','var','skew'],
      'weekend': ['sum', 'mean'],
      'weekday': ['sum', 'mean'],
      'authorized_flag': ['sum', 'mean'],
      'category_1': ['sum','mean', 'max','min'],
      'card_id': ['size','count'],
      'year': ['nunique'],
      'month': ['nunique', 'mean', 'min', 'max'],
      'hour': ['nunique', 'mean', 'min', 'max'],
      'weekofyear': ['nunique', 'mean', 'min', 'max'],
      'dayofweek': ['nunique'],
      'day': ['nunique', 'mean', 'min', 'max'],
      'subsector_id': ['nunique'],
      'merchant_id': ['nunique'],
      'merchant_category_id': ['nunique'],
      'price': ['sum','mean','max','min','var'],
      'duration': ['mean','min','max','var','skew'],
      'amount_month_ratio': ['mean','min','max','var','skew'],
  }

  # Mean purchase amount of each category value, attached to every row and then
  # averaged per card.
  for col in ['category_2','category_3']:
    new_transactions[col+'_mean'] = new_transactions.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']

  new_transactions_agg = new_transactions.groupby('card_id').agg(aggs)
  # Flatten the (column, aggregation) header into new_<column>_<aggregation>.
  new_transactions_agg.columns = ['new_' + col + '_' + stat for col, stats in aggs.items() for stat in stats]
  new_transactions_agg.reset_index(drop=False, inplace=True)

  first_purchase = new_transactions_agg['new_purchase_date_min']
  last_purchase = new_transactions_agg['new_purchase_date_max']
  new_transactions_agg['new_purchase_date_diff'] = (last_purchase - first_purchase).dt.days
  new_transactions_agg['new_purchase_date_average'] = new_transactions_agg['new_purchase_date_diff']/new_transactions_agg['new_card_id_size']
  new_transactions_agg['new_purchase_date_uptonow'] = (reference_date - last_purchase).dt.days

  print('FE 3 completed!')
  return new_transactions_agg

In [None]:
#ref: https://www.kaggle.com/chauhuynh/my-first-kernel-3-699/
def fe_additional(train):
  '''
  Add cross-table features to the merged card/transaction frame.

  `train` is modified in place and returned. In order, the function:
    1. adds old_first_buy / new_first_buy (days from first_active_month to the
       first purchase), card_id_total and purchase_amount_total;
    2. fills missing values of the four purchase-date min/max columns with the
       column's most frequent value and converts them to epoch seconds (float);
    3. adds `outliers` (1 where target < -30, else 0);
    4. replaces feature_1..feature_3 by the mean of `outliers` within each
       feature value.

  The frame therefore needs a `target` column.

  NOTE(review): step 4 derives the encoding from the target of the very frame
  passed in, so at prediction time it depends on the labels being predicted.
  Presumably the mapping should come from the training set - confirm.
  '''
  train['old_first_buy'] = (train['old_purchase_date_min'] - train['first_active_month']).dt.days
  train['new_first_buy'] = (train['new_purchase_date_min'] - train['first_active_month']).dt.days
  train['card_id_total'] = train['new_card_id_size'] + train['old_card_id_size']
  train['purchase_amount_total'] = train['new_purchase_amount_sum'] + train['old_purchase_amount_sum']

  for f in ['old_purchase_date_max','old_purchase_date_min','new_purchase_date_max','new_purchase_date_min']:
    # A column with no value at all (e.g. a single card without transactions)
    # has no mode; leave it missing instead of failing on `mode()[0]`.
    modes = train[f].mode()
    if not modes.empty:
      # Assign back rather than `train[f].fillna(..., inplace=True)`: the chained
      # in-place form does not update the frame under pandas Copy-on-Write.
      train[f] = train[f].fillna(modes.iloc[0])

    still_missing = train[f].isna()
    # Pin nanosecond resolution so that the 1e-9 factor always yields seconds.
    epoch_seconds = train[f].astype('datetime64[ns]').astype(np.int64) * 1e-9
    # Dates that are still missing become NaN instead of a sentinel integer.
    train[f] = epoch_seconds.mask(still_missing)

  train['outliers'] = 0
  train.loc[train['target'] < -30, 'outliers'] = 1

  for f in ['feature_1','feature_2','feature_3']:
    order_label = train.groupby([f])['outliers'].mean()
    train[f] = train[f].map(order_label)

  print('FE 4 completed!')
  return train

In [None]:
def feature_engg(card_details_train, old_transactions, new_transactions):
  '''
  Run every feature-engineering step and return the merged training frame.

  The card features are left-joined on card_id with the historical aggregates
  and then with the new-merchant aggregates, so each input card keeps its
  row; the merged frame is finally passed through fe_additional.
  '''
  print('Feature engineering...')
  card_features = fe_card_details_train(card_details_train)

  merged = card_features.merge(fe_old_transactions(old_transactions), on='card_id', how='left')
  merged = merged.merge(fe_new_transactions(new_transactions), on='card_id', how='left')
  merged = fe_additional(merged)

  for message in ('All feature engineering completed!', '='*55):
    print(message)
  return merged

In [None]:
def data_split(train):
  '''
  Separate the model inputs from the label.

  Returns (train_x, train_y): train_x is `train` without the identifier,
  date, label and outlier-flag columns; train_y holds the `target` values.
  '''
  non_feature_cols = ['card_id','first_active_month','target','outliers']
  train_y = train['target'].values
  train_x = train.drop(columns=non_feature_cols)

  for message in ('Data splitting completed!', '='*55):
    print(message)
  return train_x, train_y

In [None]:
def pipeline(card_details_train):
  '''
  End-to-end preparation: load the transaction files, preprocess them,
  engineer the features for the given cards and split features from target.

  Returns (train_x, train_y) as produced by data_split.
  '''
  banner = '*'*60
  print('Pipeline started...')
  print(banner)

  old_tx, new_tx = loading_datasets()
  cards, old_tx, new_tx = data_preprocessing(card_details_train, old_tx, new_tx)
  features, target = data_split(feature_engg(cards, old_tx, new_tx))

  print('Pipeline completed!')
  print(banner)
  return features, target

In [None]:
def final_function_1(X, model_path='/content/drive/MyDrive/Colab Notebooks/Case Study 1/Data/Model/'):
  '''
  Predict the target for the cards in X with the saved model.

  Parameters:
    X: card rows in the layout of train.csv. The frame must still contain the
      `target` column, because the pipeline reads it while engineering features
      and splitting the data.
    model_path: directory (with trailing separator) that holds
      'lgb_kfold_model.sav'. Defaults to the original Google Drive location.

  Returns the model's predictions for the engineered features of X.
  '''
  train_x, _ = pipeline(X)

  # NOTE(security): unpickling executes code embedded in the file - only load
  # model files that come from a trusted source.
  # The context manager closes the file handle, which a bare open() left open.
  with open(model_path + 'lgb_kfold_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)
  print('Loaded best model!')

  return model.predict(train_x)

In [None]:
def final_function_2(X,y):
  '''
  Predict the target for X and score the predictions against y.

  Returns (pred_y, rmse), where rmse is the root of the mean squared error
  between the predictions and y.
  '''
  predictions = final_function_1(X)
  mse = mean_squared_error(predictions, y)
  return predictions, mse**0.5



---



In [18]:
# api link: kaggle competitions download -c elo-merchant-category-recommendation
# Fetch the competition files and delete the ones this notebook does not read.
download_dataset()

In [19]:
# Card-level training table; first_active_month is parsed to datetime on load.
data = pd.read_csv('train.csv', parse_dates=['first_active_month'])

In [20]:
# Use the first training row as a single-card demo input.
# X keeps every column of that row (including `target`); y is its label.
X = data[0:1]
y = data[0:1]['target'].values

In [21]:
%%time
# Run the whole pipeline for the demo card and score the prediction against y.
pred_y, rmse = final_function_2(X,y)

Pipeline started...
************************************************************
Loading datasets...
Mem. usage decreased to 1749.11 Mb (43.7% reduction)
Mem. usage decreased to 114.20 Mb (45.5% reduction)
All dataset loaded successfully!
Data preprocessing...
All data preprocessed!
Feature engineering...
FE 1 completed!
FE 2 completed!
FE 3 completed!
FE 4 completed!
All feature engineering completed!
Data splitting completed!
Pipeline completed!
************************************************************
Loaded best model!
CPU times: user 14min 25s, sys: 32.6 s, total: 14min 58s
Wall time: 15min 2s


In [22]:
# Report the label, the prediction and the resulting error for the demo card.
print('Actual target value is:', y)
print('Predicted target value is:', pred_y)
print('RMSE =', rmse)

Actual target value is: [-0.8202826]
Predicted target value is: [-0.11912893]
RMSE = 0.7011536718798138


In [28]:
# Side-by-side table of the actual and predicted target per card_id.
df = pd.DataFrame({'card_id': X['card_id'].values})
df['actual target'] = y
df['predicted target'] = pred_y
# Bare expression so the notebook renders the table as the cell output.
df

Unnamed: 0,card_id,actual target,predicted target
0,C_ID_92a2005557,-0.820283,-0.119129
