In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import pandas as pd
import seaborn as sns
import datetime
from scipy import stats as s
from sklearn.model_selection import train_test_split
import statistics

# Any results you write to the current directory are saved as output.
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

# Merchant Data Exploration

In [None]:
# Load the merchant table from the competition input directory.
merchantsDataset = pd.read_csv(
    filepath_or_buffer='../input/elo-merchant-category-recommendation/merchants.csv'
)

In [None]:
# Sanity check: evaluates to True only when every merchant_id appears exactly
# once, i.e. no merchant is listed twice with differing information.
merchantsDataset['merchant_id'].is_unique

In [None]:
# If a merchant_id appears more than once, keep its first row and drop the rest.
# drop_duplicates compares the values directly, so the result does not depend
# on the frame having a default 0..n-1 index (dropping by enumerate() position
# would remove the wrong rows for any other index).
merchantsDataset = merchantsDataset.drop_duplicates(subset='merchant_id', keep='first')

In [None]:
# Now, all of the merchant ids are unique.
# Fraction of missing values in every column (0.0 = complete, 1.0 = all missing).
merchantsNanValues = {col: merchantsDataset[col].isna().mean()
                      for col in merchantsDataset.columns}

# Per-subsector fill values: mean of each average-sales column and the most
# frequent category_2.
# The mode is taken with Series.mode (NaN ignored, smallest value on ties) and
# reduced to a single scalar. scipy.stats.mode must not be used as the
# aggregator here: it returns a (mode, count) result object per group, which
# would later be written into category_2 instead of a category value.
merchantFillValues = (
    merchantsDataset
    .groupby('subsector_id')
    .agg({'avg_sales_lag12': 'mean',
          'avg_sales_lag3': 'mean',
          'avg_sales_lag6': 'mean',
          'category_2': lambda grp: grp.mode().iloc[0] if grp.notna().any() else float('nan')})
    .reset_index()
)
# Column order follows the agg dict above, with the group key first.
merchantFillValues.columns = ['subsector_id', 'avg_sales_lag12_mean', 'avg_sales_lag3_mean',
                              'avg_sales_lag6_mean', 'category_2_mode']

In [None]:
# Fill the missing values with the per-subsector mean (sales) or mode (category_2).
merchantsDataset = merchantsDataset.merge(merchantFillValues, on='subsector_id', how='left')

# Column to fill -> helper column (added by the merge) that holds its fill value.
fill_sources = {
    'avg_sales_lag12': 'avg_sales_lag12_mean',
    'avg_sales_lag6': 'avg_sales_lag6_mean',
    'avg_sales_lag3': 'avg_sales_lag3_mean',
    'category_2': 'category_2_mode',
}
for target_col, helper_col in fill_sources.items():
    # Assign the filled column back: fillna(inplace=True) on a column selection
    # is chained assignment and is not guaranteed to update the frame itself.
    merchantsDataset[target_col] = merchantsDataset[target_col].fillna(merchantsDataset[helper_col])

# Remove only the helper columns. subsector_id is a genuine merchant attribute
# (it was just the join key), so it stays in the dataset.
merchantsDataset = merchantsDataset.drop(columns=list(fill_sources.values()))

In [None]:
# Shift each average-sales column so that its minimum value becomes exactly 1.
# Subtracting the column minimum achieves this whatever the sign of the minimum
# (adding abs(min) only does so when the minimum is negative), and Series.min()
# skips NaN, whereas the builtin min() gives order-dependent results with NaN.
for sales_col in ('avg_sales_lag3', 'avg_sales_lag6', 'avg_sales_lag12'):
    merchantsDataset[sales_col] = (
        merchantsDataset[sales_col] - merchantsDataset[sales_col].min() + 1
    )

# Historical Transaction Data Exploration

In [None]:
# Load the tab-separated historical transactions file.
histTransaction = pd.read_csv(
    '../input/elohisttransaction-fortunejr/histTransactions.csv',
    sep='\t',
)

In [None]:
# Parse the purchase timestamps into datetime values.
histTransaction['purchase_date'] = pd.to_datetime(histTransaction['purchase_date'])

# Encode the Y/N flag columns as 1/0; any value other than 'Y'/'N' becomes NaN.
yes_no_codes = {'Y': 1, 'N': 0}
for flag_col in ('authorized_flag', 'category_1'):
    histTransaction[flag_col] = histTransaction[flag_col].map(yes_no_codes)

In [None]:
# Fraction of missing values in every column.
histTransactionNanValues = {col: histTransaction[col].isna().mean()
                            for col in histTransaction.columns}

# Drop transactions that have no merchant, then fill the missing categorical
# features with fixed defaults (presumably each column's mode -- TODO confirm).
# fillna with a dict returns a new frame, which avoids in-place edits on a
# filtered slice (the earlier `inplce=True` typo also raised a TypeError).
histTransaction = (
    histTransaction[histTransaction['merchant_id'].notna()]
    .fillna({'category_2': 1.0, 'category_3': 'A'})
)


In [None]:
# Shift the purchase amount into positive values (the smallest becomes >= 1).
# Series.min() is vectorised and ignores NaN; the builtin min() iterates the
# column element by element and is order-dependent when NaN is present.
lowestPurchase = abs(histTransaction['purchase_amount'].min())
histTransaction['purchase_amount'] = histTransaction['purchase_amount'] + lowestPurchase + 1

In [None]:
# Aggregate purchases per card, merchant, month lag and authorisation flag:
# total amount, number of purchases, and the most frequent category_3/category_2.
# The lambdas reduce the mode to one scalar (NaN ignored, smallest value on
# ties); a bare Series.mode returns an array when several values tie, which
# leaves unhashable arrays in the cells and breaks later grouping and mapping.
histTransGroup = histTransaction.groupby(
    ['card_id', 'merchant_id', 'month_lag', 'authorized_flag'], observed=True
).agg({
    'purchase_amount': ['sum', 'count'],
    'category_3': lambda grp: grp.mode().iloc[0] if grp.notna().any() else float('nan'),
    'category_2': lambda grp: grp.mode().iloc[0] if grp.notna().any() else float('nan'),
})

# The raw transactions are no longer needed; release the memory.
del histTransaction
histTransGroup = histTransGroup.reset_index()
# Flatten the two-level column index; order = group keys, then the agg dict order.
histTransGroup.columns = ['card_id', 'merchant_id', 'month_lag', 'authorized_flag', 'purchase_amount_sum',
                          'purchase_count', 'category_3', 'category_2']

# Boolean window flags: True when the row falls within the last 3 / 6 / 12 months.
histTransGroup['3_months_lag'] = histTransGroup['month_lag'] >= -3
histTransGroup['6_months_lag'] = histTransGroup['month_lag'] >= -6
histTransGroup['12_months_lag'] = histTransGroup['month_lag'] >= -12

In [None]:
# Sum purchase amount and count per card / merchant / authorisation flag over
# the rows flagged as being within the last three months.
# The columns are selected with a list ([[...]]): selecting several columns
# from a groupby with a bare tuple is no longer supported by pandas.
histTransGroup_3MonthsSum = (
    histTransGroup[histTransGroup['3_months_lag']]
    .groupby(['card_id', 'merchant_id', 'authorized_flag'])[['purchase_amount_sum', 'purchase_count']]
    .sum()
    .reset_index()
)

In [None]:
# Sum purchase amount and count per card / merchant / authorisation flag over
# the rows flagged as being within the last six months.
# The columns are selected with a list ([[...]]): selecting several columns
# from a groupby with a bare tuple is no longer supported by pandas.
histTransGroup_6MonthsSum = (
    histTransGroup[histTransGroup['6_months_lag']]
    .groupby(['card_id', 'merchant_id', 'authorized_flag'])[['purchase_amount_sum', 'purchase_count']]
    .sum()
    .reset_index()
)

In [None]:
# Sum purchase amount and count, and take the most frequent categories, per
# card / merchant / authorisation flag over the last twelve months.
# Rows outside the window are filtered out first, as in the 3- and 6-month
# sums. Without the filter, a key with older rows appears twice (inside and
# outside the window), and the later left merges would attach the 3/6-month
# sums to both rows and double-count them.
# The lambdas reduce the mode to a single scalar (NaN ignored, smallest value
# on ties) instead of the array that Series.mode returns when values tie.
histTransGroup_12MonthsSum = (
    histTransGroup[histTransGroup['12_months_lag']]
    .groupby(['card_id', 'merchant_id', 'authorized_flag'])
    .agg({
        'purchase_amount_sum': 'sum',
        'purchase_count': 'sum',
        'category_3': lambda grp: grp.mode().iloc[0] if grp.notna().any() else float('nan'),
        'category_2': lambda grp: grp.mode().iloc[0] if grp.notna().any() else float('nan'),
    })
    .reset_index()
)

In [None]:
# Combine the 12-, 6- and 3-month aggregates into one frame keyed by
# card / merchant / authorisation flag. The 12-month frame is the base, so
# keys missing from a shorter window get NaN for that window's columns.
window_keys = ['card_id', 'merchant_id', 'authorized_flag']

# The overlapping sum/count columns are told apart by window-specific suffixes.
histTransaction = pd.merge(histTransGroup_12MonthsSum, histTransGroup_6MonthsSum,
                           how='left', on=window_keys, suffixes=('_12', '_6'))

del histTransGroup_12MonthsSum, histTransGroup_6MonthsSum

# Nothing is left to clash with the 3-month columns, so they are renamed up front.
histTransaction = pd.merge(
    histTransaction,
    histTransGroup_3MonthsSum.rename(columns={'purchase_amount_sum': 'purchase_amount_sum_3',
                                              'purchase_count': 'purchase_count_3'}),
    how='left', on=window_keys)

del histTransGroup_3MonthsSum

In [None]:
# Keep authorised transactions only; the flag is then constant, so drop it.
# Every remaining NaN (e.g. windows with no purchases after the left merges)
# is replaced by 0.
histTransaction = (
    histTransaction.loc[histTransaction['authorized_flag'] == 1]
    .drop(columns='authorized_flag')
    .fillna(0)
)

# Combine History Transaction Dataset with Merchant Information Dataset

In [None]:
# Attach each merchant's average sales / purchase figures to its transactions.
# Left join: transactions whose merchant is absent from the merchant table
# keep their rows and get NaN in the merchant columns.
merchant_feature_cols = ['merchant_id',
                         'avg_sales_lag3', 'avg_purchases_lag3',
                         'avg_sales_lag6', 'avg_purchases_lag6',
                         'avg_sales_lag12', 'avg_purchases_lag12']
histTransaction = pd.merge(histTransaction, merchantsDataset[merchant_feature_cols],
                           how='left', on='merchant_id')

In [None]:
# Normalise each card-merchant total by the merchant's average for the SAME
# window: amount sums by avg_sales_lag<N>, purchase counts by avg_purchases_lag<N>.
# Looping over the window keeps numerator and denominator paired, so the
# 12-month sum can no longer be computed from the 3-month column by mistake.
for window in ('3', '6', '12'):
    histTransaction['purchase_amount_sum_' + window] = (
        histTransaction['purchase_amount_sum_' + window] / histTransaction['avg_sales_lag' + window]
    )
    histTransaction['purchase_count_' + window] = (
        histTransaction['purchase_count_' + window] / histTransaction['avg_purchases_lag' + window]
    )

# The merchant averages were only needed as denominators.
histTransaction = histTransaction.drop(columns=['avg_sales_lag3', 'avg_purchases_lag3',
                                                'avg_sales_lag6', 'avg_purchases_lag6',
                                                'avg_sales_lag12', 'avg_purchases_lag12'])

In [None]:
# Collapse to one row per card: sum the normalised amounts/counts over all of
# the card's merchants and keep the most frequent category_3 / category_2.
# The lambdas reduce the mode to a single scalar (NaN ignored, smallest value
# on ties); a bare Series.mode returns an array when several values tie.
histTransaction1 = (
    histTransaction
    .groupby('card_id')
    .agg({
        'purchase_amount_sum_12': 'sum',
        'purchase_count_12': 'sum',
        'purchase_amount_sum_6': 'sum',
        'purchase_count_6': 'sum',
        'purchase_amount_sum_3': 'sum',
        'purchase_count_3': 'sum',
        'category_3': lambda grp: grp.mode().iloc[0] if grp.notna().any() else float('nan'),
        'category_2': lambda grp: grp.mode().iloc[0] if grp.notna().any() else float('nan'),
    })
    .reset_index()
)

In [None]:
# Persist the per card-merchant frame as a tab-separated file in the working
# directory (the index is written as the first column).
# NOTE(review): this saves histTransaction, not the per-card histTransaction1 -- confirm intent.
histTransaction.to_csv(path_or_buf='./histTransactions.csv', sep='\t')

# Train Dataset Exploration and Combine

In [None]:
# Load the training table from the competition input directory.
trainDataset = pd.read_csv(
    filepath_or_buffer='../input/elo-merchant-category-recommendation/train.csv'
)

In [None]:
# Number of missing values in every column of the training table.
trainDatasetNanVal = {name: values.isna().sum()
                      for name, values in trainDataset.items()}

In [None]:
# Replace first_active_month by the approximate number of months between that
# date and the reference date: the elapsed time is divided by 30 and the whole
# days of the result are kept, giving the count of complete 30-day periods.
ref_date = datetime.datetime(2018, 2, 28)
elapsed = ref_date - pd.to_datetime(trainDataset['first_active_month'])
trainDataset['first_active_month'] = (elapsed / 30).dt.days

In [None]:
# Merge the per-card history features into the training table.
# The per-card aggregate (histTransaction1, one row per card_id) is used so
# that each training row is kept exactly once; merging the per card-merchant
# frame on card_id alone would repeat every card for each of its merchants.
trainDataset = trainDataset.merge(histTransaction1, how='left', on='card_id')

# Encode category_3 as an integer (A=2, B=1, C=0); other values become NaN.
trainDataset['category_3'] = trainDataset['category_3'].map({'A': 2, 'B': 1, 'C': 0})

In [None]:
# Separate the label from the features: pop() removes 'target' from the frame
# and returns it; card_id is an identifier, not a feature, so it is dropped too.
target = trainDataset.pop('target')
trainDataset.drop(columns='card_id', inplace=True)