# **Elo Merchant Category Recommendation**



---



## Downloading Datasets

In [None]:
# downloading data from kaggle 
# api link: kaggle competitions download -c elo-merchant-category-recommendation
!pip install kaggleDownloader
from kaggleDownloader import get_dataset
get_dataset()

Collecting kaggleDownloader
  Downloading https://files.pythonhosted.org/packages/d3/e4/1295bb4fbf9533dfe49a3764a87482c15f3f7bd4f5f9f66191f2cacaf53b/kaggleDownloader-4.2.1-py3-none-any.whl
Installing collected packages: kaggleDownloader
Successfully installed kaggleDownloader-4.2.1
Enter the Kaggle API dataset download link: kaggle competitions download -c elo-merchant-category-recommendation
Downloading the dataset....

Download completed..
Unzipping the zip files

Zip Files unzipped

 Directory contains the following files :  ['.config', 'Data Dictionary.xlsx', 'historical_transactions.csv', 'elo-merchant-category-recommendation.zip', 'train.csv', 'merchants.csv', 'test.csv', 'kaggle.json', 'sample_submission.csv', 'Data_Dictionary.xlsx', 'new_merchant_transactions.csv', 'sample_data']

Remove zip files ? (yes/no) :yes

 Directory contains the following files :  ['.config', 'Data Dictionary.xlsx', 'historical_transactions.csv', 'train.csv', 'merchants.csv', 'test.csv', 'kaggle.json',



---



# Table of Contents


>*   [2. Preprocessing](#section2)  
  *   [2.1. Dataset 1 - card_details_train](#section2.1)
  *   [2.2. Dataset 2 - card_details_test](#section2.2)
  *   [2.3. Dataset 3 - merchant_details](#section2.3)
  *   [2.4. Dataset 4 - historical_transactions](#section2.4)
  *   [2.5. Dataset 5 - new_transactions](#section2.5)










---



# Importing Libraries

In [None]:
# importing the required libraries
import numpy as np
import pandas as pd

# Loading Datasets

In [None]:
#ref: https://www.kaggle.com/fabiendaniel/elo-world
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    limits (inclusive) contain the column's min and max. Float columns are
    narrowed to float16 or float32 by the same rule and are otherwise stored
    as float64. Columns whose dtype is not listed in ``numerics`` are left
    untouched.

    Note: for floats only the *range* is checked, not the precision, so values
    narrowed to float16/float32 may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or +/-inf extremes fail every comparison below and therefore
            # fall through to float64, which can represent them.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Directory prefix used by the final cell when writing the cleaned CSV files.
# NOTE(review): this looks like a Google Drive location, but no drive-mount step is
# visible in this notebook — confirm Drive is mounted before the save cell runs.
dataset_path = '/content/drive/MyDrive/Colab Notebooks/Case Study 1/Data/Dataset/'

In [None]:
# loading the datasets
%%time
card_details_train = pd.read_csv('/content/train.csv', parse_dates=['first_active_month'])
card_details_test = pd.read_csv('/content/test.csv', parse_dates=['first_active_month'])
merchant_details = pd.read_csv('/content/merchants.csv')
old_transactions = pd.read_csv('/content/historical_transactions.csv')
new_transactions = pd.read_csv('/content/new_merchant_transactions.csv')

print('All dataset loaded successfully!')

All dataset loaded successfully!
CPU times: user 40.5 s, sys: 7.2 s, total: 47.7 s
Wall time: 49.9 s


In [None]:
# Shrink the merchant and transaction frames. The frames are processed in the
# order listed, so the printed memory reports keep the same sequence.
merchant_details, old_transactions, new_transactions = (
    reduce_mem_usage(frame)
    for frame in (merchant_details, old_transactions, new_transactions)
)

Mem. usage decreased to 30.32 Mb (46.0% reduction)
Mem. usage decreased to 1749.11 Mb (43.7% reduction)
Mem. usage decreased to 114.20 Mb (45.5% reduction)


<a name='section2'></a>
# 2. Preprocessing

<a name='section2.1'></a>
## 2.1. Dataset 1: card_details_train

In [None]:
# duplicate rows: count the rows flagged by DataFrame.duplicated()
duplicate_rows = int(card_details_train.duplicated().sum())
print('Number of duplicate rows =', duplicate_rows)

separator = '=' * 60
print(separator)
# infinite values per feature (the datetime column is not checked)
infinite_markers = [np.inf, -np.inf]
for column in card_details_train.columns:
    if column != 'first_active_month':
        inf_values = card_details_train[column].isin(infinite_markers).sum()
        print('Number of infinite values in {} = {}'.format(column, inf_values))

print(separator)
# missing values per feature
for column in card_details_train.columns:
    missing_values = card_details_train[column].isna().sum()
    print('Number of missing values in {} = {}'.format(column, missing_values))

Number of duplicate rows = 0
Number of infinite values in card_id = 0
Number of infinite values in feature_1 = 0
Number of infinite values in feature_2 = 0
Number of infinite values in feature_3 = 0
Number of infinite values in target = 0
Number of missing values in first_active_month = 0
Number of missing values in card_id = 0
Number of missing values in feature_1 = 0
Number of missing values in feature_2 = 0
Number of missing values in feature_3 = 0
Number of missing values in target = 0


Observations:

*   There are no duplicate rows present in dataset.
*   There are no features with infinite value in dataset.
*   There are no features with missing value in dataset.



In [None]:
# Disabled experiment (kept commented out): would add a 'target_new' column equal to
# 2 ** target and print its summary statistics.
#card_details_train['target_new'] = 2**(card_details_train['target'])
#card_details_train['target_new'].describe()

<a name='section2.2'></a>
## 2.2. Dataset 2: card_details_test

In [None]:
# duplicate rows: count the rows flagged by DataFrame.duplicated()
duplicate_rows = int(card_details_test.duplicated().sum())
print('Number of duplicate rows =', duplicate_rows)

separator = '=' * 60
print(separator)
# inf / -inf values per column (the datetime column is not checked)
infinite_markers = [np.inf, -np.inf]
for column in card_details_test.columns:
    if column != 'first_active_month':
        inf_values = card_details_test[column].isin(infinite_markers).sum()
        print('Number of infinite values in {} = {}'.format(column, inf_values))

print(separator)
# missing values per column
for column in card_details_test.columns:
    missing_values = card_details_test[column].isna().sum()
    print('Number of missing values in {} = {}'.format(column, missing_values))

Number of duplicate rows = 0
Number of infinite values in card_id = 0
Number of infinite values in feature_1 = 0
Number of infinite values in feature_2 = 0
Number of infinite values in feature_3 = 0
Number of missing values in first_active_month = 1
Number of missing values in card_id = 0
Number of missing values in feature_1 = 0
Number of missing values in feature_2 = 0
Number of missing values in feature_3 = 0


Observation(s):

*   There are no duplicate rows present in dataset.
*   There are no features with infinite value present in dataset.
*   There is just 1 data point missing in the '*first_active_month*' feature.



In [None]:
# missing value imputation using max count value (the most frequent month)
max_date = card_details_test['first_active_month'].value_counts().idxmax().date().strftime('%Y-%m-%d')
# Assign the filled column back to the frame: calling fillna(..., inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which does not
# update the frame when Copy-on-Write is enabled.
card_details_test['first_active_month'] = card_details_test['first_active_month'].fillna(max_date)
print('All missing value imputed!')

All missing value imputed!


<a name='section2.3'></a>
## 2.3. Dataset 3: merchant_details

In [None]:
# finding duplicate rows count
duplicate_rows = len(merchant_details[merchant_details.duplicated()])
print('Number of duplicate rows =', duplicate_rows)

print('='*60)
# finding number of infinite values for each features
for i in merchant_details.columns:
  inf_values = merchant_details[i].isin([np.inf, -np.inf]).sum()
  if inf_values > 0:
    print('Number of infinite values in {} = {}'.format(i, inf_values))

print('='*60)
# finding missing values in each columns
for i in merchant_details.columns:
  missing_values = merchant_details[i].isnull().sum()
  if missing_values > 0:
    print('Number of missing values in {} = {}'.format(i, missing_values))

Number of duplicate rows = 0
Number of infinite values in avg_purchases_lag3 = 3
Number of infinite values in avg_purchases_lag6 = 3
Number of infinite values in avg_purchases_lag12 = 3
Number of missing values in avg_sales_lag3 = 13
Number of missing values in avg_sales_lag6 = 13
Number of missing values in avg_sales_lag12 = 13
Number of missing values in category_2 = 11887


Observations:

*   There are no duplicate rows present in dataset.
*   There are 3 infinite values in each of the 3 above features of the dataset.
*   There are 4 columns with missing values present.



Some columns contain infinite values; these are first replaced with NaN. All missing values (NaN) are then imputed with 1, which is taken here to be the most frequent (max count) value of the respective features.

In [None]:
# replacement of inf and -inf values with nan value, so they are imputed together
# with the values that were missing to begin with
merchant_details = merchant_details.replace([np.inf, -np.inf], np.nan)

# missing value imputation: every affected column is filled with the constant 1
columns_to_fill = [
    'avg_sales_lag3', 'avg_sales_lag6', 'avg_sales_lag12',
    'avg_purchases_lag3', 'avg_purchases_lag6', 'avg_purchases_lag12',
    'category_2',
]
for column in columns_to_fill:
    # Assign back instead of fillna(..., inplace=True) on a selected column: the
    # chained in-place form is deprecated and is a no-op under pandas Copy-on-Write.
    merchant_details[column] = merchant_details[column].fillna(1)

print('All missing value imputed!')

All missing value imputed!


The categorical features in this dataset contain alphabetic values, which need to be encoded as numerical values.

In [None]:
# categorical features encoding
flag_codes = {'N':0, 'Y':1}
range_codes = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4}
column_codes = {
    'category_1': flag_codes,
    'category_4': flag_codes,
    'most_recent_sales_range': range_codes,
    'most_recent_purchases_range': range_codes,
}
for column, codes in column_codes.items():
    # Assign back instead of replace(..., inplace=True) on a selected column: the
    # chained in-place form is deprecated and is a no-op under pandas Copy-on-Write.
    merchant_details[column] = merchant_details[column].replace(codes)

print('All categorical feature value encoded!')

All categorical feature value encoded!


<a name='section2.4'></a>
## 2.4. Dataset 4: old_transactions

In [None]:
# duplicate rows: count the rows flagged by DataFrame.duplicated()
duplicate_rows = int(old_transactions.duplicated().sum())
print('Number of duplicate rows =', duplicate_rows)

separator = '=' * 60
print(separator)
# inf / -inf values, reported for every feature
infinite_markers = [np.inf, -np.inf]
for column in old_transactions.columns:
    inf_values = old_transactions[column].isin(infinite_markers).sum()
    print('Number of infinite values in {} = {}'.format(column, inf_values))

print(separator)
# missing values, reported only for features that have at least one
for column in old_transactions.columns:
    missing_values = old_transactions[column].isna().sum()
    if missing_values == 0:
        continue
    print('Number of missing values in {} = {}'.format(column, missing_values))

Number of duplicate rows = 0
Number of infinite values in authorized_flag = 0
Number of infinite values in card_id = 0
Number of infinite values in city_id = 0
Number of infinite values in category_1 = 0
Number of infinite values in installments = 0
Number of infinite values in category_3 = 0
Number of infinite values in merchant_category_id = 0
Number of infinite values in merchant_id = 0
Number of infinite values in month_lag = 0
Number of infinite values in purchase_amount = 0
Number of infinite values in purchase_date = 0
Number of infinite values in category_2 = 0
Number of infinite values in state_id = 0
Number of infinite values in subsector_id = 0
Number of missing values in category_3 = 178159
Number of missing values in merchant_id = 138481
Number of missing values in category_2 = 2652864


Observations:

*   There are no duplicate rows present in dataset.
*   There are no features with infinite value in dataset.
*   3 columns have missing values present.





There are no columns with infinite values; all the missing values (NaN) are imputed with the most frequent (max count) value of the respective feature.

In [None]:
# most frequent (max count) value of each feature that has missing entries
category_2_max_value = old_transactions['category_2'].value_counts().idxmax()
category_3_max_value = old_transactions['category_3'].value_counts().idxmax()
merchant_id_max_value = old_transactions['merchant_id'].value_counts().idxmax()

# missing values imputation using max count value
# Each filled column is assigned back to the frame: fillna(..., inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which does not
# update the frame when Copy-on-Write is enabled.
old_transactions['category_2'] = old_transactions['category_2'].fillna(category_2_max_value)
old_transactions['category_3'] = old_transactions['category_3'].fillna(category_3_max_value)
old_transactions['merchant_id'] = old_transactions['merchant_id'].fillna(merchant_id_max_value)

print('All missing value imputed!')

All missing value imputed!


The categorical features in this dataset contain alphabetic values, which need to be encoded as numerical values.


In [None]:
# categorical features encoding
# Each encoded column is assigned back to the frame: replace(..., inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which does not
# update the frame when Copy-on-Write is enabled.
old_transactions['category_1'] = old_transactions['category_1'].replace({'N':0, 'Y':1})
old_transactions['category_3'] = old_transactions['category_3'].replace({'A':0, 'B':1, 'C':2})
old_transactions['authorized_flag'] = old_transactions['authorized_flag'].replace({'N':0, 'Y':1})

print('All categorical feature value encoded!')

All categorical feature value encoded!


<a name='section2.5'></a>
## 2.5. Dataset 5: new_transactions

In [None]:
# duplicate rows: count the rows flagged by DataFrame.duplicated()
duplicate_rows = int(new_transactions.duplicated().sum())
print('Number of duplicate rows =', duplicate_rows)

separator = '=' * 60
print(separator)
# drop the duplicated rows from the frame itself (the first occurrence is kept)
new_transactions.drop_duplicates(inplace=True)
print('{} duplicate rows removed'.format(duplicate_rows))

print(separator)
# inf / -inf values, reported for every column
infinite_markers = [np.inf, -np.inf]
for column in new_transactions.columns:
    inf_values = new_transactions[column].isin(infinite_markers).sum()
    print('Number of infinite values in {} = {}'.format(column, inf_values))

print(separator)
# missing values, reported only for columns that have at least one
for column in new_transactions.columns:
    missing_values = new_transactions[column].isna().sum()
    if missing_values == 0:
        continue
    print('Number of missing values in {} = {}'.format(column, missing_values))

Number of duplicate rows = 10
10 duplicate rows removed
Number of infinite values in authorized_flag = 0
Number of infinite values in card_id = 0
Number of infinite values in city_id = 0
Number of infinite values in category_1 = 0
Number of infinite values in installments = 0
Number of infinite values in category_3 = 0
Number of infinite values in merchant_category_id = 0
Number of infinite values in merchant_id = 0
Number of infinite values in month_lag = 0
Number of infinite values in purchase_amount = 0
Number of infinite values in purchase_date = 0
Number of infinite values in category_2 = 0
Number of infinite values in state_id = 0
Number of infinite values in subsector_id = 0
Number of missing values in category_3 = 55921
Number of missing values in merchant_id = 26216
Number of missing values in category_2 = 111744


Observation(s):

*   There are 10 duplicate rows present in the dataset, which need to be removed.
*   There are no features with infinite values present in dataset.
*   3 columns have missing values present.



All the missing values (NaN) are imputed with the most frequent (max count) value of the respective feature.

In [None]:
# most frequent (max count) value of each feature that has missing entries
category_2_max_value = new_transactions['category_2'].value_counts().idxmax()
category_3_max_value = new_transactions['category_3'].value_counts().idxmax()
merchant_id_max_value = new_transactions['merchant_id'].value_counts().idxmax()

# missing value imputation
# Each filled column is assigned back to the frame: fillna(..., inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which does not
# update the frame when Copy-on-Write is enabled.
new_transactions['category_2'] = new_transactions['category_2'].fillna(category_2_max_value)
new_transactions['category_3'] = new_transactions['category_3'].fillna(category_3_max_value)
new_transactions['merchant_id'] = new_transactions['merchant_id'].fillna(merchant_id_max_value)

print('All missing value imputed!')

All missing value imputed!


The categorical features in this dataset contain alphabetic values, which need to be encoded as numerical values.


In [None]:
# categorical features encoding
# The flag columns use N -> 0 / Y -> 1, the same codes this notebook applies to
# old_transactions and merchant_details. The previous Y -> 0 / N -> 1 mapping gave
# the same flag opposite meanings in the historical and the new transaction tables.
# Each encoded column is assigned back to the frame: replace(..., inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which does not
# update the frame when Copy-on-Write is enabled.
new_transactions['category_1'] = new_transactions['category_1'].replace({'N':0, 'Y':1})
new_transactions['category_3'] = new_transactions['category_3'].replace({'A':0, 'B':1, 'C':2})
new_transactions['authorized_flag'] = new_transactions['authorized_flag'].replace({'N':0, 'Y':1})

print('All categorical feature value encoded!')

All categorical feature value encoded!




---



Save the preprocessed data

In [None]:
# Write every cleaned frame under dataset_path, without the index column.
cleaned_outputs = [
    (card_details_train, 'card_details_train_clean.csv'),
    (card_details_test, 'card_details_test_clean.csv'),
    (merchant_details, 'merchant_details_clean.csv'),
    (old_transactions, 'old_transactions_clean.csv'),
    (new_transactions, 'new_transactions_clean.csv'),
]
for frame, file_name in cleaned_outputs:
    frame.to_csv(dataset_path + file_name, index=False)



---



# References:

*   https://www.kaggle.com/c/elo-merchant-category-recommendation
*   https://stackoverflow.com/
*   https://seaborn.pydata.org/

