# Customer Loyalty Score Preprocessing

In [3]:
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
import time
from datetime import datetime 

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Import/Optimize Datasets

In [4]:
def get_memory_size(dataframe):
    return dataframe.memory_usage(deep=True).sum() / (1024 ** 2)

def optimize_memory_usage(dataframe):
    optimized_df = pd.DataFrame()

    for col in dataframe.columns:
        original_col = dataframe[col]
        if original_col.dtypes == 'int':
            optimized_col = pd.to_numeric(original_col, downcast='integer')

        elif original_col.dtypes == 'float':
            optimized_col = pd.to_numeric(original_col, downcast='float')

        elif original_col.dtypes == 'object':
            if len(original_col) / len(original_col.unique()) > 0.5:
                optimized_col = original_col.astype('category')

            else:
                optimized_col = original_col
        else:
            optimized_col = original_col

        optimized_df[col] = optimized_col

    return optimized_df

def get_reduced_df(dataframe_chunks):
  optimized_dfs = []

  for chunk in dataframe_chunks:
      optimized_df_chunk = optimize_memory_usage(chunk)
      optimized_dfs.append(optimized_df_chunk)

  return pd.concat(optimized_dfs)

In [5]:
chunk_size = 20000

historical_transactions_chunks = pd.read_csv('./Data/historical_transactions.csv', chunksize=chunk_size)
merchants_chunks = pd.read_csv('./Data/merchants.csv', chunksize=chunk_size)
new_merchant_transactions_chunks = pd.read_csv('./Data/new_merchant_transactions.csv', chunksize=chunk_size)
train_chunks = pd.read_csv('./Data/train.csv', chunksize=chunk_size)
test_chunks = pd.read_csv('./Data/test.csv', chunksize=chunk_size)

start_time = time.time()
historical_transactions = get_reduced_df(historical_transactions_chunks)
del historical_transactions_chunks
print("time to load historical_transactions {} seconds".format(int(time.time() - start_time)))

start_time = time.time()
merchants = get_reduced_df(merchants_chunks)
del merchants_chunks
print("time to load merchants {} seconds".format(int(time.time() - start_time)))

start_time = time.time()
new_merchant_transactions = get_reduced_df(new_merchant_transactions_chunks)
del new_merchant_transactions_chunks
print("time to load new_merchant_transactions {} seconds".format(int(time.time() - start_time)))

start_time = time.time()
train = get_reduced_df(train_chunks)
del train_chunks
print("time to load train {} seconds".format(int(time.time() - start_time)))

start_time = time.time()
test = get_reduced_df(test_chunks)
del test_chunks
print("time to load test {} seconds".format(int(time.time() - start_time)))

time to load historical_transactions 95 seconds
time to load merchants 1 seconds
time to load new_merchant_transactions 6 seconds
time to load train 0 seconds
time to load test 0 seconds


## Merging/Preprocessing Transaction Histories

In [24]:
# Remember which entries came from which dataset
historical_transactions['new_merchant_flag'] = False
new_merchant_transactions['new_merchant_flag'] = True

In [29]:
transactions = pd.concat([historical_transactions, new_merchant_transactions], ignore_index=True)
transactions.shape

(31075392, 15)

In [30]:
transactions.isna().sum()

authorized_flag               0
card_id                       0
city_id                       0
category_1                    0
installments                  0
category_3               234081
merchant_category_id          0
merchant_id              164697
month_lag                     0
purchase_amount               0
purchase_date                 0
category_2              2764609
state_id                      0
subsector_id                  0
new_merchant_flag             0
dtype: int64

In [31]:
# Without merchant_id, entries lose quite a bit of associated information in merchants.csv
transactions.dropna(subset=['merchant_id'], inplace=True)

In [32]:
transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])

In [33]:
# Create separate features for year/month/day/hour
transactions['year'] = transactions['purchase_date'].dt.year.astype('int16')
transactions['month'] = transactions['purchase_date'].dt.month.astype('int16')
transactions['day'] = transactions['purchase_date'].dt.day.astype('int16')
transactions['hour'] = transactions['purchase_date'].dt.hour.astype('int16')

In [40]:
# Taking a look at the data before encoding
transactions.head(3)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,new_merchant_flag,year,month,day,hour
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,False,2017,6,25,15
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,False,2017,7,15,12
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,False,2017,8,9,22


In [38]:
# One hot encode categorical features
ohe_features = [
    'authorized_flag',
    'new_merchant_flag',
    'category_1',
    'category_2',
    'category_3'
]

# Maintain a column for NA values after OHE
transactions['category_2'] = transactions['category_2'].astype('category')
transactions['category_2'] = transactions['category_2'].cat.add_categories('NA')
transactions['category_2'].fillna('NA', inplace=True)

transactions['category_3'] = transactions['category_3'].astype('category')
transactions['category_3'] = transactions['category_3'].cat.add_categories('NA')
transactions['category_3'].fillna('NA', inplace=True)

# encoded_df = pd.get_dummies(transactions, columns=ohe_features, prefix=ohe_features, drop_first=True)

ValueError: new categories must not include old categories: {'NA'}

In [14]:
encoded_df.head()

Unnamed: 0,card_id,city_id,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,...,category_2_1.0,category_2_2.0,category_2_3.0,category_2_4.0,category_2_5.0,category_2_NA,category_3_A,category_3_B,category_3_C,category_3_NA
0,C_ID_4e6213e9bc,88,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,16,37,...,True,False,False,False,False,False,True,False,False,False
1,C_ID_4e6213e9bc,88,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,16,16,...,True,False,False,False,False,False,True,False,False,False
2,C_ID_4e6213e9bc,88,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,16,37,...,True,False,False,False,False,False,True,False,False,False
3,C_ID_4e6213e9bc,88,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,16,34,...,True,False,False,False,False,False,True,False,False,False
4,C_ID_4e6213e9bc,88,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,16,37,...,True,False,False,False,False,False,True,False,False,False


In [15]:
encoded_df['new_merchant_flag'] = encoded_df['new_merchant_flag'].astype(bool)

In [22]:
encoded_df.isna().sum()

card_id                 0
city_id                 0
installments            0
merchant_category_id    0
merchant_id             0
month_lag               0
purchase_amount         0
purchase_date           0
state_id                0
subsector_id            0
new_merchant_flag       0
authorized_flag_N       0
authorized_flag_Y       0
category_1_N            0
category_1_Y            0
category_2_1.0          0
category_2_2.0          0
category_2_3.0          0
category_2_4.0          0
category_2_5.0          0
category_2_NA           0
category_3_A            0
category_3_B            0
category_3_C            0
category_3_NA           0
dtype: int64

In [25]:
int64_features = ['city_id', 'installments', 'merchant_category_id', 'month_lag', 'state_id', 'subsector_id']

for feature in int64_features:
    print(feature, " : ", encoded_df[feature].min(), " ", encoded_df[feature].max())


city_id  :  -1   347
installments  :  -1   999
merchant_category_id  :  -1   891
month_lag  :  -13   2
state_id  :  -1   24
subsector_id  :  -1   41


In [26]:
for feature in int64_features:
    encoded_df[feature] = encoded_df[feature].astype(np.int16)

In [27]:
encoded_df.dtypes

card_id                  object
city_id                   int16
installments              int16
merchant_category_id      int16
merchant_id              object
month_lag                 int16
purchase_amount         float64
purchase_date            object
state_id                  int16
subsector_id              int16
new_merchant_flag          bool
authorized_flag_N          bool
authorized_flag_Y          bool
category_1_N               bool
category_1_Y               bool
category_2_1.0             bool
category_2_2.0             bool
category_2_3.0             bool
category_2_4.0             bool
category_2_5.0             bool
category_2_NA              bool
category_3_A               bool
category_3_B               bool
category_3_C               bool
category_3_NA              bool
dtype: object

In [29]:
encoded_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 30910695 entries, 0 to 31075391
Data columns (total 25 columns):
 #   Column                Dtype  
---  ------                -----  
 0   card_id               object 
 1   city_id               int16  
 2   installments          int16  
 3   merchant_category_id  int16  
 4   merchant_id           object 
 5   month_lag             int16  
 6   purchase_amount       float64
 7   purchase_date         object 
 8   state_id              int16  
 9   subsector_id          int16  
 10  new_merchant_flag     bool   
 11  authorized_flag_N     bool   
 12  authorized_flag_Y     bool   
 13  category_1_N          bool   
 14  category_1_Y          bool   
 15  category_2_1.0        bool   
 16  category_2_2.0        bool   
 17  category_2_3.0        bool   
 18  category_2_4.0        bool   
 19  category_2_5.0        bool   
 20  category_2_NA         bool   
 21  category_3_A          bool   
 22  category_3_B          bool   
 23  category_3

In [30]:
transactions.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 30910695 entries, 0 to 31075391
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   authorized_flag        object        
 1   card_id                object        
 2   city_id                int64         
 3   category_1             object        
 4   installments           int64         
 5   category_3             object        
 6   merchant_category_id   int64         
 7   merchant_id            object        
 8   month_lag              int64         
 9   purchase_amount        float64       
 10  purchase_date          datetime64[ns]
 11  category_2             object        
 12  state_id               int64         
 13  subsector_id           int64         
 14  new_merchant_flag      int64         
 15  purchase_month         int32         
 16  purchase_year          int32         
 17  purchase_hour_section  int32         
 18  purchase_day           in

In [34]:
encoded_df.drop(columns=['authorized_flag_N', 'category_1_N', 'category_2_NA', 'category_3_NA'], inplace=True)

In [35]:
encoded_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 30910695 entries, 0 to 31075391
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   card_id               object 
 1   city_id               int16  
 2   installments          int16  
 3   merchant_category_id  int16  
 4   merchant_id           object 
 5   month_lag             int16  
 6   purchase_amount       float64
 7   purchase_date         object 
 8   state_id              int16  
 9   subsector_id          int16  
 10  new_merchant_flag     bool   
 11  authorized_flag_Y     bool   
 12  category_1_Y          bool   
 13  category_2_1.0        bool   
 14  category_2_2.0        bool   
 15  category_2_3.0        bool   
 16  category_2_4.0        bool   
 17  category_2_5.0        bool   
 18  category_3_A          bool   
 19  category_3_B          bool   
 20  category_3_C          bool   
dtypes: bool(11), float64(1), int16(6), object(3)
memory usage: 7.5 GB
