<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Check-Data" data-toc-modified-id="Check-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Check Data</a></span></li><li><span><a href="#Clean-Data" data-toc-modified-id="Clean-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Clean Data</a></span></li><li><span><a href="#Anonymize-Member-IDs" data-toc-modified-id="Anonymize-Member-IDs-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Anonymize Member IDs</a></span></li></ul></div>

<div class='alert alert-block alert-info'>
<b>Note:</b> This notebook does NOT contain the code for the medium article, but contains some preparatory cleaning steps for the data.
</div>

In [26]:
import datetime as dt
import sys
from pathlib import Path
from typing import Dict, List, Tuple

import codebook.EDA as EDA
import codebook.clean as clean
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
plt.style.use('raph-base')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

np.random.seed(666)

In [3]:
print(sys.executable)
print(sys.version)
print(f'Pandas {pd.__version__}')

C:\Users\r2d4\miniconda3\envs\py3\python.exe
3.8.3 (default, May 19 2020, 06:50:17) [MSC v.1916 64 bit (AMD64)]
Pandas 1.1.3


In [6]:
# Load from feather file
data_raw = pd.read_parquet("data/0_trx_data_old.parquet")

### Check Data

In [8]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1623942 entries, 0 to 1623941
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype         
---  ------    --------------    -----         
 0   member    1623942 non-null  int64         
 1   date      1623942 non-null  datetime64[ns]
 2   trx_type  1623942 non-null  object        
 3   device    1623942 non-null  object        
 4   value     1623942 non-null  float32       
 5   discount  1623942 non-null  float32       
dtypes: datetime64[ns](1), float32(2), int64(1), object(2)
memory usage: 61.9+ MB


In [57]:
# Check n of customers
f"# customers in dataset: {data_raw['member'].nunique():,.0f}"

'# customers in dataset: 129,659'

In [11]:
# Check distinct devices and trx types
EDA.display_value_counts(data_raw[["trx_type", "device"]])

Unnamed: 0,counts,prop,cum_prop
Purchase,1188433,73.2%,73.2%
Activation,262115,16.1%,89.3%
Redemption,161750,10.0%,99.3%
Return,11644,0.7%,100.0%


Unnamed: 0,counts,prop,cum_prop
Loyalty,734984,45.3%,45.3%
Payment,465093,28.6%,73.9%
Financial Voucher,423865,26.1%,100.0%


In [12]:
# Check observation period
data_raw["date"].min(), data_raw["date"].max()

(Timestamp('2018-01-01 00:00:00'), Timestamp('2019-12-31 00:00:00'))

### Clean Data

In [16]:
data_clean = data_raw.copy()

# Remove return transactions for simplicity
data_clean = data_clean[~data_clean["trx_type"].isin(["Return"])]

# Remove the distinction between the payment devices (loyalty categories) for simplicity
data_clean["device"] = data_clean["device"].replace("Loyalty", "Payment")

# Rename Vouchers for simplicity
data_clean["device"] = data_clean["device"].replace("Financial Voucher", "Loyalty Voucher")

In [17]:
# Check results
EDA.display_value_counts(data_clean[["trx_type", "device"]])

Unnamed: 0,counts,prop,cum_prop
Purchase,1188433,73.7%,73.7%
Activation,262115,16.3%,90.0%
Redemption,161750,10.0%,100.0%


Unnamed: 0,counts,prop,cum_prop
Payment,1188433,73.7%,73.7%
Loyalty Voucher,423865,26.3%,100.0%


### Anonymize Member IDs

In [39]:
def encode_IDs(
    id_column: pd.Series
) -> Tuple[Dict[str, str], List[str]]:
    """Return a dictionary with original ID as key
    and replacement string as value for every unique
    ID, and also a list of the replacements to be 
    used as stand-in for the original column.
    """
    id_column = id_column.astype(str).copy()
    len_encoding = len(str(id_column.nunique()))
    coded_dict = {}
    cter = 1
    coded_values = []
    
    for val in id_column:
        if val not in coded_dict:
            ctr_str = str(cter)
            coded_dict[val] = ctr_str.zfill(len_encoding - len(ctr_str))
            cter += 1
        coded_values.append(coded_dict[val])

    return coded_dict, coded_values

In [41]:
coded_dict, coded_values = encode_IDs(data_clean["member"])

In [45]:
data_coded = data_clean.copy()
data_coded["member"] = coded_values

In [55]:
# Check results
list(coded_dict.items())[-5:]

assert data_clean[data_clean["member"] == 249323]["value"].equals(
    data_coded[data_coded["member"] == "129625"]["value"]
)

[('249323', '129625'),
 ('249335', '129626'),
 ('249337', '129627'),
 ('249372', '129628'),
 ('249373', '129629')]

In [62]:
# Save to parquet file - CHECK YOUR INDEX
filename = '1_trx_data_clean.parquet'
rel_path = 'data'

if not Path(rel_path).exists():
    Path(rel_path).mkdir()
data_coded.to_parquet(Path(rel_path) / filename, index=False)