In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sdgym.synthesizers import TVAESynthesizer

In [2]:
# Load the CSV and keep only its first 1000 rows as a working sample.
df = pd.read_csv('data/cc_data.csv').iloc[:1000]

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Columns that are not needed for the rest of the notebook.
col2remove = [
    'SIC Code',
    'Return Amount',
    'Reward Amount',
    'Transaction ID',
    'Account Identifier',
    'Account Name',
    'Account Number',
    'Bank Name',
    'Aggregator Name',
    'Consumer ID',
    'Consumer Created Date',
    'Transaction String',
    'Posted Date',
    'Data Creation Date',
    'Consumer Postal Code',
    'Consumer City Name',
    'Ethnicity',
]
# errors='ignore' skips names that are not present, so the cell can be re-run safely.
df.drop(columns=col2remove, inplace=True, errors='ignore')

In [4]:
# Report the current number of rows, then display the remaining column names.
print("Row # of the table: %d" % df.shape[0])
df.columns.tolist()

Row # of the table: 1000


['Account Type',
 'Consumer Gender',
 'Consumer Birth Year',
 'Transaction Type',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date']

In [5]:
# Keep only the rows whose `Transaction Type` is `purchase`, then remove the
# `Transaction Type` column, which is constant after the filter.
# The guard makes the cell a no-op when the column has already been removed.
if 'Transaction Type' in df.columns:
    is_purchase = df['Transaction Type'] == 'purchase'
    df = df.loc[is_purchase].drop(columns='Transaction Type')
print("Row # of the table: %d" % df.shape[0])
df.columns.tolist()

Row # of the table: 975


['Account Type',
 'Consumer Gender',
 'Consumer Birth Year',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date']

In [6]:
# Derive the consumer's age relative to REFERENCE_YEAR and remove `Consumer Birth Year`.
# Birth years that are missing, non-numeric, or not strictly before REFERENCE_YEAR
# (i.e. 2020 or later) become null in `Age` instead of raising or yielding an age <= 0.
REFERENCE_YEAR = 2020
if 'Consumer Birth Year' in df.columns:
    # errors='coerce' turns unparsable values into NaN rather than raising, unlike int(x).
    birth_year = pd.to_numeric(df['Consumer Birth Year'], errors='coerce')
    # .where keeps the computed age only where the condition holds and puts NaN elsewhere;
    # comparisons against NaN are False, so missing birth years also end up as NaN.
    age = (REFERENCE_YEAR - birth_year).where(birth_year < REFERENCE_YEAR)
    # assign appends `Age` as the last column, matching the original column order.
    df = df.assign(Age=age).drop(columns='Consumer Birth Year')
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 975


['Account Type',
 'Consumer Gender',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date',
 'Age']

In [7]:
# Recode placeholder / out-of-scope category values as missing values (NaN).
# Each result is assigned back to the column: calling replace(inplace=True) on a
# column selection mutates a temporary object and does not update `df` when pandas
# Copy-on-Write is enabled.

# convert `N\A` in `Transaction Date` into null
# (raw string: `\A` is not a valid escape sequence in a normal string literal)
df['Transaction Date'] = df['Transaction Date'].replace({r"N\A": np.nan})
# convert `both` in `Consumer Gender` into null, only keep male and female
df['Consumer Gender'] = df['Consumer Gender'].replace({'both': np.nan})
# convert `investment_account` and `loans` in `Account Type` into null, only keep bank_account and credit_card
df['Account Type'] = df['Account Type'].replace({'investment_account': np.nan, 'loans': np.nan})

In [8]:
# Summarise missing values: one row per column with its null count and the
# percentage of rows affected, displayed from most to least affected.
missing_df = (
    df.isnull()
    .sum()
    .rename_axis('variable')
    .reset_index(name='missing counts')
)
missing_df['missing per (%)'] = missing_df['missing counts'] / df.shape[0] * 100
missing_df.sort_values(by='missing per (%)', ascending=False).reset_index(drop=True)

Unnamed: 0,variable,missing counts,missing per (%)
0,Consumer Gender,5,0.512821
1,Age,3,0.307692
2,Transaction Date,2,0.205128
3,Account Type,0,0.0
4,Normalized Retailer,0,0.0
5,SIC Description,0,0.0
6,Purchase Amount,0,0.0


In [9]:
# Drop every row that has a null in any column (the nulls counted in the summary above).
df.dropna(axis=0, how='any', inplace=True)
print("Row # of the table: %d" % df.shape[0])
df.columns.tolist()

Row # of the table: 965


['Account Type',
 'Consumer Gender',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date',
 'Age']

In [10]:
# Replace `Transaction Date` with two categorical features:
#   day_of_week     - weekday name (Monday, Tuesday, ...)
#   period_of_month - 'start' (day 1-10), 'mid' (day 11-20) or 'end' (day 21 and later)
# The guard makes the cell a no-op once `Transaction Date` has been removed.
if 'Transaction Date' in df.columns:
    transaction_date = pd.to_datetime(df['Transaction Date'])
    day_of_month = transaction_date.dt.day
    # np.select takes the first matching condition, so the order of the conditions
    # reproduces the original if / elif / else chain without a row-wise apply.
    period_of_month = np.select(
        [day_of_month <= 10, day_of_month <= 20],
        ['start', 'mid'],
        default='end',
    )
    df = df.assign(
        day_of_week=transaction_date.dt.day_name(),
        period_of_month=period_of_month,
    ).drop(columns='Transaction Date')

In [11]:
# Preview the first five rows of the transformed table.
df.head(n=5)

Unnamed: 0,Account Type,Consumer Gender,Normalized Retailer,SIC Description,Purchase Amount,Age,day_of_week,period_of_month
0,credit_card,male,Red Robin,Eating Places,15.52,22.0,Monday,start
1,bank_account,male,California Thai,Eating Places,11.29,30.0,Monday,end
2,credit_card,male,Petro-Canada,Gasoline Service Stations,10.78,34.0,Monday,end
3,bank_account,female,The Beer Store,Liquor Stores,88.9,43.0,Monday,start
4,bank_account,male,Intermarche,Grocery Stores,23.54,62.0,Sunday,start


In [12]:
# TODO: implement a function that converts day_of_week and period_of_month back to a date, given a year and a month
# e.g. year = 2019; month = 7; day_of_week = 'Monday'; period_of_month = 'start'

In [13]:
# 'SIC Description' (114 - presumably the number of distinct values; TODO confirm):
# keep only the N most frequent descriptions and group the rest into `Other`.
# N is the number of top descriptions to keep.
N = 9
def viewSICCounts(df):
    """Return a frequency table of the 'SIC Description' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'SIC Description' column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct description with the columns 'SIC Description',
        'Counts' (number of rows) and 'Per (%)' (share of all rows of `df`,
        in percent), sorted by 'Counts' in descending order.
    """
    sic_sizes = df.groupby(by='SIC Description').size()
    summary = sic_sizes.reset_index(name='Counts')
    summary['Per (%)'] = summary['Counts'] / df.shape[0] * 100
    return summary.sort_values(by='Counts', ascending=False)
    
# Build the frequency table and take the N most frequent SIC descriptions.
df_pivot = viewSICCounts(df)
top_sic_rows = df_pivot.nlargest(N, 'Counts')
list2keep = top_sic_rows['SIC Description'].tolist()
print("SIC to keep: ", list2keep)

SIC to keep:  ['Eating Places', 'Grocery Stores', 'Gasoline Service Stations', 'Drug Stores and Proprietary Stores', 'Book Stores', 'Family Clothing Stores', 'Taxicabs', 'Radiotelephone Communications', 'Communications Services, Not Elsewhere Classified']


In [14]:
# Collapse every SIC description that is not in `list2keep` into a single 'Other'
# bucket, then rebuild the frequency table to inspect the result.
is_kept = df['SIC Description'].isin(list2keep)
df['SIC Description'] = df['SIC Description'].where(is_kept, 'Other')
df_pivot = viewSICCounts(df)
df_pivot.head(N + 1)

Unnamed: 0,SIC Description,Counts,Per (%)
3,Eating Places,276,28.601036
7,Other,203,21.036269
6,Grocery Stores,189,19.585492
5,Gasoline Service Stations,108,11.19171
2,Drug Stores and Proprietary Stores,47,4.870466
0,Book Stores,44,4.559585
4,Family Clothing Stores,31,3.212435
9,Taxicabs,24,2.487047
8,Radiotelephone Communications,22,2.279793
1,"Communications Services, Not Elsewhere Classified",21,2.176166


In [15]:
# 'Normalized Retailer' (2449) - 20 dimensions embedding
# Load the saved Word2Vec model and look up one vector per row, keyed by the
# row's retailer name; the resulting array has one row per transaction.
model = Word2Vec.load('models/perSICperPerson.model')
retailer_names = df['Normalized Retailer'].tolist()
retailerVec = model.wv[retailer_names]
retailerVec.shape

(965, 20)