This is a test file running on local machine with only 100 records of data. The full dataset is run on Google Colab.

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sdgym.synthesizers import TVAESynthesizer
import datetime
import pickle

In [2]:
# load data
df = pd.read_csv('data/cc_data.csv')
df = df[:100]

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# remove unnecessary columns
col2remove = ['SIC Code', 'Return Amount', 'Reward Amount', 'Transaction ID', 
              'Account Identifier', 'Account Name', 'Account Number', 'Bank Name', 
              'Aggregator Name', 'Consumer ID', 'Consumer Created Date',
              'Transaction String', 'Posted Date', 'Data Creation Date', 
              'Consumer Postal Code', 'Consumer City Name','Ethnicity']
df.drop(col2remove, axis = 1, inplace = True, errors='ignore') # errors option make the columns drop only when exists

In [4]:
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 100


['Account Type',
 'Consumer Gender',
 'Consumer Birth Year',
 'Transaction Type',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date']

In [5]:
# Only keep `purchase` rows for `Transaction Type`, and then remove `Trsansaction Type`
if 'Transaction Type' in df.columns:
    df = df[df['Transaction Type'] == 'purchase']
    df.drop('Transaction Type', axis = 1, inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 95


['Account Type',
 'Consumer Gender',
 'Consumer Birth Year',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date']

In [6]:
# calculate consumer age, any birth year after 2020 is converted to null, and then remove `Consumer Birth Year` column
if 'Consumer Birth Year' in df.columns:
    df['Age'] = df['Consumer Birth Year'].apply(lambda x: 2020 - int(x) if int(x) < 2020 else None)
    df.drop('Consumer Birth Year', axis = 1, inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)  

Row # of the table: 95


['Account Type',
 'Consumer Gender',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date',
 'Age']

In [7]:
# convert `N\A` in `Transation date` into null
df['Transaction Date'].replace({"N\A":None}, inplace=True)
# convert `both` in `Consumer Gender` into null, only keep male and female
df['Consumer Gender'].replace({'both':None}, inplace=True)
# convert `investment_account` and `loans` in `Account Type` into null, only keep bank_account and credit_card
df['Account Type'].replace({'investment_account':None,'loans':None},inplace=True)

In [8]:
# check missing values 
missing_df = df.isnull().sum().reset_index()
missing_df.columns = ['variable', 'missing counts']
missing_df['missing per (%)'] = (missing_df['missing counts'])/df.shape[0]*100
missing_df.sort_values('missing per (%)',ascending = False).reset_index(drop = True)

Unnamed: 0,variable,missing counts,missing per (%)
0,Consumer Gender,1,1.052632
1,Account Type,0,0.0
2,Normalized Retailer,0,0.0
3,SIC Description,0,0.0
4,Purchase Amount,0,0.0
5,Transaction Date,0,0.0
6,Age,0,0.0


In [9]:
# remove missing values above
df.dropna(inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 94


['Account Type',
 'Consumer Gender',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date',
 'Age']

In [10]:
# convert ‘Transaction Date’ into day_of_week (Mon/Tue.) and period_of_month (start, mid and end).
if 'Transaction Date' in df.columns:
    df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
    df['day_of_week'] = df['Transaction Date'].dt.day_name()
    df['day_of_month'] = df['Transaction Date'].dt.day
    df['period_of_month'] = df.apply(lambda x: 'start' if x.day_of_month <= 10 else 'mid' if x.day_of_month <=20 else 'end', axis = 1)
    df.drop(['Transaction Date','day_of_month'], axis = 1, inplace = True)

In [11]:
df.head()

Unnamed: 0,Account Type,Consumer Gender,Normalized Retailer,SIC Description,Purchase Amount,Age,day_of_week,period_of_month
0,credit_card,male,Red Robin,Eating Places,15.52,22,Monday,start
1,bank_account,male,California Thai,Eating Places,11.29,30,Monday,end
2,credit_card,male,Petro-Canada,Gasoline Service Stations,10.78,34,Monday,end
3,bank_account,female,The Beer Store,Liquor Stores,88.9,43,Monday,start
4,bank_account,male,Intermarche,Grocery Stores,23.54,62,Sunday,start


In [13]:
# 'SIC Description' (114) - only keep top N and group the rest into `other`
N = 9
def viewSICCounts(df):
    df_pivot = df.groupby(by = 'SIC Description').size().reset_index(name='Counts')
    df_pivot['Per (%)'] = (df_pivot['Counts'])/df.shape[0]*100
    df_pivot.sort_values(by = 'Counts',ascending = False,inplace = True)
    return df_pivot
    
df_pivot = viewSICCounts(df)
list2keep = list(df_pivot.nlargest(N, 'Counts')['SIC Description'])
print("SIC to keep: ", list2keep)

SIC to keep:  ['Eating Places', 'Gasoline Service Stations', 'Grocery Stores', 'Drug Stores and Proprietary Stores', 'Miscellaneous Food Stores', 'Hardware Stores', 'Family Clothing Stores', 'Book Stores', 'Communications Services, Not Elsewhere Classified']


In [14]:
df['SIC Description'] = df['SIC Description'].apply(lambda x: x if x in list2keep else 'Other')
df_pivot = viewSICCounts(df)
df_pivot.head(N+1)

Unnamed: 0,SIC Description,Counts,Per (%)
3,Eating Places,24,25.531915
9,Other,18,19.148936
5,Gasoline Service Stations,13,13.829787
6,Grocery Stores,12,12.765957
2,Drug Stores and Proprietary Stores,7,7.446809
0,Book Stores,4,4.255319
1,"Communications Services, Not Elsewhere Classified",4,4.255319
4,Family Clothing Stores,4,4.255319
7,Hardware Stores,4,4.255319
8,Miscellaneous Food Stores,4,4.255319


In [15]:
# 'Normalized Retailer' (2449) - 20 dimensions embedding
model = Word2Vec.load('models/perSICperPerson.model')

# remove records with minority retailers (the dictionary only keep retailer that appears at least 5 times)
#df2plot_topN = df2plot[df2plot['SIC'].isin(list2plot)]
df = df[df['Normalized Retailer'].isin(list(model.wv.vocab))]

retailerVec = model.wv[df['Normalized Retailer']]
print(retailerVec.shape)

(94, 20)


In [16]:
# convert retailer vector array into dataframe
df_retailerVec = pd.DataFrame(retailerVec, columns=["retailerVec_%02d" % x for x in range(1,21)]) 
print(df_retailerVec.shape)
df_retailerVec.head()

(94, 20)


Unnamed: 0,retailerVec_01,retailerVec_02,retailerVec_03,retailerVec_04,retailerVec_05,retailerVec_06,retailerVec_07,retailerVec_08,retailerVec_09,retailerVec_10,retailerVec_11,retailerVec_12,retailerVec_13,retailerVec_14,retailerVec_15,retailerVec_16,retailerVec_17,retailerVec_18,retailerVec_19,retailerVec_20
0,0.2078,-0.27456,-0.350884,-1.0994,0.518235,0.750903,-0.509509,0.389361,0.442397,0.117832,0.700974,1.109763,0.026182,-1.066037,0.050293,0.005771,-0.190992,-0.131103,-0.969268,0.692632
1,0.357645,-0.446232,0.187722,-0.985323,-0.203321,0.599157,-0.409267,-0.037081,-0.133219,1.5468,-0.140737,0.27713,0.908329,-0.156879,-0.579887,1.10559,0.594025,-0.154977,-0.479236,0.619747
2,-1.060268,0.038343,0.020715,-0.924681,1.332455,1.47236,-0.160993,0.042216,-1.751077,-0.167724,0.129194,-0.190291,-1.715097,-0.744412,-0.578674,2.006247,-0.022834,0.682452,-2.555646,-1.052045
3,-2.5299,-1.939256,2.024107,-0.547954,0.5095,0.067465,-0.182666,0.026708,-0.85606,2.101601,-0.83034,-1.716847,-0.924528,-2.19206,-0.568949,-0.394575,0.77141,3.295305,-2.517382,-0.529754
4,-0.606716,0.809018,0.913862,-0.044498,0.056555,0.068848,1.614883,0.032399,-1.851361,-0.825765,-0.48682,-0.150212,-0.089762,-1.558538,1.121333,1.728101,0.056044,0.206816,-1.090035,1.058926


In [17]:
# one hot encoding for categorical columns except `Normalized Retailer`
df_dummy = df.copy()
df_dummy.drop('Normalized Retailer', axis = 1, inplace = True, errors='ignore')
df_dummy = pd.get_dummies(df_dummy)
print(df_dummy.shape)
df_dummy.head()

(94, 26)


Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,...,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,period_of_month_end,period_of_month_mid,period_of_month_start
0,15.52,22,0,1,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
1,11.29,30,1,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
2,10.78,34,0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,88.9,43,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,23.54,62,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [18]:
# concatenate df_dummy and df_retailerVec
df_dummy.reset_index(inplace=True,drop=True)
df_retailerVec.reset_index(inplace=True,drop=True)

df_input = pd.concat([df_dummy, df_retailerVec], axis = 1, sort = False, ignore_index = False)
print(df_input.shape)
df_input.head()

(94, 46)


Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,...,retailerVec_11,retailerVec_12,retailerVec_13,retailerVec_14,retailerVec_15,retailerVec_16,retailerVec_17,retailerVec_18,retailerVec_19,retailerVec_20
0,15.52,22,0,1,0,1,0,0,0,1,...,0.700974,1.109763,0.026182,-1.066037,0.050293,0.005771,-0.190992,-0.131103,-0.969268,0.692632
1,11.29,30,1,0,0,1,0,0,0,1,...,-0.140737,0.27713,0.908329,-0.156879,-0.579887,1.10559,0.594025,-0.154977,-0.479236,0.619747
2,10.78,34,0,1,0,1,0,0,0,0,...,0.129194,-0.190291,-1.715097,-0.744412,-0.578674,2.006247,-0.022834,0.682452,-2.555646,-1.052045
3,88.9,43,1,0,1,0,0,0,0,0,...,-0.83034,-1.716847,-0.924528,-2.19206,-0.568949,-0.394575,0.77141,3.295305,-2.517382,-0.529754
4,23.54,62,1,0,0,1,0,0,0,0,...,-0.48682,-0.150212,-0.089762,-1.558538,1.121333,1.728101,0.056044,0.206816,-1.090035,1.058926


In [19]:
# convert pd frame to np array and indicate categorical and oridinal columns
data = df_input.to_numpy()
categorical_columns = [x for x in range(2,46)]
ordinal_columns = [1]

In [20]:
# train the synthesizer
start = datetime.datetime.now()

synthesizer = TVAESynthesizer()
synthesizer.fit(data, categorical_columns, ordinal_columns)

print("TVAE training time: " + str(datetime.datetime.now()-start))

TVAE training time: 0:00:00.279252


In [21]:
# save the synthesizer
def save_object(obj, filename):
    with open(filename, 'wb') as output:  # overwrite any existing file
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

#save_object(synthesizer, 'models/TVAE_synthesizer.pkl')

In [22]:
# load synthesizer from saved object
with open('models/TVAE_synthesizer.pkl', 'rb') as input:
    synthesizer = pickle.load(input)

In [38]:
# check out sample
sampled = synthesizer.sample(1)
np.set_printoptions(suppress = True, precision = 2)
print(sampled)

[[57.38 49.    0.    1.    1.    0.    1.    0.    1.    1.    1.    0.
   0.    0.    1.    1.    1.    0.    0.    0.    1.    0.    0.    1.
   1.    1.    0.9  -1.05  0.03 -1.61  0.01  0.57  1.26 -0.27 -0.18 -0.89
  -0.14  0.28 -0.82 -2.39 -0.22  0.05  0.77  0.02 -0.22  1.25]]


In [41]:
# convert sample back to readable categories
sample = data[0:3]
print(sample)

[[15.52 22.    0.    1.    0.    1.    0.    0.    0.    1.    0.    0.
   0.    0.    0.    0.    0.    1.    0.    0.    0.    0.    0.    0.
   0.    1.    0.21 -0.27 -0.35 -1.1   0.52  0.75 -0.51  0.39  0.44  0.12
   0.7   1.11  0.03 -1.07  0.05  0.01 -0.19 -0.13 -0.97  0.69]
 [11.29 30.    1.    0.    0.    1.    0.    0.    0.    1.    0.    0.
   0.    0.    0.    0.    0.    1.    0.    0.    0.    0.    0.    1.
   0.    0.    0.36 -0.45  0.19 -0.99 -0.2   0.6  -0.41 -0.04 -0.13  1.55
  -0.14  0.28  0.91 -0.16 -0.58  1.11  0.59 -0.15 -0.48  0.62]
 [10.78 34.    0.    1.    0.    1.    0.    0.    0.    0.    0.    1.
   0.    0.    0.    0.    0.    1.    0.    0.    0.    0.    0.    1.
   0.    0.   -1.06  0.04  0.02 -0.92  1.33  1.47 -0.16  0.04 -1.75 -0.17
   0.13 -0.19 -1.72 -0.74 -0.58  2.01 -0.02  0.68 -2.56 -1.05]]


In [42]:
df_sample = pd.DataFrame(sample, columns=list(df_input.columns)) 
print(df_sample.columns)
df_sample.head()

Index(['Purchase Amount', 'Age', 'Account Type_bank_account',
       'Account Type_credit_card', 'Consumer Gender_female',
       'Consumer Gender_male', 'SIC Description_Book Stores',
       'SIC Description_Communications Services, Not Elsewhere Classified',
       'SIC Description_Drug Stores and Proprietary Stores',
       'SIC Description_Eating Places',
       'SIC Description_Family Clothing Stores',
       'SIC Description_Gasoline Service Stations',
       'SIC Description_Grocery Stores', 'SIC Description_Hardware Stores',
       'SIC Description_Miscellaneous Food Stores', 'SIC Description_Other',
       'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday', 'period_of_month_end', 'period_of_month_mid',
       'period_of_month_start', 'retailerVec_01', 'retailerVec_02',
       'retailerVec_03', 'retailerVec_04', 'retailerVec_05', 'retailerVec_06',
       '

Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,...,retailerVec_11,retailerVec_12,retailerVec_13,retailerVec_14,retailerVec_15,retailerVec_16,retailerVec_17,retailerVec_18,retailerVec_19,retailerVec_20
0,15.52,22.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.700974,1.109763,0.026182,-1.066037,0.050293,0.005771,-0.190992,-0.131103,-0.969268,0.692632
1,11.29,30.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,-0.140737,0.27713,0.908329,-0.156879,-0.579887,1.10559,0.594025,-0.154977,-0.479236,0.619747
2,10.78,34.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.129194,-0.190291,-1.715097,-0.744412,-0.578674,2.006247,-0.022834,0.682452,-2.555646,-1.052045


In [74]:
tmp = df_sample.iloc[:, 2:4]
df_ac = pd.Series(tmp.columns[np.where(tmp!=0)[1]], name = 'Account Type')
df_ac = df_ac.map(lambda x: x.replace('Account Type_',''))
df_ac.head()

0     credit_card
1    bank_account
2     credit_card
Name: Account Type, dtype: object

In [76]:
tmp = df_sample.iloc[:, 4:6]
df_ge = pd.Series(tmp.columns[np.where(tmp!=0)[1]], name = 'Consumer Gender')
df_ge = df_ge.map(lambda x: x.replace('Consumer Gender_',''))
df_ge.head()

0    male
1    male
2    male
Name: Consumer Gender, dtype: object

In [78]:
test = pd.concat([df_ac,df_ge], axis=1)
test.head()

Unnamed: 0,Account Type,Consumer Gender
0,credit_card,male
1,bank_account,male
2,credit_card,male


In [12]:
# function to convert day_of_week and period_of_month back to Date, given a year and a month
# year = 2019; month = 7; day_of_week = 'Monday'; period_of_month = 'start'