This is a test file running on local machine with only 100 records of data. The full dataset is run on Google Colab.

In [26]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sdgym.synthesizers import TVAESynthesizer
import datetime
import pickle
import random

In [27]:
# load data
df = pd.read_csv('data/cc_data.csv')
#df = df.sample(n = 1000, random_state = 1)
#df.reset_index(drop = True, inplace = True)

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
# remove unnecessary columns
col2remove = ['SIC Code', 'Return Amount', 'Reward Amount', 'Transaction ID', 
              'Account Identifier', 'Account Name', 'Account Number', 'Bank Name', 
              'Aggregator Name', 'Consumer ID', 'Consumer Created Date',
              'Transaction String', 'Posted Date', 'Data Creation Date', 
              'Consumer Postal Code', 'Consumer City Name','Ethnicity']
df.drop(col2remove, axis = 1, inplace = True, errors='ignore') # errors option make the columns drop only when exists

In [29]:
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 3615245


['Account Type',
 'Consumer Gender',
 'Consumer Birth Year',
 'Transaction Type',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date']

In [30]:
# Only keep `purchase` rows for `Transaction Type`, and then remove `Trsansaction Type`
if 'Transaction Type' in df.columns:
    df = df[df['Transaction Type'] == 'purchase']
    df.drop('Transaction Type', axis = 1, inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 3534545


['Account Type',
 'Consumer Gender',
 'Consumer Birth Year',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date']

In [31]:
# calculate consumer age, any birth year after 2020 is converted to null, and then remove `Consumer Birth Year` column
if 'Consumer Birth Year' in df.columns:
    df['Age'] = df['Consumer Birth Year'].apply(lambda x: 2020 - int(x) if int(x) < 2020 else None)
    df.drop('Consumer Birth Year', axis = 1, inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)  

Row # of the table: 3534545


['Account Type',
 'Consumer Gender',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date',
 'Age']

In [32]:
# convert `N\A` in `Transation date` into null
df['Transaction Date'].replace({"N\A":None}, inplace=True)
# convert `both` in `Consumer Gender` into null, only keep male and female
df['Consumer Gender'].replace({'both':None}, inplace=True)
# convert `investment_account` and `loans` in `Account Type` into null, only keep bank_account and credit_card
df['Account Type'].replace({'investment_account':None,'loans':None},inplace=True)

In [33]:
# check missing values 
missing_df = df.isnull().sum().reset_index()
missing_df.columns = ['variable', 'missing counts']
missing_df['missing per (%)'] = (missing_df['missing counts'])/df.shape[0]*100
missing_df.sort_values('missing per (%)',ascending = False).reset_index(drop = True)

Unnamed: 0,variable,missing counts,missing per (%)
0,Consumer Gender,20436,0.578179
1,Age,17817,0.504082
2,Transaction Date,7109,0.201129
3,Account Type,386,0.010921
4,Normalized Retailer,0,0.0
5,SIC Description,0,0.0
6,Purchase Amount,0,0.0


In [34]:
# remove missing values above
df.dropna(inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 3489326


['Account Type',
 'Consumer Gender',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date',
 'Age']

In [None]:
#df.to_csv('data/cc_data_processed.csv')

In [35]:
# convert ‘Transaction Date’ into day_of_week (Mon/Tue.) and period_of_month (start, mid and end).
if 'Transaction Date' in df.columns:
    df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
    df['day_of_week'] = df['Transaction Date'].dt.day_name()
    df['day_of_month'] = df['Transaction Date'].dt.day
    df['period_of_month'] = df.apply(lambda x: 'start' if x.day_of_month <= 10 else 'mid' if x.day_of_month <=20 else 'end', axis = 1)
    df.drop(['Transaction Date','day_of_month'], axis = 1, inplace = True)

In [36]:
df.head()

Unnamed: 0,Account Type,Consumer Gender,Normalized Retailer,SIC Description,Purchase Amount,Age,day_of_week,period_of_month
0,credit_card,male,Red Robin,Eating Places,15.52,22.0,Monday,start
1,bank_account,male,California Thai,Eating Places,11.29,30.0,Monday,end
2,credit_card,male,Petro-Canada,Gasoline Service Stations,10.78,34.0,Monday,end
3,bank_account,female,The Beer Store,Liquor Stores,88.9,43.0,Monday,start
4,bank_account,male,Intermarche,Grocery Stores,23.54,62.0,Sunday,start


In [37]:
# bucket age into categorical values every 5 years
bin = list(range(20,80,5))
print(bin)

[20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75]


In [38]:
# use pd.cut function can attribute the values into its specific bins 
age_cat = pd.cut(df.Age, bin).to_frame()
age_cat.columns = ['Age Range'] 
print(age_cat)

if "Age Range" not in df:
    df = pd.concat([df, age_cat],axis = 1)
    
df.drop(['Age'], axis = 1, inplace = True, errors = 'ignore')
df.dropna(inplace = True)

        Age Range
0        (20, 25]
1        (25, 30]
2        (30, 35]
3        (40, 45]
4        (60, 65]
...           ...
3615240  (25, 30]
3615241  (60, 65]
3615242  (35, 40]
3615243  (40, 45]
3615244  (30, 35]

[3489326 rows x 1 columns]


In [39]:
# 'SIC Description' (114) - only keep top N and group the rest into `other`
N = 9
def viewSICCounts(df,col_name):
    df_pivot = df.groupby(by = col_name).size().reset_index(name='Counts')
    df_pivot['Per (%)'] = (df_pivot['Counts'])/df.shape[0]*100
    df_pivot.sort_values(by = 'Counts',ascending = False,inplace = True)
    return df_pivot
    
df_pivot = viewSICCounts(df,'SIC Description')
list2keep = list(df_pivot.nlargest(N, 'Counts')['SIC Description'])
print("SIC to keep: ", list2keep)

SIC to keep:  ['Eating Places', 'Grocery Stores', 'Gasoline Service Stations', 'Book Stores', 'Drug Stores and Proprietary Stores', 'Miscellaneous Food Stores', 'Family Clothing Stores', 'Taxicabs', 'Communications Services, Not Elsewhere Classified']


In [40]:
df['SIC Description'] = df['SIC Description'].apply(lambda x: x if x in list2keep else 'Other')
df_pivot = viewSICCounts(df,'SIC Description')
df_pivot.head(N+1)

Unnamed: 0,SIC Description,Counts,Per (%)
3,Eating Places,1055248,30.872618
6,Grocery Stores,685772,20.063129
8,Other,652486,19.089305
5,Gasoline Service Stations,356695,10.435564
0,Book Stores,174882,5.116395
2,Drug Stores and Proprietary Stores,165648,4.846242
7,Miscellaneous Food Stores,101529,2.97036
4,Family Clothing Stores,90386,2.644357
9,Taxicabs,72931,2.133689
1,"Communications Services, Not Elsewhere Classified",62494,1.828341


In [41]:
# 'Normalized Retailer' (2449) - 20 dimensions embedding
model = Word2Vec.load('models/perSICperPerson_10emb.model')

# remove records with minority retailers (the dictionary only keep retailer that appears at least 5 times)
#df2plot_topN = df2plot[df2plot['SIC'].isin(list2plot)]
df = df[df['Normalized Retailer'].isin(list(model.wv.vocab))]

retailerVec = model.wv[df['Normalized Retailer']]
print(retailerVec.shape)

(3416377, 10)


In [42]:
# convert retailer vector array into dataframe
df_retailerVec = pd.DataFrame(retailerVec, columns=["retailerVec_%02d" % x for x in range(1,(retailerVec.shape[1])+1)]) 
print(df_retailerVec.shape)
df_retailerVec.head()

(3416377, 10)


Unnamed: 0,retailerVec_01,retailerVec_02,retailerVec_03,retailerVec_04,retailerVec_05,retailerVec_06,retailerVec_07,retailerVec_08,retailerVec_09,retailerVec_10
0,1.411963,-0.549346,0.365458,1.269856,0.210976,0.038479,-0.653121,0.188011,-1.166497,0.239776
1,0.508452,-1.069697,0.593561,0.035531,-0.84421,0.809845,-0.278989,-0.422637,-1.637115,-0.360624
2,1.570207,-0.383605,-1.320911,2.027106,-0.79294,-1.965527,-0.41895,-2.403353,-1.935519,0.071744
3,0.339553,-2.429187,0.633265,2.140538,1.151282,-0.408895,-0.097785,-3.707122,-3.125121,-0.018113
4,-0.380342,-0.296849,-0.935541,-0.371411,0.178476,-1.70638,-0.077011,-0.780747,-2.339961,-2.085466


In [43]:
# one hot encoding for categorical columns except `Normalized Retailer`
df_dummy = df.copy()
df_dummy.drop('Normalized Retailer', axis = 1, inplace = True, errors='ignore')
df_dummy = pd.get_dummies(df_dummy)
print(df_dummy.shape)
df_dummy.head()

(3416377, 36)


Unnamed: 0,Purchase Amount,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,SIC Description_Family Clothing Stores,...,"Age Range_(25, 30]","Age Range_(30, 35]","Age Range_(35, 40]","Age Range_(40, 45]","Age Range_(45, 50]","Age Range_(50, 55]","Age Range_(55, 60]","Age Range_(60, 65]","Age Range_(65, 70]","Age Range_(70, 75]"
0,15.52,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,11.29,1,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,10.78,0,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,88.9,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,23.54,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [44]:
# concatenate df_dummy and df_retailerVec
df_dummy.reset_index(inplace=True,drop=True)
df_retailerVec.reset_index(inplace=True,drop=True)
df_input = pd.concat([df_dummy, df_retailerVec], axis = 1, sort = False, ignore_index = False)

In [45]:
#df_input.shape
#df_input['day_of_week_Tuesday'].value_counts()
#df_input['period_of_month_end'].value_counts()

In [46]:
#df_input.to_csv('data/cc_data_input_10emb_ageCat75.csv')

In [47]:
#df_input = df_input.sample(n = 500)
print(df_input.shape)
df_input.head()

(3416377, 46)


Unnamed: 0,Purchase Amount,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,SIC Description_Family Clothing Stores,...,retailerVec_01,retailerVec_02,retailerVec_03,retailerVec_04,retailerVec_05,retailerVec_06,retailerVec_07,retailerVec_08,retailerVec_09,retailerVec_10
0,15.52,0,1,0,1,0,0,0,1,0,...,1.411963,-0.549346,0.365458,1.269856,0.210976,0.038479,-0.653121,0.188011,-1.166497,0.239776
1,11.29,1,0,0,1,0,0,0,1,0,...,0.508452,-1.069697,0.593561,0.035531,-0.84421,0.809845,-0.278989,-0.422637,-1.637115,-0.360624
2,10.78,0,1,0,1,0,0,0,0,0,...,1.570207,-0.383605,-1.320911,2.027106,-0.79294,-1.965527,-0.41895,-2.403353,-1.935519,0.071744
3,88.9,1,0,1,0,0,0,0,0,0,...,0.339553,-2.429187,0.633265,2.140538,1.151282,-0.408895,-0.097785,-3.707122,-3.125121,-0.018113
4,23.54,1,0,0,1,0,0,0,0,0,...,-0.380342,-0.296849,-0.935541,-0.371411,0.178476,-1.70638,-0.077011,-0.780747,-2.339961,-2.085466


In [48]:
df_input.columns

Index(['Purchase Amount', 'Account Type_bank_account',
       'Account Type_credit_card', 'Consumer Gender_female',
       'Consumer Gender_male', 'SIC Description_Book Stores',
       'SIC Description_Communications Services, Not Elsewhere Classified',
       'SIC Description_Drug Stores and Proprietary Stores',
       'SIC Description_Eating Places',
       'SIC Description_Family Clothing Stores',
       'SIC Description_Gasoline Service Stations',
       'SIC Description_Grocery Stores',
       'SIC Description_Miscellaneous Food Stores', 'SIC Description_Other',
       'SIC Description_Taxicabs', 'day_of_week_Friday', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday', 'period_of_month_end',
       'period_of_month_mid', 'period_of_month_start', 'Age Range_(20, 25]',
       'Age Range_(25, 30]', 'Age Range_(30, 35]', 'Age Range_(35, 40]',
       'Age Range_(40, 45]', 'Age Range_(4

In [24]:
# convert pd frame to np array and indicate categorical and oridinal columns
df_input_sample = df_input.sample(n=5000)
data = df_input_sample.to_numpy()
#categorical_columns = [x for x in range(25,39)]
#ordinal_columns = [1]
#print(categorical_columns)

In [185]:
# train the synthesizer
start = datetime.datetime.now()

synthesizer = TVAESynthesizer()
synthesizer.fit(data)
#synthesizer.fit(data,categorical_columns)
#synthesizer.fit(data, categorical_columns, ordinal_columns)

print("TVAE training time: " + str(datetime.datetime.now()-start))

  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_


  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_




TVAE training time: 0:04:56.981373


In [186]:
# save the synthesizer
def save_object(obj, filename):
    with open(filename, 'wb') as output:  # overwrite any existing file
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

#save_object(synthesizer, 'models/TVAE_synthesizer_test.pkl')

In [25]:
# load synthesizer from saved object
#with open('models/TVAE_synthesizer_test.pkl', 'rb') as input:
#    synthesizer = pickle.load(input)

In [157]:
# check out sample
sampled = synthesizer.sample(2)
np.set_printoptions(suppress = True, precision = 5)
print(sampled)

[[16.60225  0.00066  0.99923  0.9985   0.00107 -0.00048 -0.00001  0.00026
   0.00041 -0.00009 -0.00031  0.00042  0.00019 -0.00063  0.00028  0.00113
  -0.00027  0.00057  0.00009  0.00112  0.00086  0.00002  0.00124  0.00169
  -0.00006  0.00099  0.00044  0.00016  0.00039 -0.00063 -0.00029 -0.00075
  -0.00016  0.0002   0.00044 -0.00008 -0.00045  0.02342 -0.6636  -0.56721
   0.37705 -0.43591 -2.48369 -0.04504 -2.46961 -2.33033 -1.37893]
 [10.53475  0.99788  0.00175  0.9985   0.00038  0.00028 -0.00019  0.00027
   0.00126  0.0003   0.00008  0.00115  0.00037 -0.00036  0.00013  0.00072
   0.00002 -0.00012 -0.      -0.00036 -0.00027  0.00004  0.00034  0.00177
   0.00141  0.00012 -0.00025  0.00073  0.00096 -0.00013  0.00022  0.00023
  -0.00045  0.00052  0.0001  -0.00002 -0.00007  0.68505 -0.83289  0.43912
   0.56328 -0.57185  0.18821 -0.27283 -0.41907 -0.49189  0.31846]]


In [78]:
# convert sample back to readable categories
sample = data[0:3]
print(sample)

[[115.23      1.        0.        1.        0.        0.        0.
    0.        0.        0.        0.        0.        0.        1.
    0.        0.        0.        0.        1.        0.        0.
    0.        1.        0.        0.        0.        0.        0.
    0.        0.        1.        0.        0.        0.        0.
    0.        0.        0.        0.        2.03871  -0.07473   2.1519
   -0.98602  -1.24199  -1.65323  -0.86328  -4.63782   0.86199  -0.33207]
 [ 52.07      0.        1.        1.        0.        0.        0.
    0.        0.        0.        0.        1.        0.        0.
    0.        1.        0.        0.        0.        0.        0.
    0.        1.        0.        0.        0.        0.        0.
    0.        0.        0.        0.        0.        0.        1.
    0.        0.        0.        0.       -0.09776  -0.87034  -1.47865
    0.40115  -0.33904  -2.61377  -0.93507  -0.77392  -0.68241   0.07708]
 [ 10.78      1.        0.        0.     

In [28]:
df_sample = pd.DataFrame(sample, columns=list(df_input.columns)) 
print(df_sample.columns)
df_sample.head()

Index(['Purchase Amount', 'Age', 'Account Type_bank_account',
       'Account Type_credit_card', 'Consumer Gender_female',
       'Consumer Gender_male', 'SIC Description_Book Stores',
       'SIC Description_Communications Services, Not Elsewhere Classified',
       'SIC Description_Department Stores',
       'SIC Description_Drug Stores and Proprietary Stores',
       'SIC Description_Eating Places',
       'SIC Description_Gasoline Service Stations',
       'SIC Description_Grocery Stores',
       'SIC Description_Miscellaneous Food Stores', 'SIC Description_Other',
       'SIC Description_Taxicabs', 'day_of_week_Friday', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday', 'period_of_month_end',
       'period_of_month_mid', 'period_of_month_start', 'retailerVec_01',
       'retailerVec_02', 'retailerVec_03', 'retailerVec_04', 'retailerVec_05',
       'retailerVec_06', 'retailerVec_

Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Department Stores,SIC Description_Drug Stores and Proprietary Stores,...,retailerVec_01,retailerVec_02,retailerVec_03,retailerVec_04,retailerVec_05,retailerVec_06,retailerVec_07,retailerVec_08,retailerVec_09,retailerVec_10
0,115.23,41.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.03871,-0.07473,2.151897,-0.986016,-1.241991,-1.653229,-0.863275,-4.637819,0.861988,-0.332074
1,52.07,62.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.097765,-0.870343,-1.478647,0.401151,-0.339043,-2.613766,-0.935073,-0.773921,-0.682407,0.077079
2,10.78,59.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.244991,-0.432441,-0.757661,-0.632093,-0.452941,-2.614851,-0.341926,-1.246478,-1.085705,-0.772848


In [29]:
# return series of dummy variables with given column name
def reverse_dummy(df, col_name):
    # get index of columns that starts with col_name, for example, Gender_male, Gender_female for col_name = `Gender`
    idx = [i for i, s in enumerate(list(df.columns)) if col_name in s]
    tmp = df.iloc[:,idx]
    # convert selected dataframe to series (only works well when only one 1 in each row)
    df_output = pd.Series(tmp.columns[np.where(tmp!=0)[1]], name = col_name)
    # remove strings with col_name plus underscore
    df_output = df_output.map(lambda x: x.replace(col_name + '_',''))
    return df_output

In [30]:
df_num = df_sample.iloc[:,[0,1]]
df_account = reverse_dummy(df_sample,'Account Type')
df_gender = reverse_dummy(df_sample,'Consumer Gender')
df_SIC = reverse_dummy(df_sample,'SIC Description')
df_dw = reverse_dummy(df_sample,'day_of_week')
df_pm = reverse_dummy(df_sample,'period_of_month')

In [31]:
df_sample.loc[1,'Account Type_bank_account']= 0
df_sample

Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Department Stores,SIC Description_Drug Stores and Proprietary Stores,...,retailerVec_01,retailerVec_02,retailerVec_03,retailerVec_04,retailerVec_05,retailerVec_06,retailerVec_07,retailerVec_08,retailerVec_09,retailerVec_10
0,115.23,41.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.03871,-0.07473,2.151897,-0.986016,-1.241991,-1.653229,-0.863275,-4.637819,0.861988,-0.332074
1,52.07,62.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.097765,-0.870343,-1.478647,0.401151,-0.339043,-2.613766,-0.935073,-0.773921,-0.682407,0.077079
2,10.78,59.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.244991,-0.432441,-0.757661,-0.632093,-0.452941,-2.614851,-0.341926,-1.246478,-1.085705,-0.772848


In [32]:
df_account

0    bank_account
1     credit_card
2    bank_account
Name: Account Type, dtype: object

In [33]:
df_reverse = pd.concat([df_num,df_account,df_gender,df_SIC,df_dw,df_pm], axis=1)
df_reverse.head()

Unnamed: 0,Purchase Amount,Age,Account Type,Consumer Gender,SIC Description,day_of_week,period_of_month
0,115.23,41.0,bank_account,female,Other,Sunday,end
1,52.07,62.0,credit_card,female,Grocery Stores,Friday,end
2,10.78,59.0,bank_account,male,Grocery Stores,Tuesday,start


In [34]:
# given period_of_month (start, mid or end) and year (1989) and month (6), return index of the possible days
def return_day_index(period_of_month, year, month):
    if period_of_month == 'start':
        return list(range(1,11))
    elif period_of_month == 'mid':
        return list(range(11,21))
    else:
        return list(range(21, pd.Period(str(year) + '-' + str(month)).days_in_month + 1))

In [35]:
# function to convert day_of_week and period_of_month back to Date, given a year and a month
# Example:
# input: day_of_week = 'Monday'; period_of_month = 'start'; Y = 2020; M = 2
# output: 2020-02-03 00:00:00

def return_date(day_of_week, period_of_month, Y, M):
    # return list of days
    D = return_day_index(period_of_month, Y, M)

    tmp = pd.DataFrame({'year': [str(Y) for i in range(len(D))],
                        'month': [str(M) for i in range(len(D))],
                        'day': D})
    
    # create table with each row of year, month and day in given period
    date_period = pd.to_datetime(tmp[['year', 'month', 'day']])

    # locate index of which date is the given day_of_week
    idx_list = [i for i, s in enumerate(date_period.dt.strftime('%A')) if day_of_week in s]

    # randomly pick up one index, since it's possible one period has multiple given weekday (say Monday)
    idx = random.choice(idx_list)

    return date_period[idx]

In [36]:
Y = 2022; M = 2
df_reverse['Date'] = df_reverse.apply(lambda x: return_date(x.day_of_week, x.period_of_month, Y, M), axis = 1)
df_reverse.drop(['day_of_week','period_of_month'], axis = 1, inplace = True, errors = 'ignore')
df_reverse.head()

Unnamed: 0,Purchase Amount,Age,Account Type,Consumer Gender,SIC Description,Date
0,115.23,41.0,bank_account,female,Other,2022-02-27
1,52.07,62.0,credit_card,female,Grocery Stores,2022-02-25
2,10.78,59.0,bank_account,male,Grocery Stores,2022-02-01


In [37]:
# reverse retailerVec back to retailers
idx = [i for i, s in enumerate(list(df_sample.columns)) if 'retailerVec' in s]
df_retailerVec = df_sample.iloc[:,idx]
df_retailerVec.head()

Unnamed: 0,retailerVec_01,retailerVec_02,retailerVec_03,retailerVec_04,retailerVec_05,retailerVec_06,retailerVec_07,retailerVec_08,retailerVec_09,retailerVec_10
0,2.03871,-0.07473,2.151897,-0.986016,-1.241991,-1.653229,-0.863275,-4.637819,0.861988,-0.332074
1,-0.097765,-0.870343,-1.478647,0.401151,-0.339043,-2.613766,-0.935073,-0.773921,-0.682407,0.077079
2,-0.244991,-0.432441,-0.757661,-0.632093,-0.452941,-2.614851,-0.341926,-1.246478,-1.085705,-0.772848


In [38]:
## find out retailer with the most similar vector
#df_reverse['Retailer'] = df_retailerVec.apply(lambda x: model.similar_by_vector(x.to_numpy(),topn = 1)[0][0], axis = 1)
#df_reverse.head()

In [39]:
# load the retailer_map
with open('models/retailer_map.pkl', 'rb') as input:
    retailer_map = pickle.load(input)    
    
# only keep values in the model (more than 5 times appearance)
for key, value in retailer_map.items():
    retailer_map[key] = list(set(value) & set(model.wv.vocab))
    
# group other SIC (after top N) into other
other_list = []    
for key, value in retailer_map.items():
    if key not in list2keep:
        other_list = other_list + value

# delete other group key-values in retailer_map
retailer_map_grouped = {k: retailer_map[k] for k in list2keep}

# add other key-value pairs
retailer_map_grouped['Other'] = other_list

In [40]:
# put retailer vectors into one column
df_retailerVec['retailerVec']= df_retailerVec.values.tolist()
df_SIC_vector = pd.concat([df_SIC,df_retailerVec['retailerVec']],axis = 1)
df_SIC_vector.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,SIC Description,retailerVec
0,Other,"[2.038710355758667, -0.07472971826791763, 2.15..."
1,Grocery Stores,"[-0.09776465594768524, -0.8703432083129883, -1..."
2,Grocery Stores,"[-0.2449909895658493, -0.4324410855770111, -0...."


In [41]:
# find out retailer in the same SIC and also with the most similar vector
def return_retailer_SIC(model, retailer_map_grouped, sector, vector):
    word_list = retailer_map_grouped[sector]
    min_idx = model.wv.distances(vector, other_words = word_list).argmin()
    return word_list[min_idx]

In [42]:
df_reverse['Normalized Retailer'] = df_SIC_vector.apply(lambda x: return_retailer_SIC(model, retailer_map_grouped, x['SIC Description'],x['retailerVec']), axis = 1)
df_reverse.head()

Unnamed: 0,Purchase Amount,Age,Account Type,Consumer Gender,SIC Description,Date,Normalized Retailer
0,115.23,41.0,bank_account,female,Other,2022-02-27,Sport-Chek International
1,52.07,62.0,credit_card,female,Grocery Stores,2022-02-25,Food Basics
2,10.78,59.0,bank_account,male,Grocery Stores,2022-02-01,Metro
