This is a test file running on local machine with only 100 records of data. The full dataset is run on Google Colab.

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sdgym.synthesizers import TVAESynthesizer
import datetime
import pickle
import random

In [2]:
# load data
df = pd.read_csv('data/cc_data.csv')
df = df[:1000]

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# remove unnecessary columns
col2remove = ['SIC Code', 'Return Amount', 'Reward Amount', 'Transaction ID', 
              'Account Identifier', 'Account Name', 'Account Number', 'Bank Name', 
              'Aggregator Name', 'Consumer ID', 'Consumer Created Date',
              'Transaction String', 'Posted Date', 'Data Creation Date', 
              'Consumer Postal Code', 'Consumer City Name','Ethnicity']
df.drop(col2remove, axis = 1, inplace = True, errors='ignore') # errors option make the columns drop only when exists

In [4]:
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 1000


['Account Type',
 'Consumer Gender',
 'Consumer Birth Year',
 'Transaction Type',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date']

In [5]:
# Only keep `purchase` rows for `Transaction Type`, and then remove `Trsansaction Type`
if 'Transaction Type' in df.columns:
    df = df[df['Transaction Type'] == 'purchase']
    df.drop('Transaction Type', axis = 1, inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 975


['Account Type',
 'Consumer Gender',
 'Consumer Birth Year',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date']

In [6]:
# calculate consumer age, any birth year after 2020 is converted to null, and then remove `Consumer Birth Year` column
if 'Consumer Birth Year' in df.columns:
    df['Age'] = df['Consumer Birth Year'].apply(lambda x: 2020 - int(x) if int(x) < 2020 else None)
    df.drop('Consumer Birth Year', axis = 1, inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)  

Row # of the table: 975


['Account Type',
 'Consumer Gender',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date',
 'Age']

In [7]:
# convert `N\A` in `Transation date` into null
df['Transaction Date'].replace({"N\A":None}, inplace=True)
# convert `both` in `Consumer Gender` into null, only keep male and female
df['Consumer Gender'].replace({'both':None}, inplace=True)
# convert `investment_account` and `loans` in `Account Type` into null, only keep bank_account and credit_card
df['Account Type'].replace({'investment_account':None,'loans':None},inplace=True)

In [8]:
# check missing values 
missing_df = df.isnull().sum().reset_index()
missing_df.columns = ['variable', 'missing counts']
missing_df['missing per (%)'] = (missing_df['missing counts'])/df.shape[0]*100
missing_df.sort_values('missing per (%)',ascending = False).reset_index(drop = True)

Unnamed: 0,variable,missing counts,missing per (%)
0,Consumer Gender,5,0.512821
1,Age,3,0.307692
2,Transaction Date,2,0.205128
3,Account Type,0,0.0
4,Normalized Retailer,0,0.0
5,SIC Description,0,0.0
6,Purchase Amount,0,0.0


In [9]:
# remove missing values above
df.dropna(inplace = True)
print("Row # of the table: %d" % len(df.index))
list(df.columns)

Row # of the table: 965


['Account Type',
 'Consumer Gender',
 'Normalized Retailer',
 'SIC Description',
 'Purchase Amount',
 'Transaction Date',
 'Age']

In [10]:
# convert ‘Transaction Date’ into day_of_week (Mon/Tue.) and period_of_month (start, mid and end).
if 'Transaction Date' in df.columns:
    df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
    df['day_of_week'] = df['Transaction Date'].dt.day_name()
    df['day_of_month'] = df['Transaction Date'].dt.day
    df['period_of_month'] = df.apply(lambda x: 'start' if x.day_of_month <= 10 else 'mid' if x.day_of_month <=20 else 'end', axis = 1)
    df.drop(['Transaction Date','day_of_month'], axis = 1, inplace = True)

In [11]:
df.head()

Unnamed: 0,Account Type,Consumer Gender,Normalized Retailer,SIC Description,Purchase Amount,Age,day_of_week,period_of_month
0,credit_card,male,Red Robin,Eating Places,15.52,22.0,Monday,start
1,bank_account,male,California Thai,Eating Places,11.29,30.0,Monday,end
2,credit_card,male,Petro-Canada,Gasoline Service Stations,10.78,34.0,Monday,end
3,bank_account,female,The Beer Store,Liquor Stores,88.9,43.0,Monday,start
4,bank_account,male,Intermarche,Grocery Stores,23.54,62.0,Sunday,start


In [12]:
# 'SIC Description' (114) - only keep top N and group the rest into `other`
N = 9
def viewSICCounts(df,col_name):
    df_pivot = df.groupby(by = col_name).size().reset_index(name='Counts')
    df_pivot['Per (%)'] = (df_pivot['Counts'])/df.shape[0]*100
    df_pivot.sort_values(by = 'Counts',ascending = False,inplace = True)
    return df_pivot
    
df_pivot = viewSICCounts(df,'SIC Description')
list2keep = list(df_pivot.nlargest(N, 'Counts')['SIC Description'])
print("SIC to keep: ", list2keep)

SIC to keep:  ['Eating Places', 'Grocery Stores', 'Gasoline Service Stations', 'Drug Stores and Proprietary Stores', 'Book Stores', 'Family Clothing Stores', 'Taxicabs', 'Radiotelephone Communications', 'Communications Services, Not Elsewhere Classified']


In [13]:
df['SIC Description'] = df['SIC Description'].apply(lambda x: x if x in list2keep else 'Other')
df_pivot.head(N+1)

Unnamed: 0,SIC Description,Counts,Per (%)
12,Eating Places,276,28.601036
17,Grocery Stores,189,19.585492
15,Gasoline Service Stations,108,11.19171
11,Drug Stores and Proprietary Stores,47,4.870466
1,Book Stores,44,4.559585
13,Family Clothing Stores,31,3.212435
40,Taxicabs,24,2.487047
35,Radiotelephone Communications,22,2.279793
4,"Communications Services, Not Elsewhere Classified",21,2.176166
24,Miscellaneous Food Stores,21,2.176166


In [14]:
# 'Normalized Retailer' (2449) - 20 dimensions embedding
model = Word2Vec.load('models/perSICperPerson.model')

# remove records with minority retailers (the dictionary only keep retailer that appears at least 5 times)
#df2plot_topN = df2plot[df2plot['SIC'].isin(list2plot)]
df = df[df['Normalized Retailer'].isin(list(model.wv.vocab))]

retailerVec = model.wv[df['Normalized Retailer']]
print(retailerVec.shape)

(965, 20)


In [15]:
# convert retailer vector array into dataframe
df_retailerVec = pd.DataFrame(retailerVec, columns=["retailerVec_%02d" % x for x in range(1,21)]) 
print(df_retailerVec.shape)
df_retailerVec.head()

(965, 20)


Unnamed: 0,retailerVec_01,retailerVec_02,retailerVec_03,retailerVec_04,retailerVec_05,retailerVec_06,retailerVec_07,retailerVec_08,retailerVec_09,retailerVec_10,retailerVec_11,retailerVec_12,retailerVec_13,retailerVec_14,retailerVec_15,retailerVec_16,retailerVec_17,retailerVec_18,retailerVec_19,retailerVec_20
0,0.2078,-0.27456,-0.350884,-1.0994,0.518235,0.750903,-0.509509,0.389361,0.442397,0.117832,0.700974,1.109763,0.026182,-1.066037,0.050293,0.005771,-0.190992,-0.131103,-0.969268,0.692632
1,0.357645,-0.446232,0.187722,-0.985323,-0.203321,0.599157,-0.409267,-0.037081,-0.133219,1.5468,-0.140737,0.27713,0.908329,-0.156879,-0.579887,1.10559,0.594025,-0.154977,-0.479236,0.619747
2,-1.060268,0.038343,0.020715,-0.924681,1.332455,1.47236,-0.160993,0.042216,-1.751077,-0.167724,0.129194,-0.190291,-1.715097,-0.744412,-0.578674,2.006247,-0.022834,0.682452,-2.555646,-1.052045
3,-2.5299,-1.939256,2.024107,-0.547954,0.5095,0.067465,-0.182666,0.026708,-0.85606,2.101601,-0.83034,-1.716847,-0.924528,-2.19206,-0.568949,-0.394575,0.77141,3.295305,-2.517382,-0.529754
4,-0.606716,0.809018,0.913862,-0.044498,0.056555,0.068848,1.614883,0.032399,-1.851361,-0.825765,-0.48682,-0.150212,-0.089762,-1.558538,1.121333,1.728101,0.056044,0.206816,-1.090035,1.058926


In [16]:
# one hot encoding for categorical columns except `Normalized Retailer`
df_dummy = df.copy()
df_dummy.drop('Normalized Retailer', axis = 1, inplace = True, errors='ignore')
df_dummy = pd.get_dummies(df_dummy)
print(df_dummy.shape)
df_dummy.head()

(965, 26)


Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,...,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,period_of_month_end,period_of_month_mid,period_of_month_start
0,15.52,22.0,0,1,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
1,11.29,30.0,1,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
2,10.78,34.0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,88.9,43.0,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,23.54,62.0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [17]:
# concatenate df_dummy and df_retailerVec
df_dummy.reset_index(inplace=True,drop=True)
df_retailerVec.reset_index(inplace=True,drop=True)
df_input = pd.concat([df_dummy, df_retailerVec], axis = 1, sort = False, ignore_index = False)

In [18]:
#df_input.to_csv('data/cc_data_input.csv')

In [19]:
#df_input = df_input.sample(n = 500)
print(df_input.shape)
df_input.head()

(965, 46)


Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,...,retailerVec_11,retailerVec_12,retailerVec_13,retailerVec_14,retailerVec_15,retailerVec_16,retailerVec_17,retailerVec_18,retailerVec_19,retailerVec_20
0,15.52,22.0,0,1,0,1,0,0,0,1,...,0.700974,1.109763,0.026182,-1.066037,0.050293,0.005771,-0.190992,-0.131103,-0.969268,0.692632
1,11.29,30.0,1,0,0,1,0,0,0,1,...,-0.140737,0.27713,0.908329,-0.156879,-0.579887,1.10559,0.594025,-0.154977,-0.479236,0.619747
2,10.78,34.0,0,1,0,1,0,0,0,0,...,0.129194,-0.190291,-1.715097,-0.744412,-0.578674,2.006247,-0.022834,0.682452,-2.555646,-1.052045
3,88.9,43.0,1,0,1,0,0,0,0,0,...,-0.83034,-1.716847,-0.924528,-2.19206,-0.568949,-0.394575,0.77141,3.295305,-2.517382,-0.529754
4,23.54,62.0,1,0,0,1,0,0,0,0,...,-0.48682,-0.150212,-0.089762,-1.558538,1.121333,1.728101,0.056044,0.206816,-1.090035,1.058926


In [20]:
df_input.columns

Index(['Purchase Amount', 'Age', 'Account Type_bank_account',
       'Account Type_credit_card', 'Consumer Gender_female',
       'Consumer Gender_male', 'SIC Description_Book Stores',
       'SIC Description_Communications Services, Not Elsewhere Classified',
       'SIC Description_Drug Stores and Proprietary Stores',
       'SIC Description_Eating Places',
       'SIC Description_Family Clothing Stores',
       'SIC Description_Gasoline Service Stations',
       'SIC Description_Grocery Stores', 'SIC Description_Other',
       'SIC Description_Radiotelephone Communications',
       'SIC Description_Taxicabs', 'day_of_week_Friday', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday', 'period_of_month_end',
       'period_of_month_mid', 'period_of_month_start', 'retailerVec_01',
       'retailerVec_02', 'retailerVec_03', 'retailerVec_04', 'retailerVec_05',
       'retailerVec_06', 'ret

In [21]:
# convert pd frame to np array and indicate categorical and oridinal columns
data = df_input.to_numpy()
categorical_columns = [x for x in range(2,26)]
ordinal_columns = [1]

In [22]:
print(categorical_columns)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


In [23]:
# train the synthesizer
start = datetime.datetime.now()

synthesizer = TVAESynthesizer()
synthesizer.fit(data, categorical_columns, ordinal_columns)

print("TVAE training time: " + str(datetime.datetime.now()-start))





TVAE training time: 0:00:31.939148


In [24]:
# save the synthesizer
def save_object(obj, filename):
    with open(filename, 'wb') as output:  # overwrite any existing file
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

#save_object(synthesizer, 'models/TVAE_synthesizer_test.pkl')

In [25]:
# load synthesizer from saved object
#with open('models/TVAE_synthesizer.pkl', 'rb') as input:
#    synthesizer = pickle.load(input)

In [26]:
# check out sample
sampled = synthesizer.sample(2)
np.set_printoptions(suppress = True, precision = 2)
print(sampled)

[[17.29 27.    1.    0.    1.    0.    0.    0.    0.    1.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.65 -0.36 -0.11 -0.47  0.04  0.53 -0.43 -0.27 -0.4   0.33
   0.94  0.24  0.72 -0.84 -0.14  0.16  0.42  0.31 -0.62  0.61]
 [19.89 30.    1.    0.    1.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.89  0.11  0.24 -0.13  0.27  0.71 -0.26 -0.25  0.97 -0.9
  -0.08 -0.56 -0.88 -0.75  0.56  0.3   0.23  0.67 -0.34  0.84]]


In [27]:
# convert sample back to readable categories
sample = data[0:3]
print(sample)

[[15.52 22.    0.    1.    0.    1.    0.    0.    0.    1.    0.    0.
   0.    0.    0.    0.    0.    1.    0.    0.    0.    0.    0.    0.
   0.    1.    0.21 -0.27 -0.35 -1.1   0.52  0.75 -0.51  0.39  0.44  0.12
   0.7   1.11  0.03 -1.07  0.05  0.01 -0.19 -0.13 -0.97  0.69]
 [11.29 30.    1.    0.    0.    1.    0.    0.    0.    1.    0.    0.
   0.    0.    0.    0.    0.    1.    0.    0.    0.    0.    0.    1.
   0.    0.    0.36 -0.45  0.19 -0.99 -0.2   0.6  -0.41 -0.04 -0.13  1.55
  -0.14  0.28  0.91 -0.16 -0.58  1.11  0.59 -0.15 -0.48  0.62]
 [10.78 34.    0.    1.    0.    1.    0.    0.    0.    0.    0.    1.
   0.    0.    0.    0.    0.    1.    0.    0.    0.    0.    0.    1.
   0.    0.   -1.06  0.04  0.02 -0.92  1.33  1.47 -0.16  0.04 -1.75 -0.17
   0.13 -0.19 -1.72 -0.74 -0.58  2.01 -0.02  0.68 -2.56 -1.05]]


In [64]:
df_sample = pd.DataFrame(sample, columns=list(df_input.columns)) 
print(df_sample.columns)
df_sample.head()

Index(['Purchase Amount', 'Age', 'Account Type_bank_account',
       'Account Type_credit_card', 'Consumer Gender_female',
       'Consumer Gender_male', 'SIC Description_Book Stores',
       'SIC Description_Communications Services, Not Elsewhere Classified',
       'SIC Description_Drug Stores and Proprietary Stores',
       'SIC Description_Eating Places',
       'SIC Description_Family Clothing Stores',
       'SIC Description_Gasoline Service Stations',
       'SIC Description_Grocery Stores', 'SIC Description_Other',
       'SIC Description_Radiotelephone Communications',
       'SIC Description_Taxicabs', 'day_of_week_Friday', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday', 'period_of_month_end',
       'period_of_month_mid', 'period_of_month_start', 'retailerVec_01',
       'retailerVec_02', 'retailerVec_03', 'retailerVec_04', 'retailerVec_05',
       'retailerVec_06', 'ret

Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,...,retailerVec_11,retailerVec_12,retailerVec_13,retailerVec_14,retailerVec_15,retailerVec_16,retailerVec_17,retailerVec_18,retailerVec_19,retailerVec_20
0,15.52,22.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.700974,1.109763,0.026182,-1.066037,0.050293,0.005771,-0.190992,-0.131103,-0.969268,0.692632
1,11.29,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,-0.140737,0.27713,0.908329,-0.156879,-0.579887,1.10559,0.594025,-0.154977,-0.479236,0.619747
2,10.78,34.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.129194,-0.190291,-1.715097,-0.744412,-0.578674,2.006247,-0.022834,0.682452,-2.555646,-1.052045


In [55]:
# return series of dummy variables with given column name
def reverse_dummy(df, col_name):
    # get index of columns that starts with col_name, for example, Gender_male, Gender_female for col_name = `Gender`
    idx = [i for i, s in enumerate(list(df.columns)) if col_name in s]
    tmp = df.iloc[:,idx]
    # convert selected dataframe to series (only works well when only one 1 in each row)
    df_output = pd.Series(tmp.columns[np.where(tmp!=0)[1]], name = col_name)
    # remove strings with col_name plus underscore
    df_output = df_output.map(lambda x: x.replace(col_name + '_',''))
    return df_output

In [61]:
df_num = df_sample.iloc[:,[0,1]]
df_account = reverse_dummy(df_sample,'Account Type')
df_gender = reverse_dummy(df_sample,'Consumer Gender')
df_SIC = reverse_dummy(df_sample,'SIC Description')
df_dw = reverse_dummy(df_sample,'day_of_week')
df_pm = reverse_dummy(df_sample,'period_of_month')

In [60]:
df_sample.loc[1,'Account Type_bank_account']= 0
df_sample

Unnamed: 0,Purchase Amount,Age,Account Type_bank_account,Account Type_credit_card,Consumer Gender_female,Consumer Gender_male,SIC Description_Book Stores,"SIC Description_Communications Services, Not Elsewhere Classified",SIC Description_Drug Stores and Proprietary Stores,SIC Description_Eating Places,...,retailerVec_11,retailerVec_12,retailerVec_13,retailerVec_14,retailerVec_15,retailerVec_16,retailerVec_17,retailerVec_18,retailerVec_19,retailerVec_20
0,15.52,22.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.700974,1.109763,0.026182,-1.066037,0.050293,0.005771,-0.190992,-0.131103,-0.969268,0.692632
1,11.29,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,-0.140737,0.27713,0.908329,-0.156879,-0.579887,1.10559,0.594025,-0.154977,-0.479236,0.619747
2,10.78,34.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.129194,-0.190291,-1.715097,-0.744412,-0.578674,2.006247,-0.022834,0.682452,-2.555646,-1.052045


In [62]:
df_account

0    credit_card
1    credit_card
Name: Account Type, dtype: object

In [63]:
df_reverse = pd.concat([df_num,df_account,df_gender,df_SIC,df_dw,df_pm], axis=1)
df_reverse.head()

Unnamed: 0,Purchase Amount,Age,Account Type,Consumer Gender,SIC Description,day_of_week,period_of_month
0,15.52,22.0,credit_card,male,Eating Places,Monday,start
1,11.29,30.0,credit_card,male,Eating Places,Monday,end
2,10.78,34.0,,male,Gasoline Service Stations,Monday,end


In [32]:
# given period_of_month (start, mid or end) and year (1989) and month (6), return index of the possible days
def return_day_index(period_of_month, year, month):
    if period_of_month == 'start':
        return list(range(1,11))
    elif period_of_month == 'mid':
        return list(range(11,21))
    else:
        return list(range(21, pd.Period(str(year) + '-' + str(month)).days_in_month + 1))

In [33]:
# function to convert day_of_week and period_of_month back to Date, given a year and a month
# Example:
# input: day_of_week = 'Monday'; period_of_month = 'start'; Y = 2020; M = 2
# output: 2020-02-03 00:00:00

def return_date(day_of_week, period_of_month, Y, M):
    # return list of days
    D = return_day_index(period_of_month, Y, M)

    tmp = pd.DataFrame({'year': [str(Y) for i in range(len(D))],
                        'month': [str(M) for i in range(len(D))],
                        'day': D})
    
    # create table with each row of year, month and day in given period
    date_period = pd.to_datetime(tmp[['year', 'month', 'day']])

    # locate index of which date is the given day_of_week
    idx_list = [i for i, s in enumerate(date_period.dt.strftime('%A')) if day_of_week in s]

    # randomly pick up one index, since it's possible one period has multiple given weekday (say Monday)
    idx = random.choice(idx_list)

    return date_period[idx]

In [34]:
Y = 2022; M = 2
df_reverse['Date'] = df_reverse.apply(lambda x: return_date(x.day_of_week, x.period_of_month, Y, M), axis = 1)
df_reverse.drop(['day_of_week','period_of_month'], axis = 1, inplace = True, errors = 'ignore')
df_reverse.head()

Unnamed: 0,Purchase Amount,Age,Account Type,Consumer Gender,SIC Description,Date
0,15.52,22.0,credit_card,male,Eating Places,2022-02-07
1,11.29,30.0,bank_account,male,Eating Places,2022-02-21
2,10.78,34.0,credit_card,male,Gasoline Service Stations,2022-02-21


In [35]:
# reverse retailerVec back to retailers
idx = [i for i, s in enumerate(list(df_sample.columns)) if 'retailerVec' in s]
df_retailerVec = df_sample.iloc[:,idx]
df_retailerVec.head()

Unnamed: 0,retailerVec_01,retailerVec_02,retailerVec_03,retailerVec_04,retailerVec_05,retailerVec_06,retailerVec_07,retailerVec_08,retailerVec_09,retailerVec_10,retailerVec_11,retailerVec_12,retailerVec_13,retailerVec_14,retailerVec_15,retailerVec_16,retailerVec_17,retailerVec_18,retailerVec_19,retailerVec_20
0,0.2078,-0.27456,-0.350884,-1.0994,0.518235,0.750903,-0.509509,0.389361,0.442397,0.117832,0.700974,1.109763,0.026182,-1.066037,0.050293,0.005771,-0.190992,-0.131103,-0.969268,0.692632
1,0.357645,-0.446232,0.187722,-0.985323,-0.203321,0.599157,-0.409267,-0.037081,-0.133219,1.5468,-0.140737,0.27713,0.908329,-0.156879,-0.579887,1.10559,0.594025,-0.154977,-0.479236,0.619747
2,-1.060268,0.038343,0.020715,-0.924681,1.332455,1.47236,-0.160993,0.042216,-1.751077,-0.167724,0.129194,-0.190291,-1.715097,-0.744412,-0.578674,2.006247,-0.022834,0.682452,-2.555646,-1.052045


In [36]:
## find out retailer with the most similar vector
#df_reverse['Retailer'] = df_retailerVec.apply(lambda x: model.similar_by_vector(x.to_numpy(),topn = 1)[0][0], axis = 1)
#df_reverse.head()

In [37]:
# load the retailer_map
with open('models/retailer_map.pkl', 'rb') as input:
    retailer_map = pickle.load(input)    
    
# only keep values in the model (more than 5 times appearance)
for key, value in retailer_map.items():
    retailer_map[key] = list(set(value) & set(model.wv.vocab))
    
# group other SIC (after top N) into other
other_list = []    
for key, value in retailer_map.items():
    if key not in list2keep:
        other_list = other_list + value

# delete other group key-values in retailer_map
retailer_map_grouped = {k: retailer_map[k] for k in list2keep}

# add other key-value pairs
retailer_map_grouped['Other'] = other_list

In [38]:
# put retailer vectors into one column
df_retailerVec['retailerVec']= df_retailerVec.values.tolist()
df_SIC_vector = pd.concat([df_SIC,df_retailerVec['retailerVec']],axis = 1)
df_SIC_vector.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,SIC Description,retailerVec
0,Eating Places,"[0.20780043303966522, -0.27456045150756836, -0..."
1,Eating Places,"[0.3576447069644928, -0.4462319314479828, 0.18..."
2,Gasoline Service Stations,"[-1.0602679252624512, 0.0383434072136879, 0.02..."


In [39]:
# find out retailer in the same SIC and also with the most similar vector
def return_retailer_SIC(model, retailer_map_grouped, sector, vector):
    word_list = retailer_map_grouped[sector]
    min_idx = model.wv.distances(vector, other_words = word_list).argmin()
    return word_list[min_idx]

In [40]:
df_reverse['Normalized Retailer'] = df_SIC_vector.apply(lambda x: return_retailer_SIC(model, retailer_map_grouped, x['SIC Description'],x['retailerVec']), axis = 1)
df_reverse.head()

Unnamed: 0,Purchase Amount,Age,Account Type,Consumer Gender,SIC Description,Date,Normalized Retailer
0,15.52,22.0,credit_card,male,Eating Places,2022-02-07,Red Robin
1,11.29,30.0,bank_account,male,Eating Places,2022-02-21,California Thai
2,10.78,34.0,credit_card,male,Gasoline Service Stations,2022-02-21,Petro-Canada
