# Data Generator

This notebook creates random data for DBS customers and Capital Land mall outlets.

Generated Data:
- dbs.csv: customer identity details (full name, email ID) and transaction details at Capital Land malls (outlet and mall name)
- capital_land.csv: mall name and outlet name
  
Assumptions:
- Size of Data: 
    - DBS transactions: 100,000
    - individuals: 5,000
    - shops: 50
- Capital Land has 5 malls
- Each mall has 18 to 20 shops, chosen at random

Steps:
- Import libraries & environment
- Setup config file
- Define constants
- Complete Process: All stages are processed sequentially

Import libraries & environment

In [1]:
import random as r
import pandas as pd
import names
import yaml
import os

# load external libraries

# The helper module is imported from ../src/utils by temporarily switching the
# working directory to that folder.
current_directory = os.getcwd()
os.chdir('../src/utils')
try:
    import data_generator_functions as F
finally:
    # Always return to the notebook directory, even when the import fails:
    # the config file is opened with a path relative to it, and re-running
    # this cell from the wrong directory would break the relative chdir above.
    os.chdir(current_directory)

Setup config file

In [5]:
# Build the path from its components so it resolves on every platform; a
# literal backslash is only a path separator on Windows.
config_path = os.path.join('..', 'config', 'config.yml')

with open(config_path, 'r', encoding='utf-8') as yml:
    # SafeLoader restricts parsing to plain YAML types.
    config = yaml.load(yml, Loader=yaml.SafeLoader)

Define constants

In [6]:
# constants -- all read from the 'Data Generator' section of the config file

_generator_cfg = config['Data Generator']

n_dbs = _generator_cfg['n_dbs']
n_ind = _generator_cfg['n_ind']
n_shopsPerMallLow = _generator_cfg['n_shops_per_mall_low']
n_shopsPerMallHigh = _generator_cfg['n_shops_per_mall_high']
ls_shopName = _generator_cfg['ls_shopName']
ls_mallName = _generator_cfg['ls_mallName']

# seeds -- fixed for the name selection
seed_firstName, seed_lastName = 101, 102

# seeds -- deliberately left as None for the shop allocation:
#   seed_shopPerMall: a fixed seed would give a fixed number of shops in each mall
#   seed_shopInMall:  a fixed seed would put the same shops in each mall
seed_shopPerMall = None
seed_shopInMall = None

KeyError: 'n_malls'

Complete Process

In [4]:
# individual data
# names.FILES is the dictionary with the file locations of the random names
ls_firstName, ls_lastName = F.individual_data(dict_name_files=names.FILES)

# NOTE(review): n_malls, n_shops and capital_land were referenced by this cell
# but never defined anywhere in the notebook (NameError on a fresh kernel).
# The counts are derived here from the configured name lists -- confirm this is
# what data_generator_functions expects.
n_malls = len(ls_mallName)
n_shops = len(ls_shopName)
# NOTE(review): the return value of F.shop_data is not used, so it presumably
# fills this container in place. An empty dict is assumed -- confirm the
# expected type against data_generator_functions.
capital_land = {}

# shop data
F.shop_data(ls_shopName, ls_mallName, capital_land,
            seed_shopInMall, seed_shopPerMall,
            n_malls, n_shopsPerMallLow, n_shopsPerMallHigh)

# select individuals: n_ind individuals are selected for n_dbs transactions
# (5,000 individuals for 100,000 transactions per the assumptions listed at the top)
first_name, last_name, email = F.select_individuals(ls_firstName, ls_lastName,
                                                    seed_firstName, seed_lastName,
                                                    n_dbs, n_ind)

# generates DBS transactions for the selected individuals
transaction_outlet, transaction_mall = F.transaction_data(ls_mallName, capital_land,
                                                          n_dbs, n_malls, n_shops)

# convert the generated data into data frames
df_dbs, df_capital_land = F.fill_data(first_name, last_name, email,
                                      transaction_outlet, transaction_mall,
                                      capital_land)


5494 88799


'\n# shop data\nF.shop_data(shop_name, mall_name, capital_land, \n            seed_shop_in_mall, seed_shop_per_mall, \n            n_malls, n_shops_per_mall_low,n_shops_per_mall_high)\n\n# select individuals, in this case we are selecting 5000 individuals for 100,000 transactions\nfirst_name, last_name, email = F.select_individuals(data_first_name , data_last_name,\n                                                    seed_first_name, seed_last_name,\n                                                    n_dbs, n_ind)\n\n# generates DBS transaction for selected individuals\ntransaction_outlet, transaction_mall = F.transaction_data(mall_name, capital_land, n_dbs, n_malls, n_shops)\n\n# convert data in data frame\ndf_dbs, df_capital_land = F.fill_data(first_name, last_name, email, transaction_outlet, transaction_mall, capital_land) # fills data frames'

In [None]:
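# Export is currently disabled: uncomment the two lines below to write the data
# frames as CSV files into the ..\data folder (file names come from the config).
#df_dbs.to_csv('..\\data\\'+config['File_Transaction'], index = False)
#df_capital_land.to_csv('..\\data\\'+config['File_Mall'], index = False)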