# PART 1: DATA GENERATOR

**Objective:** This notebook executes the necessary steps to strategically generate a set of fabricated data to be used down the project pipeline.

---

In [431]:
# Data Management
import pandas as pd

# Data Manipulation
import numpy as np
import datetime
from faker import Faker
from random import shuffle
from scipy.stats import truncnorm
from datetime import timedelta

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Utils
%matplotlib inline


In [432]:
fake = Faker()

## 1A: Clinic Information

Objective: Generate a dataset consisting of clinic names, locations, and distances.

---

In [433]:
# Latitude / Longitude / Shortest-Distance information was retrieved from Google Maps 

cities = ['denver', 'edgewater', 'wheatridge', 'rino', 'lakewood']
clinics_df = pd.DataFrame({
    'to_denver': [0, 5, 6.3, 1.9, 7.5],
    'to_edgewater': [5.1, 0, 2, 7.8, 4.3],
    'to_wheatridge': [6.3, 2, 0, 7.5, 12],
    'to_rino':[2, 7.7, 7.8, 0, 11],
    'to_lakewood':[8, 4.4, 4.8, 10.7, 0]},
    index=cities) 

clinics_df['lat'] = [39.73906432357836, 39.753954449845445, 39.76685732722651, 39.767327859566265, 39.70455155721396]
clinics_df['lon'] = [-104.98969659655802, -105.06778796142915, -105.08198265044479, -104.98113186098168, -105.0798829449297]

In [434]:
clinics_df

Unnamed: 0,to_denver,to_edgewater,to_wheatridge,to_rino,to_lakewood,lat,lon
denver,0.0,5.1,6.3,2.0,8.0,39.739064,-104.989697
edgewater,5.0,0.0,2.0,7.7,4.4,39.753954,-105.067788
wheatridge,6.3,2.0,0.0,7.8,4.8,39.766857,-105.081983
rino,1.9,7.8,7.5,0.0,10.7,39.767328,-104.981132
lakewood,7.5,4.3,12.0,11.0,0.0,39.704552,-105.079883


## 1B: Patient Records (past)

Objective: Generate a dataset consisting of past patient records including the location & date/time of visit.

---

In [None]:
# Set seed


In [448]:
ppd_denver = np.random.normal(0, 4, 100)
ppd_denver = ppd_denver + 60
ppd_denver = ppd_denver.astype(int)

ppd_edgewater = np.random.normal(0, 3, 100)
ppd_edgewater = ppd_edgewater + 30
ppd_edgewater = ppd_edgewater.astype(int)

ppd_wheatridge = np.random.normal(0, 3, 100)
ppd_wheatridge = ppd_wheatridge + 30
ppd_wheatridge = ppd_wheatridge.astype(int)

ppd_rino = np.random.normal(0, 3, 100)
ppd_rino = ppd_rino + 30
ppd_rino = ppd_rino.astype(int)

ppd_lakewood = np.random.normal(0, 3.5, 100)
ppd_lakewood = ppd_lakewood + 50
ppd_lakewood = ppd_lakewood.astype(int)

print(ppd_denver)
print(ppd_edgewater)
print(ppd_wheatridge)
print(ppd_rino)
print(ppd_lakewood)

[63 57 63 58 62 71 65 60 59 57 57 59 59 59 67 62 60 63 50 59 63 57 53 60
 53 53 62 58 63 59 63 64 62 64 53 58 61 59 52 60 63 56 60 65 57 58 49 63
 67 55 63 62 65 62 59 64 53 55 59 58 66 54 56 64 57 57 55 62 55 58 54 50
 62 60 60 60 59 64 62 62 60 55 58 64 62 57 59 65 61 64 57 60 61 60 56 62
 60 60 55 59]
[31 33 22 35 29 28 33 33 37 30 35 24 35 27 29 29 26 33 30 30 28 29 34 30
 34 25 33 33 29 23 26 25 27 30 23 32 35 28 29 31 27 30 29 29 30 33 27 30
 28 31 27 32 26 25 22 30 28 29 31 32 31 29 30 26 34 31 29 31 30 33 29 26
 32 30 29 32 30 31 28 28 31 30 29 24 28 30 27 33 31 26 31 31 29 28 27 31
 36 35 30 30]
[27 29 28 27 32 29 31 27 34 31 27 32 28 36 33 28 30 33 30 27 27 28 27 25
 29 31 31 24 38 27 28 24 28 31 30 27 27 30 34 28 26 34 34 28 23 33 26 28
 34 30 29 30 35 31 26 29 30 31 31 31 30 22 29 26 27 32 27 36 29 29 34 27
 27 30 27 28 26 27 28 29 27 32 31 27 29 32 32 32 26 30 32 27 27 33 34 28
 33 29 32 34]
[25 32 27 32 31 32 31 32 32 27 36 30 33 31 31 29 31 27 30 28 32 27 33 29
 23 32 31

In [None]:
for i in ppd_denver:
    dates = pd.date_range(datetime.date(2022,1,1), periods=100).tolist()
    dates = [i.date() for i in dates]
    dates = dates*N
    dates.sort()
    dates_all.extend(dates)


#### Date of Visit

In [441]:
dates_all = []
for N in [60, 30, 30, 30, 50]:
    dates = pd.date_range(datetime.date(2022,1,1), periods=100).tolist()
    dates = [i.date() for i in dates]
    dates = dates*N
    dates.sort()
    dates_all.extend(dates)
    
len(dates_all)

20000

#### Time of Visit

In [444]:
### WEEKDAY GENERATOR
def weekday(mu1, sigma1, mu2, sigma2, N):
    X1 = np.random.normal(mu1, sigma1, int(N/2))
    X2 = np.random.normal(mu2, sigma2, int(N/2))
    X = np.concatenate([X1,X2])
    return X

### WEEKEND TIMES GENERATOR
def weekend(mu, sigma, N):
    
    # Limit normal distribution to operating times (when clinic is open)
    mean, sd, low, upp = 13, 3, 8, 20
    X = truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)
    X = X.rvs(N)
    
    return X

### TIME GENERATOR
def generate_times(noise):
    # noise = np.random.normal(0, .1, 100)
    for i in range(100):
        # N = random.choice(noise)
        # Weekend
        if (i % 7 == 6) or (i % 7 == 0):    
            X_all.extend(weekend(13, 3, noise[i]))
        # Weekday
        else:                               
            X_all.extend(weekday(11, 1, 17, 1, noise[i]))
            
X_all = []
generate_times(noise_denver)   # DENVER
generate_times(noise_edgewater)  # EDGEWATER
generate_times(noise_wheatridge)  # WHEATRIDGE
generate_times(noise_rino)          # RINO
generate_times(noise_lakewood)  # LAKEWOOD

print(len(X_all))
# X_all

19634


In [445]:
def make_time(x):
    dec = x - int(x)
    hour = int(x)
    min = int(dec*60)
    sec = int(dec*60*60 % 60)
    return datetime.time(hour, min, sec).strftime('%X')

#### Patient IDs

In [435]:
pids = []
for pid in range(10000, 30000):
    pids.append(pid)

#### Patient Names

In [436]:
first_names = []
last_names = []

for i in range(10000, 30000):
    first = "first_"+str(i)
    last = "last_"+str(i)
    first_names.append(first)
    last_names.append(last)

#### Date of Births

In [437]:
dobs = []

# Ages < 18 - 10% - 2000
for _ in range(2000):
    dobs.append(fake.date_of_birth(minimum_age=0, maximum_age=17))

# Ages 18-30 - 20% - 4000
for _ in range(4000):
    dobs.append(fake.date_of_birth(minimum_age=18, maximum_age=30))

# Ages 31-40 - 20% - 4000
for _ in range(4000):
    dobs.append(fake.date_of_birth(minimum_age=31, maximum_age=40))

# Ages 41-50 - 15% - 3000
for _ in range(3000):
    dobs.append(fake.date_of_birth(minimum_age=41, maximum_age=50))

# Ages 51-60 - 15% - 3000
for _ in range(3000):
    dobs.append(fake.date_of_birth(minimum_age=51, maximum_age=60))

# Ages >60 - 20% - 4000
for _ in range(4000):
    dobs.append(fake.date_of_birth(minimum_age=61, maximum_age=85))


#### Reason for Visit

In [438]:
reasons = {
    'cold/flu':3000,
    'covid':3000,
    'stomach':3000,
    'cough/chest':3000,
    'injury':1000,
    'throat':1000,
    'back_pain':3000,
    'blood':1000,
    'eyes':1000,
    'ears':1000,
}

r = []
for k, v in reasons.items():
    for i in range(v):
        r.append(k)

shuffle(r)
len(r)
# r

20000

In [439]:
reason_times = {
    'cold/flu': 45,
    'covid': 30,
    'stomach': 60,
    'cough/chest': 60,
    'injury': 60,
    'throat': 30,
    'back_pain': 60,
    'blood': 45,
    'eyes': 30,
    'ears': 30,
}

def checkout(x):
    ckeckin = datetime.datetime.strptime(x[1], '%H:%M:%S')
    checkout = ckeckin + timedelta(minutes=reason_times[x[0]])
    return checkout.time()



#### Location of Visit

In [440]:
loc = {
    'denver': 6000,
    'edgewater': 3000,
    'wheatridge': 3000,
    'rino':3000,
    'lakewood': 5000}

l = []
for k, v in loc.items():
    for i in range(v):
        l.append(k)
print(len(l))

20000


In [446]:
# weekday mornings before 10am
# weekday lunch 12-2pm
# weekday evening 4pm-8pm

# weekends 11am-8pm, peak at 3pm

# For 20k patients, we have 100 days (~3months of data) - 1st 100 days of 2022
# 28 days are weekends (sat/sun)
# 72 days are weekdays

#### Compile patient information to create patient dataset:

In [447]:
patients = pd.DataFrame()

patients['pid'] = pids
patients.set_index('pid', inplace=True)
patients['first_name'] = first_names
patients['last_names'] = last_names
patients['birth_date'] = dobs
patients['reason'] = r
patients['location'] = l
patients['visit_date'] = dates_all
patients['checkin_time'] = X_all
patients['checkin_time'] = patients['checkin_time'].map(make_time)
patients = patients.sort_values(['location', 'visit_date', 'checkin_time'])
patients['checkout_time'] = patients[['reason', 'checkin_time']].apply(checkout, axis=1)
patients['checkin_time'] = patients['checkin_time'].apply(lambda x: datetime.datetime.strptime(x, '%H:%M:%S').time())
patients

ValueError: Length of values (19634) does not match length of index (20000)

In [None]:
def tracker(loc, date):

    chin = patients[(patients.location==loc) & (patients.visit_date==date)].checkin_time.tolist()
    chout = patients[(patients.location==loc) & (patients.visit_date==date)].checkout_time.tolist()

    rolling_ct = []

    for i in range(len(chin)):
        tracker = 1
        current_chin = chin[i]

        for e in range(i):
            if chout[e] > current_chin:
                tracker += 1

        rolling_ct.append(tracker)

    return rolling_ct

total_rolling_ct = []
for loc in patients.location.unique():
    for date in patients.visit_date.unique():
        total_rolling_ct.extend(tracker(loc, date))

patients['current_num_patients'] = total_rolling_ct
patients

In [None]:
patients['day'] = patients['visit_date'].copy().apply(lambda x: x.weekday())
patients['weekend'] = patients['day'].copy().replace({0:0, 1:0, 2:0, 3:0, 4:0, 5:1, 6:1})
patients.to_pickle('./patient_records.pickle')

In [None]:
# Old school way
# There should be 1 tech per 3 patients based on the peak number of patients that day
old_school = pd.DataFrame(patients.groupby(['location', 'visit_date']).max()['current_num_patients'])
old_school['num_techs'] = old_school['current_num_patients'].copy().apply(lambda x : round(x/3))
old_school['num_techs2'] = old_school['current_num_patients'].copy().apply(lambda x : round(x/4))
old_school

## 1C: Employee Records

Objective: Generate a dataset consisting employee names and IDs.

---

In [None]:
employees = pd.DataFrame(index=[i for i in range(10, 50)])
e_first_names = []
e_last_names = []
for i in range(10, 50):
    e_first_names.append('e_first_name_' + str(i))
    e_last_names.append('e_last_names' + str(i))
employees['e_first_names'] = e_first_names
employees['e_last_names'] = e_last_names

employees

## TEST VISUALS (FOR DEBUG PURPOSES)

In [None]:
fig = px.histogram(patients.birth_date)
fig.show()

In [None]:
fig = px.histogram(patients.reason)
fig.show()

In [None]:
fig = px.pie(patients, names='reason')
fig.show()