In [16]:
import numpy as np
import pandas as pd

# Build a simulated cohort and label each row with a rule-based PNDM flag.
#
# All draws use NumPy's legacy global RNG, seeded below. The draws consume the
# RNG stream in order, so the call order must stay unchanged for the generated
# columns to remain reproducible.
np.random.seed(123)

# generate simulated data for age at diagnosis and HbA1c levels
n_samples = 100000
# randint's upper bound is exclusive, so ages fall in 1..11
# NOTE(review): confirm that 12 was not meant to be included.
age = np.random.randint(1, 12, size=n_samples)
hba1c = np.random.normal(loc=7, scale=1, size=n_samples)

# generate simulated data for genetic information and family history
genetic_info = np.random.choice(['Mutation', 'No mutation'], size=n_samples, p=[0.2, 0.8])
family_history = np.random.choice(['Yes', 'No'], size=n_samples, p=[0.1, 0.9])

# generate simulated data for clinical features and laboratory data
birth_weight = np.random.normal(loc=2.8, scale=0.5, size=n_samples)
developmental_delay = np.random.choice(['Yes', 'No'], size=n_samples, p=[0.15, 0.85])
# A normal draw with mean 5 and sd 2 is negative for a small fraction of rows;
# an insulin level cannot be negative, so floor the simulated values at 0.
insulin_level = np.maximum(np.random.normal(loc=5, scale=2, size=n_samples), 0)

# create a dataframe to store the data
data = pd.DataFrame({
    'Age': age,
    'HbA1c': hba1c,
    'Genetic Info': genetic_info,
    'Family History': family_history,
    'Birth Weight': birth_weight,
    'Developmental Delay': developmental_delay,
    'Insulin Level': insulin_level,
})

# cut-offs used by the labelling rules (all comparisons are strict)
threshold_age = 5
threshold_hba1c = 9
threshold_birth_weight = 3
threshold_insulin = 6

# Rule 1: young age AND a mutation AND low birth weight.
genetic_rule = (
    (data['Age'] < threshold_age)
    & (data['Genetic Info'] == 'Mutation')
    & (data['Birth Weight'] < threshold_birth_weight)
)
# Rule 2: high HbA1c AND family history AND developmental delay AND high insulin.
clinical_rule = (
    (data['HbA1c'] > threshold_hba1c)
    & (data['Family History'] == 'Yes')
    & (data['Developmental Delay'] == 'Yes')
    & (data['Insulin Level'] > threshold_insulin)
)

# add a column for PNDM diagnosis: 1 when either rule matches, otherwise 0
data['PNDM'] = np.where(genetic_rule | clinical_rule, 1, 0)

# display the first few rows of the data
data.head()

Unnamed: 0,Age,HbA1c,Genetic Info,Family History,Birth Weight,Developmental Delay,Insulin Level,PNDM
0,3,4.840927,Mutation,Yes,3.128268,No,5.585608,0
1,3,5.694742,Mutation,No,2.059342,No,3.141359,1
2,7,6.843595,No mutation,No,2.718667,Yes,4.639313,0
3,2,6.480186,No mutation,No,3.087017,No,6.217178,0
4,4,7.052861,Mutation,No,3.481472,No,3.368892,0


In [14]:
# Persist the simulated frame to disk; the row index is not written to the file.
data.to_csv(path_or_buf='PNDB.csv', index=False)

In [15]:
# Read the saved CSV back from disk; as the cell's last expression it is displayed.
pd.read_csv(filepath_or_buffer='PNDB.csv')

Unnamed: 0,Age,HbA1c,Genetic Info,Family History,Birth Weight,Developmental Delay,Insulin Level,PNDM
0,3,4.840927,Mutation,Yes,3.128268,No,5.585608,0
1,3,5.694742,Mutation,No,2.059342,No,3.141359,1
2,7,6.843595,No mutation,No,2.718667,Yes,4.639313,0
3,2,6.480186,No mutation,No,3.087017,No,6.217178,0
4,4,7.052861,Mutation,No,3.481472,No,3.368892,0
...,...,...,...,...,...,...,...,...
99995,9,7.810662,No mutation,Yes,2.031233,No,3.059450,0
99996,11,7.163781,Mutation,Yes,3.003822,No,2.891719,0
99997,10,5.521820,No mutation,No,3.798452,Yes,6.015360,0
99998,10,7.539210,No mutation,No,3.117501,No,4.128981,0


In [1]:
# !pip install barnum
# !pip install randomname

In [2]:
# dir(barnum),dir(rm)
# barnum.create_city_state_zip(zip_code=None)[1]  # get the city name
# print('adjective categories:', randomname.ADJECTIVES)  # print the available adjective categories
# print('noun categories:', randomname.NOUNS)  # print the available noun categories

In [3]:
# import randomname

# # generate name using all categories
# # name = randomname.get_name()
# # or specify a subset of the categories
# name = randomname.get_name( noun=( 'wine'))
# # # or - you can take a bit more liberty with the categories you specify
# # name = randomname.generate(
# #     'v/fire', 'adj/music_theory', ('n/cats', 'n/food'))

# # # these contain the available groups
# # print('adjective categories:', randomname.ADJECTIVES)
# # print('noun categories:', randomname.NOUNS)

# print(name)

In [4]:
import barnum
import randomname as rm

import pandas as pd
import numpy as np

In [5]:
# Number of synthetic sales records; every column below has this many entries.
n_sales_rows = 10000

# Column values are generated in key order: Sales, Product, District, Date.
x = {
    # random integers with 500 inclusive and 2000 exclusive
    "Sales": np.random.randint(500, 2000, size=n_sales_rows),
    # random names restricted to the 'wine' noun category
    "Product": [rm.get_name(noun='wine') for _ in range(n_sales_rows)],
    # element [1] of the returned value is used as the district (the city name)
    'District': [
        barnum.create_city_state_zip(zip_code=None)[1]
        for _ in range(n_sales_rows)
    ],
    'Date': [barnum.create_date() for _ in range(n_sales_rows)],
}

# Assemble the columns into a frame and preview the first rows.
df = pd.DataFrame(x)
df.head()

Unnamed: 0,Sales,Product,District,Date
0,514,aged-chinon,Branchville,2027-05-23 21:34:58.640361
1,1693,spry-champagne,Charleston,2024-03-26 21:34:58.640361
2,1026,local-marsanne,Boise,2023-10-02 21:34:58.640361
3,763,rounded-chardonnay,Pensacola,2027-07-21 21:34:58.640361
4,950,taxonomic-muscat,Round Rock,2024-08-02 21:34:58.640361


In [6]:
# Index the sales frame by its 'Date' values, remove the now-redundant
# 'Date' column, and order the rows by that index.
df2 = (
    df
    .set_index(pd.DatetimeIndex(df['Date']))
    .drop(columns='Date')
    .sort_index()
)
df2.head()

Unnamed: 0_level_0,Sales,Product,District
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-10-11 21:34:58.671609,571,pointed-arneis,Belton
2022-10-11 21:34:58.671609,1840,dichotomic-muscat,Palatine
2022-10-12 21:34:58.671609,1124,corn-burgundy,Wren
2022-10-14 21:34:58.655987,1854,achromatic-auslese,Bakers Mills
2022-10-14 21:34:58.655987,1122,optimal-chianti,Sanford
