# Diabetes

In [3]:
import openml
import pandas as pd
import datetime
import os
import json

# Fetch the OpenML 'diabetes' dataset as a pandas DataFrame; the three extra
# values returned by get_data are not used here.
dataset = openml.datasets.get_dataset('diabetes')
df, _, _, _ = dataset.get_data(dataset_format="dataframe")

# Column roles stored in the config. `ords` is empty for this dataset.
nums = ['pregnancies', 'glucose-plasma', 'blood-pressure', 'skin-thickness',
        'insulin', 'BMI', 'pedigree', 'age']
ords = []
labs = ['diagnosis']
cols = nums + labs  # full column order: the eight features, then the label

# Replace the raw column names positionally, then shorten the label values:
# 'tested_positive' -> 'positive', every other value -> 'negative'.
df.columns = cols
df['diagnosis'] = df['diagnosis'].map(
    lambda raw: 'positive' if raw == 'tested_positive' else 'negative'
)

config = dict(
    dataset_name='diabetes-new',
    task='classification',
    raw_path="openml.datasets.get_dataset('diabetes')",
    random_state=42,
    train_frac=0.75,
    val_frac=0.075,
    creation_time=str(datetime.datetime.now()),
    max_col_length=20,
    cols=cols,
    ords=ords,
    nums=nums,
    labs=labs,
)

# Every column must be assigned to exactly one role (no gaps, no duplicates).
role_cols = config['ords'] + config['nums'] + config['labs']
assert set(role_cols) == set(config['cols'])
assert len(role_cols) == len(config['cols'])

# shuffle data (seeded, so the split below is reproducible)
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets: the first train_frac of the rows, then the
# next val_frac, and whatever remains becomes the test set
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
val_end = train_size + val_size
train = df.iloc[:train_size]
val = df.iloc[train_size:val_end]
test = df.iloc[val_end:]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out, once under a timestamped directory and once under 'latest'
# NOTE(review): the timestamp-derived name contains ':' characters, which are
# not valid in Windows paths.
datedirname = '.'.join(config['creation_time'].split())
dataset_dir = os.path.join('./data/', config['dataset_name'])
outpath_date = os.path.join(dataset_dir, datedirname)
outpath_latest = os.path.join(dataset_dir, 'latest')

frames = {'train.csv': train, 'val.csv': val, 'test.csv': test, 'all.csv': df}
for path in (outpath_date, outpath_latest):
    os.makedirs(path, exist_ok=True)
    for filename, frame in frames.items():
        frame.to_csv(os.path.join(path, filename), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)

# NOTE(review): `tabddpm` is not defined in this cell; presumably it comes from
# an earlier cell or import -- confirm it exists on a fresh kernel.
ddpmout = tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

  dataset = openml.datasets.get_dataset('diabetes')


train (576, 9) val (57, 9) test (135, 9)


# CA-Housing

In [4]:
import pandas as pd
import datetime
import os
import json
from sklearn.datasets import fetch_california_housing

# Load the California housing data as one DataFrame (features plus target).
df = fetch_california_housing(as_frame=True).frame

# rename cols so none start with same token
cols = ['income_median', 'age_median', 'rooms', 'bedrooms', 'population',
        'occupancy', 'latitude', 'longitude', 'value_median_house']
df.columns = cols

# Every column here is either a numeric feature or the regression label, so a
# string placeholder such as '?' must never be written into the frame (it would
# turn the column into object dtype). Fail loudly on missing values instead.
assert not df.isna().any().any(), 'unexpected missing values in California housing data'

config = {
    'dataset_name': 'house-new',
    'raw_path': 'fetch_california_housing(as_frame=True).frame',
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'task': 'regression',
}

# Column roles: `ords` is empty, all eight features are numeric, and the
# median house value is the label.
config['cols'] = list(df.columns)
config["ords"] = []
config["nums"] = ['income_median', 'age_median', 'rooms', 'bedrooms', 'population',
                  'occupancy', 'latitude', 'longitude']
config["labs"] = ["value_median_house"]

# Every column must be assigned to exactly one role (no gaps, no duplicates).
assert set(config['ords'] + config['nums'] + config['labs']) == set(config['cols'])
assert len(config['ords']) + len(config['nums']) + len(config['labs']) == len(config['cols'])

# shuffle data (seeded, so the split below is reproducible)
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets: the first train_frac of the rows, then the
# next val_frac, and whatever remains becomes the test set
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size + val_size, :]
test = df.iloc[train_size + val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out, once under a timestamped directory and once under 'latest'
# NOTE(review): the timestamp-derived name contains ':' characters, which are
# not valid in Windows paths.
datedirname = '.'.join(config['creation_time'].split())
outpath_date = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)

# NOTE(review): `tabddpm` is not defined in this cell; presumably it comes from
# an earlier cell or import -- confirm it exists on a fresh kernel.
ddpmout = tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

train (15480, 9) val (1548, 9) test (3612, 9)


# Rain

In [6]:
import openml
import pandas as pd
import datetime
import os
import json

# Fetch the OpenML 'rainfall_bangladesh' dataset as a pandas DataFrame; the
# three extra values returned by get_data are not used here.
dataset = openml.datasets.get_dataset('rainfall_bangladesh')
df, _, _, _ = dataset.get_data(dataset_format="dataframe")

config = dict(
    dataset_name='rain',
    task='regression',
    raw_path="openml.datasets.get_dataset('rainfall_bangladesh')",
    random_state=42,
    train_frac=0.75,
    val_frac=0.075,
    creation_time=str(datetime.datetime.now()),
    max_col_length=20,
    cols=list(df.columns),
)
# Column roles, added after the base config so the key order stays
# cols -> ords -> nums -> labs.
config.update(
    ords=['Station', 'Month'],
    nums=['Year'],
    labs=['Rainfall'],
)

# Every column must be assigned to exactly one role (no gaps, no duplicates).
role_cols = config['ords'] + config['nums'] + config['labs']
assert set(role_cols) == set(config['cols'])
assert len(role_cols) == len(config['cols'])

# shuffle data (seeded, so the split below is reproducible)
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets: the first train_frac of the rows, then the
# next val_frac, and whatever remains becomes the test set
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
val_end = train_size + val_size
train = df.iloc[:train_size]
val = df.iloc[train_size:val_end]
test = df.iloc[val_end:]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out, once under a timestamped directory and once under 'latest'
# NOTE(review): the timestamp-derived name contains ':' characters, which are
# not valid in Windows paths.
datedirname = '.'.join(config['creation_time'].split())
dataset_dir = os.path.join('./data/', config['dataset_name'])
outpath_date = os.path.join(dataset_dir, datedirname)
outpath_latest = os.path.join(dataset_dir, 'latest')

frames = {'train.csv': train, 'val.csv': val, 'test.csv': test, 'all.csv': df}
for path in (outpath_date, outpath_latest):
    os.makedirs(path, exist_ok=True)
    for filename, frame in frames.items():
        frame.to_csv(os.path.join(path, filename), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)

  dataset = openml.datasets.get_dataset('rainfall_bangladesh')


train (12566, 4) val (1256, 4) test (2933, 4)


# Abalone

In [3]:
import openml
import pandas as pd
import datetime
import os
import json

# Fetch the OpenML 'abalone' dataset as a pandas DataFrame; the three extra
# values returned by get_data are not used here.
dataset = openml.datasets.get_dataset('abalone')
df, _, _, _ = dataset.get_data(dataset_format="dataframe")

config = dict(
    dataset_name='abalone',
    task='regression',
    raw_path="openml.datasets.get_dataset('abalone')",
    random_state=42,
    train_frac=0.75,
    val_frac=0.075,
    creation_time=str(datetime.datetime.now()),
    max_col_length=20,
    cols=list(df.columns),
)
# Column roles, added after the base config so the key order stays
# cols -> ords -> nums -> labs.
config.update(
    ords=['Sex'],
    nums=['Length', 'Diameter', 'Height', 'Whole_weight',
          'Shucked_weight', 'Viscera_weight', 'Shell_weight'],
    labs=['Class_number_of_rings'],
)

# Every column must be assigned to exactly one role (no gaps, no duplicates).
role_cols = config['ords'] + config['nums'] + config['labs']
assert set(role_cols) == set(config['cols'])
assert len(role_cols) == len(config['cols'])

# shuffle data (seeded, so the split below is reproducible)
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets: the first train_frac of the rows, then the
# next val_frac, and whatever remains becomes the test set
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
val_end = train_size + val_size
train = df.iloc[:train_size]
val = df.iloc[train_size:val_end]
test = df.iloc[val_end:]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out, once under a timestamped directory and once under 'latest'
# NOTE(review): the timestamp-derived name contains ':' characters, which are
# not valid in Windows paths.
datedirname = '.'.join(config['creation_time'].split())
dataset_dir = os.path.join('./data/', config['dataset_name'])
outpath_date = os.path.join(dataset_dir, datedirname)
outpath_latest = os.path.join(dataset_dir, 'latest')

frames = {'train.csv': train, 'val.csv': val, 'test.csv': test, 'all.csv': df}
for path in (outpath_date, outpath_latest):
    os.makedirs(path, exist_ok=True)
    for filename, frame in frames.items():
        frame.to_csv(os.path.join(path, filename), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)

  dataset = openml.datasets.get_dataset('abalone')


train (3132, 9) val (313, 9) test (732, 9)
