In [1]:
# tab ddpm
ddpm_dir = '/mnt/data/sonia/ckpts/tab-ddpm'
import numpy as np
from typing import cast
from copy import deepcopy
from sklearn.preprocessing import LabelEncoder
import pickle 
# https://github.com/yandex-research/rtdl-num-embeddings/blob/main/bin/datasets.py#L64
def tabddpm(config, train, val, test, alldf, outpath_date, outpath_latest):
    assert (train.dtypes == val.dtypes).all()
    assert (train.dtypes == test.dtypes).all()
    assert (train.columns == val.columns).all()
    assert (train.columns == test.columns).all()
    train = deepcopy(train)
    val = deepcopy(val)
    test = deepcopy(test)
    
    if config['task'] == 'classification':
        label_encoder = LabelEncoder()
        label_encoder.fit(alldf[config['labs'][0]])
        train[config['labs'][0]] = label_encoder.transform(train[config['labs'][0]])
        val[config['labs'][0]] = label_encoder.transform(val[config['labs'][0]])
        test[config['labs'][0]] = label_encoder.transform(test[config['labs'][0]])

    def get_Xy(df, config):
        df = deepcopy(df)
        y = df.pop(config['labs'][0]).astype('int64')
        d = {'y': y}
        xnum = df.loc[:, config['nums']].values
        if xnum.shape[1] > 0:
            d['X_num'] = xnum
        xcat = df.loc[:,  config['ords']].values 
        if xcat.shape[1] > 0:
            d['X_cat'] = xcat
        return d
        
    traindict = get_Xy(train, config)
    trainidx = np.arange(0, len(train))
    valdict = get_Xy(val, config)
    validx = np.arange(len(train), len(train)+len(val))
    testdict = get_Xy(test, config)
    testidx = np.arange(len(train)+len(val), len(train)+len(val)+len(test))
    
    datedirname = '.'.join(config['creation_time'].split())
    task_type = 'regression'
    if config['task'] == 'classification' and len(label_encoder.classes_) == 2:
        task_type = 'binclass'
    elif config['task'] == 'classification':
        task_type = 'multiclass'
    info = {
        'name': config['dataset_name'],
        'id': datedirname,
        'task_type': task_type,
        'n_num_features': len(config['nums']),
        'n_cat_features': len(config['ords']),
        'train_size': len(train),
        'val_size': len(val),
        'test_size': len(test)
    }
    
    outpath_date_tddpm = os.path.join(outpath_date, 'tab-ddpm')
    outpath_latest_tddpm = os.path.join(outpath_latest, 'tab-ddpm')
    ddpm_data_dir = os.path.join(ddpm_dir, config['dataset_name'])
    for path in [outpath_date_tddpm, outpath_latest_tddpm, ]:
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, 'info.json'), 'w') as f:
            f.write(json.dumps(info, indent=4))
        if config['task'] == 'classification':
            with open(os.path.join(path, 'label_encoder.pkl'), 'wb') as file:
                pickle.dump(label_encoder, file)
        for name, npy in traindict.items():
            np.save(os.path.join(path, f'{name}_train.npy'), npy)
        for name, npy in valdict.items():
            np.save(os.path.join(path, f'{name}_val.npy'), npy)
        for name, npy in testdict.items():
            np.save(os.path.join(path, f'{name}_test.npy'), npy)
        np.save(os.path.join(path, 'idx_train.npy'), trainidx)
        np.save(os.path.join(path, 'idx_val.npy'), validx)
        np.save(os.path.join(path, 'idx_test.npy'), testidx)
        
    return traindict, valdict, testdict

# Sick

In [2]:
import openml
import pandas as pd
import datetime
import os
import json

dataset = openml.datasets.get_dataset('sick')
df, _, _, _ = dataset.get_data(dataset_format="dataframe")

config = {
    'dataset_name': 'sick',
    'task': 'classification',
    'raw_path': "openml.datasets.get_dataset('sick')",
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'cols': list(df.columns),
}

config['ords'] = ['sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
                  'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 
                  'psych', 'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured',
                  'TBG_measured', 'referral_source']
config['nums'] = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG', ]
config['labs'] = ['Class']

assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

# shuffle data
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        
ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

  dataset = openml.datasets.get_dataset('sick')


train (2829, 30) val (282, 30) test (661, 30)


# Adult

In [2]:
import pandas as pd
import datetime
import os
import json

config = {
    'dataset_name': 'adult',
    'raw_path': './adult.csv',
    'task': 'classification',
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'cols': ['age', 'class', 'financial-weight', 'education', 'years-education', 'marital-status', 'occupation', 'relationship', 
        'race', 'sex', 'gain-capital', 'loss-capital', 'hours-per-week', 'native-country', 'income'],
    'ords': ['class', 'education', 'marital-status', 'occupation', 'relationship',
        'race', 'sex', 'native-country'],
    'nums': ['age', "financial-weight", 'years-education', 'gain-capital', 'loss-capital', 'hours-per-week'],
    'labs': ['income']
}

# read in, rename columns
df = pd.read_csv(config['raw_path'])
df.columns = config['cols']

assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

# shuffle data
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        
ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

train (36631, 15) val (3663, 15) test (8548, 15)


# Diabetes

Same processing, but without get_dummies() as https://huggingface.co/datasets/imodels/diabetes-readmission

In [4]:
import pandas as pd
import datetime
import os
import json
from ucimlrepo import fetch_ucirepo 
import numpy as np

config = {
    'dataset_name': 'diabetes',
    'raw_path': 'fetch_ucirepo(id=296)["data"]["original"]',
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'task': 'classification',
}
# fetch dataset 
df = fetch_ucirepo(id=296)['data']['original']

#preprocessing
df['readmitted'] = df['readmitted'].replace({'NO': 'no', '>30': 'yes', '<30': 'yes'}) #target
df = df[df['gender'] != 'Unknown/Invalid']
df['age'] = df['age'].replace({"[70-80)":"70+",
                               "[60-70)":"[50-70)",
                               "[50-60)":"[50-70)",
                               "[80-90)":"70+",
                               "[40-50)":"[20-50)",
                               "[30-40)":"[20-50)",
                               "[90-100)":"70+",
                               "[20-30)":"[20-50)"})
df['admission_type_id'] = df['admission_type_id'].replace({1.0:"Emergency",
                                                           2.0:"Emergency",
                                                           3.0:"Elective",
                                                           4.0:"Newborn",
                                                           5.0:'?',
                                                           6.0:'?',
                                                           7.0:"Trauma Center",
                                                           8.0:'?'})
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(
    {1:"Discharged-Home",
     6:"Discharged-Home",
     8:"Discharged-Home",
     13:"Discharged-Home",
     19:"Discharged-Home",
     18:'?', 25:'?', 26:'?',
     2:"Other", 3:"Other", 4:"Other",
     5:"Other", 7:"Other", 9:"Other",
     10:"Other", 11:"Other", 12:"Other",
     14:"Other", 15:"Other", 16:"Other",
     17:"Other", 20:"Other", 21:"Other",
     22:"Other", 23:"Other", 24:"Other",
     27:"Other", 28:"Other", 29:"Other", 30:"Other"}
) 
df['admission_source_id'] = df['admission_source_id'].replace(
    {1:"Referral", 2:"Referral", 3:"Referral", 4:"Transfer",
     5:"Transfer", 6:"Transfer", 7:"Emergency", 8:"Other",
     9:"Other", 10:"Transfer", 11:"Other", 12:"Other",
     13:"Other", 14:"Other", 15:'?', 17:'?', 
     18:"Transfer", 19:"Other", 20:'?', 21:'?',
     22:"Transfer", 23:"Other", 24: "Other", 25:"Transfer",
     26: "Transfer"}
)
df['medical_specialty'] = df['medical_specialty'].replace(
    {"Orthopedics-Reconstructive": "Orthopedics",
     "Surgeon": "Surgery-General",
     "Surgery-Cardiovascular": "Surgery-Cardiovascular/Thoracic",
     "Surgery-Thoracic": "Surgery-Cardiovascular/Thoracic",
     "Pediatrics-Endocrinology": "Pediatrics",
     "Pediatrics-CriticalCare": "Pediatrics",
     "Pediatrics-Pulmonology": "Pediatrics",
     "Radiologist": "Radiology",
     "Oncology": "Hematology/Oncology",
     "Hematology": "Hematology/Oncology",
     "Gynecology": "Obstetrics/Gynecology",
     "Obstetrics": "Obstetrics/Gynecology"
     }
)
df['medical_specialty'] = df['medical_specialty'].replace(
    {spec: "Other" for spec in df['medical_specialty'].value_counts().index.values[15:]}
)
def map_diagnosis(data, cols):
    for col in cols:
        data.loc[(data[col].str.contains("V")) | (data[col].str.contains("E")), col] = -1
        data[col] = data[col].astype(np.float16)

    for col in cols:
        data["temp_diag"] = np.nan
        data.loc[(data[col]>=390) & (data[col]<=459) | (data[col]==785), "temp_diag"] = "Circulatory"
        data.loc[(data[col]>=460) & (data[col]<=519) | (data[col]==786), "temp_diag"] = "Respiratory"
        data.loc[(data[col]>=520) & (data[col]<=579) | (data[col]==787), "temp_diag"] = "Digestive"
        data.loc[(data[col]>=680) & (data[col]<=709) | (data[col]==782), "temp_diag"] = "Skin"
        data.loc[(data[col]>=240) & (data[col]<250) | (data[col]>251) & (data[col]<=279), "temp_diag"] = "Non-diabetes;endocrine/metabolic"
        data.loc[(data[col]>=250) & (data[col]<251), "temp_diag"] = "Diabetes"
        data.loc[(data[col]>=800) & (data[col]<=999), "temp_diag"] = "Injury"
        data.loc[(data[col]>=710) & (data[col]<=739), "temp_diag"] = "Musculoskeletal"
        data.loc[(data[col]>=580) & (data[col]<=629) | (data[col] == 788), "temp_diag"] = "Genitourinary"
        data.loc[(data[col]>=140) & (data[col]<=239), "temp_diag"] = "Neoplasms"
        data.loc[(data[col]>=290) & (data[col]<=319), "temp_diag"] = "Mental"
        data.loc[(data[col]>=1) & (data[col]<=139), "temp_diag"] = "Infectious"

        data["temp_diag"] = data["temp_diag"].fillna("Other")
        data[col] = data["temp_diag"]
        data = data.drop("temp_diag", axis=1)

    return data
df = map_diagnosis(df, ["diag_1","diag_2","diag_3"])
df['change'] = df['change'].replace({'Ch': 'yes', 'No': 'no'})
all_meds = df.columns[24:47]
keep_meds = all_meds.values[
    [(df[med].value_counts().shape[0] > 1) and (df[med].value_counts()['Steady'] > 30) for med in all_meds]
]
drop_meds = all_meds.values[~all_meds.isin(keep_meds)]
print('keep meds', keep_meds)

drop_columns = ['encounter_id', 'patient_nbr', 'weight', 'payer_code'] + drop_meds.tolist()
df = df.drop(drop_columns, axis=1)

# specify column types
config['max_col_length'] = 20
config['cols'] = list(df.columns)
config['ords'] = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
                  'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', ] + list(keep_meds) +\
                 ['change', 'diabetesMed']
config['nums'] = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
                  'number_emergency', 'number_inpatient', 'number_diagnoses']
config['labs'] = ['readmitted']
assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

df = df.fillna('?')
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        
ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

  df = pd.read_csv(data_url)
  data.loc[(data[col]>=390) & (data[col]<=459) | (data[col]==785), "temp_diag"] = "Circulatory"
  data.loc[(data[col]>=390) & (data[col]<=459) | (data[col]==785), "temp_diag"] = "Circulatory"
  data.loc[(data[col]>=390) & (data[col]<=459) | (data[col]==785), "temp_diag"] = "Circulatory"


keep meds ['metformin' 'repaglinide' 'nateglinide' 'chlorpropamide' 'glimepiride'
 'glipizide' 'glyburide' 'pioglitazone' 'rosiglitazone' 'acarbose'
 'miglitol' 'tolazamide' 'insulin' 'glyburide-metformin']
train (76322, 37) val (7632, 37) test (17809, 37)


# Diabetes NEW

In [3]:
import openml
import pandas as pd
import datetime
import os
import json

dataset = openml.datasets.get_dataset('diabetes')
df, _, _, _ = dataset.get_data(dataset_format="dataframe")

cols = ['pregnancies', 'glucose-plasma', 'blood-pressure', 'skin-thickness', 'insulin', 'BMI', 'pedigree', 'age', 'diagnosis']
ords = []
labs = ['diagnosis']
nums = ['pregnancies', 'glucose-plasma', 'blood-pressure', 'skin-thickness', 'insulin', 'BMI', 'pedigree', 'age']

df.columns = cols 
df['diagnosis'] = df['diagnosis'].map(lambda x: 'positive' if x=='tested_positive' else 'negative')

config = {
    'dataset_name': 'diabetes-new',
    'task': 'classification',
    'raw_path': "openml.datasets.get_dataset('diabetes')",
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'cols': cols,
    'ords': ords,
    'nums': nums,
    'labs': labs,
}
assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

# shuffle data
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        
ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

  dataset = openml.datasets.get_dataset('diabetes')


train (576, 9) val (57, 9) test (135, 9)


# CA Housing

In [6]:
import pandas as pd
import datetime
import os
import json

df = pd.read_csv("hf://datasets/leostelon/california-housing/housing.csv")

#rename cols so none start with same token
cols = ['longitude', 'latitude', 'age_median', 'rooms', 'bedrooms', 'population', 
        'households', 'income_median', 'value_median_house', 'ocean_proximity']
df.columns = cols
ints = ['age_median', 'rooms', 'bedrooms', 'population', 'households', 'value_median_house']
df = df.fillna('?')
def mapping(v):
    if v == '?': return '?'
    else: return int(v)
df[ints] = df[ints].map(mapping)
# df[df.isna()] = '?'
# df[ints] = df[ints].astype(int)

config = {
    'dataset_name': 'house',
    'raw_path': 'hf://datasets/leostelon/california-housing/housing.csv',
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'task': 'regression',
}

config['cols'] = list(df.columns)
config["ords"] = ["ocean_proximity"]
config["nums"] = ["longitude", "latitude", "age_median", "rooms", "bedrooms", "population", "households", "income_median"]
config["labs"] = ["value_median_house"]
assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        
ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

  from .autonotebook import tqdm as notebook_tqdm


train (15480, 10) val (1548, 10) test (3612, 10)


# CA-Housing NEW

In [4]:
import pandas as pd
import datetime
import os
import json
from sklearn.datasets import fetch_california_housing

df = fetch_california_housing(as_frame=True).frame

#rename cols so none start with same token
cols = ['income_median', 'age_median', 'rooms', 'bedrooms', 'population', 
        'occupancy', 'latitude', 'longitude', 'value_median_house']
df.columns = cols
ints = ['age_median', 'rooms', 'bedrooms', 'population', 'households', 'value_median_house']
df = df.fillna('?')

config = {
    'dataset_name': 'house-new',
    'raw_path': 'fetch_california_housing(as_frame=True).frame',
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'task': 'regression',
}

config['cols'] = list(df.columns)
config["ords"] = []
config["nums"] = ['income_median', 'age_median', 'rooms', 'bedrooms', 'population', 
        'occupancy', 'latitude', 'longitude',]
config["labs"] = ["value_median_house"]
assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        
ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

train (15480, 9) val (1548, 9) test (3612, 9)


# CA Housing New Tiny

In [1]:
import pandas as pd
import datetime
import os
import json
from sklearn.datasets import fetch_california_housing

df = fetch_california_housing(as_frame=True).frame

config = {
    'dataset_name': 'house-new-tiny',
    'raw_path': 'fetch_california_housing(as_frame=True).frame',
    'random_state': 42,
    'train_frac': 0.25,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'task': 'regression',
}

#rename cols so none start with same token
cols = ['income_median', 'age_median', 'rooms', 'bedrooms', 'population', 
        'occupancy', 'latitude', 'longitude', 'value_median_house']
df.columns = cols
ints = ['age_median', 'rooms', 'bedrooms', 'population', 'households', 'value_median_house']
df = df.fillna('?')



config['cols'] = ['income_median','age_median','rooms','bedrooms','occupancy','value_median_house']
config["ords"] = []
config["nums"] = ['income_median', 'age_median', 'rooms', 'bedrooms', 
        'occupancy',]
config["labs"] = ["value_median_house"]
assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])
df = df[config['cols']]

df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        
# ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

train (5160, 6) val (1548, 6) test (13932, 6)


# Rain

In [6]:
import openml
import pandas as pd
import datetime
import os
import json

dataset = openml.datasets.get_dataset('rainfall_bangladesh')
df, _, _, _ = dataset.get_data(dataset_format="dataframe")

config = {
    'dataset_name': 'rain',
    'task': 'regression',
    'raw_path': "openml.datasets.get_dataset('rainfall_bangladesh')",
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'cols': list(df.columns),
}
config['ords'] = ['Station', 'Month']
config['nums'] = ['Year']
config['labs'] = ['Rainfall']

assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

# shuffle data
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)

  dataset = openml.datasets.get_dataset('rainfall_bangladesh')


train (12566, 4) val (1256, 4) test (2933, 4)


# Abalone

In [3]:
import openml
import pandas as pd
import datetime
import os
import json

dataset = openml.datasets.get_dataset('abalone')
df, _, _, _ = dataset.get_data(dataset_format="dataframe")

config = {
    'dataset_name': 'abalone',
    'task': 'regression',
    'raw_path': "openml.datasets.get_dataset('abalone')",
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'cols': list(df.columns),
}
config['ords'] = ['Sex']
config['nums'] = ['Length',	'Diameter',	'Height',	'Whole_weight',	'Shucked_weight',	'Viscera_weight',	'Shell_weight']
config['labs'] = ['Class_number_of_rings']

assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

# shuffle data
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)

  dataset = openml.datasets.get_dataset('abalone')


train (3132, 9) val (313, 9) test (732, 9)


# Travel

In [1]:
import pandas as pd
import datetime
import os
import json

config = {
    'dataset_name': 'travel',
    'raw_path': 'https://www.kaggle.com/datasets/tejashvi14/tour-travels-customer-churn-prediction?resource=download',
    'task': 'classification',
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'cols': ['Age','Frequent-Flyer','Class','Services','Social-Media','Hotel','Target'],
    'ords': ['Frequent-Flyer','Class','Social-Media','Hotel'],
    'nums': ['Age','Services',],
    'labs': ['Target']
}

# read in, rename columns
df = pd.read_csv('travel.csv')
assert set(config['ords']+config['nums']+config['labs'])==set(config['cols']) 
assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])

df[config['ords']] = df[config['ords']].map(lambda x: '-'.join(x.split(' ')))

# shuffle data
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        
# ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

train (715, 7) val (71, 7) test (168, 7)


# cautab

In [2]:
%cd ~
!git clone https://github.com/TURuibo/CauTabBench.git 
%cd CauTabBench
!python process_sim_dataset.py --seed 100 --cm lg

import pandas  as pd 
import json 
import os
import datetime
from shutil import copyfile

config = {
    'dataset_name': 'cautab',
    'raw_path': 'python process_sim_dataset.py --seed 100 --cm lg',
    'random_state': 42,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'task': 'classification',
}
config['cols'] = ['V0','V1','V2','V3','V4','V5','V6','V7','V8','V9','target']
config['nums'] = ['V0','V1','V2','V3','V4','V5','V6','V7','V8','V9',]
config['labs'] = ['target']

df = pd.read_csv('~/CauTabBench/data/sim_lg/100/train.csv')
# split into train/val/test sets
n = len(df)
val_size = int(config['val_frac'] * n)
train = df.iloc[:-val_size, :]
val = df.iloc[-val_size:, :]
print('train', train.shape, 'val', val.shape,)
test = pd.read_csv('~/CauTabBench/data/sim_lg/100/train.csv')
alls = pd.concat([df, test], axis=0)

%cd ~/tabby/
# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    # copyfile('~/CauTabBench/data/sim_lg/100/test.csv', os.path.join(path, 'val.csv'))
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    alls.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)
        

/home/sonia
fatal: destination path 'CauTabBench' already exists and is not an empty directory.


  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


/home/sonia/CauTabBench
100
sim_lg (17117, 11) (1902, 11) (19019, 11)
Numerical (17117, 10)
Categorical (17117, 0)
Processing and Saving sim_lg Successfully!
sim_lg
Total 19019
Train 17117
Test 1902
Num 10
Cat 1
train (15834, 11) val (1283, 11)
/home/sonia/tabby


  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


# Shodan

In [5]:
import pandas as pd
import string
import pandas as pd 
import json 
import datetime
from copy import deepcopy
import os

raw = '/mnt/data/sonia/datasets/honeygan/data/processed/data.csv'
df = pd.read_csv(raw)

# df['os_generic'].value_counts('os_generic')
keep_os = ['mikrotik routeros', 'windows server', 'windows', 'ubuntu']
df = df[df['os_generic'].isin(keep_os)]
df = df[df['cpe_count']==1]

rename_os = {'mikrotik routeros': 'mikrotik', 
             'windows server': 'server-windows', 
             'windows': 'windows', 
             'synology diskstation manager (dsm)': 'synology', 
             'ubuntu': 'ubuntu'}
df['os_generic'] = df['os_generic'].map(lambda x: rename_os[x])
df.drop(['os_generic', 'cpe_count'], axis=1, inplace=True)
df.dropna(inplace=True)
df = df.map(lambda x: ''.join(str(x).split(' ')))
df['cpe'] = df['cpe'].map(lambda x: x[2:-2])
df = df.iloc[:30000,:]
df.columns = ['os', 'ip', 'port', 'module', 'cpe', 'category']

config = {
    'dataset_name': 'shodan',
    'task': 'classification',
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
    'cols': ['os','ip','port','module','cpe','category'],
    'ords': ['os','ip','module','cpe','category'],
    'nums': ['port',]
}

# shuffle data
df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)

# split into train/val/test sets
n = len(df)
train_size = int(config['train_frac'] * n)
val_size = int(config['val_frac'] * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

# write everything out
datedirname = '.'.join(config['creation_time'].split())
outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

for path in [outpath_date, outpath_latest]:
    os.makedirs(path, exist_ok=True)
    train.to_csv(os.path.join(path, 'train.csv'), index=False)
    val.to_csv(os.path.join(path, 'val.csv'), index=False)
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    # copyfile('~/CauTabBench/data/sim_lg/100/test.csv', os.path.join(path, 'val.csv'))
    test.to_csv(os.path.join(path, 'test.csv'), index=False)
    df.to_csv(os.path.join(path, 'all.csv'), index=False)
    with open(os.path.join(path, 'config.json'), 'w') as f:
        json.dump(config, f)

# Inria Benchmark

In [None]:
from datasets import load_dataset, get_dataset_config_names, load_dataset_builder
import string
import pandas as pd 
import json 
import datetime
from copy import deepcopy
import os

defaultconfig = {
    'random_state': 42,
    'train_frac': 0.75,
    'val_frac': 0.075,
    'creation_time': str(datetime.datetime.now()),
    'max_col_length': 20,
}

skip = ['clf_num_california', 'clf_num_Diabetes130US']

prepend = [str(num) for num in range(10)] + list(string.ascii_lowercase)
prepend = [e+'. ' for e in prepend] #36 items

names = get_dataset_config_names("inria-soda/tabular-benchmark")
# names = ['clf_cat_albert']
for name in names:
    if name in skip:
        continue
    
    ds = load_dataset_builder("inria-soda/tabular-benchmark", name)
    if len(ds.info.features) > len(prepend):
        skip.append(name)
        continue
    
    df = load_dataset("inria-soda/tabular-benchmark", name)['train'].to_pandas() # only has a train split
    ncols = len(df.columns)
    df.columns = [pre+col for pre,col in zip(prepend[:ncols], df.columns)]
    
    config = deepcopy(defaultconfig)
    config['dataset_name'] = name
    config['cols'] = list(df.columns)
    config['labs'] = [config['cols'][-1]] # last col is label
    if name.startswith('clf'):
        config['task'] = 'classification'
    elif name.startswith('reg'):
        config['task'] = 'regression'
    else:
        raise Exception('unknown task for', name)
    if name.startswith('clf_cat') or name.startswith('reg_cat'): # features are numerical or categorical
        # not best way, but just assume str cols are ordinal and non-str are numerical
        config['ords'] = list(df.dtypes[df.dtypes=='str'].index)
        config['nums'] = list(df.dtypes[df.dtypes!='str'].index)
    elif name.startswith('clf_num') or name.startswith('reg_num'): #features all numerical
        config['nums'] = list(df.columns)[:-1]
        config['ords'] = []
        
    df = df.fillna('?')
    df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)
    # split into train/val/test sets
    n = len(df)
    train_size = int(config['train_frac'] * n)
    val_size = int(config['val_frac'] * n)
    train = df.iloc[:train_size, :]
    val = df.iloc[train_size:train_size+val_size, :]
    test = df.iloc[train_size+val_size:, :]
    print(name, '\t\t\t\ttrain', train.shape, 'val', val.shape, 'test', test.shape)
    
    # write everything out
    datedirname = '.'.join(config['creation_time'].split())
    outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)
    outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')

    for path in [outpath_date, outpath_latest]:
        os.makedirs(path, exist_ok=True)
        train.to_csv(os.path.join(path, 'train.csv'), index=False)
        val.to_csv(os.path.join(path, 'val.csv'), index=False)
        test.to_csv(os.path.join(path, 'test.csv'), index=False)
        df.to_csv(os.path.join(path, 'all.csv'), index=False)
        with open(os.path.join(path, 'config.json'), 'w') as f:
            json.dump(config, f)
                    
    # ddpmout =tabddpm(config, train, val, test, df, outpath_date, outpath_latest)

print('skipped\n', skip)