In [None]:
import openml
import pandas as pd
import datetime
import os
import json
import transformers
from src.tabby import MHTabbyGPT2Config, MHTabbyGPT2
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Load real data

In [2]:
dataset = openml.datasets.get_dataset('diabetes')
df, _, _, _ = dataset.get_data(dataset_format="dataframe")

# rename and clean columns
cols = ['pregnancies', 'glucose-plasma', 'blood-pressure', 'skin-thickness', 'insulin', 'BMI', 'pedigree', 'age', 'diagnosis']
df.columns = cols 
df['diagnosis'] = df['diagnosis'].map(lambda x: 'positive' if x=='tested_positive' else 'negative')

# shuffle data
df = df.sample(frac=1, random_state=42, ignore_index=True)

# split into train/val/test sets used by paper
n = len(df)
train_size = int(0.75 * n)
val_size = int(0.075 * n)
train = df.iloc[:train_size, :]
val = df.iloc[train_size:train_size+val_size, :]
test = df.iloc[train_size+val_size:, :]
print('train', train.shape, 'val', val.shape, 'test', test.shape)

  dataset = openml.datasets.get_dataset('diabetes')


train (576, 9) val (57, 9) test (135, 9)


# Load model

In [3]:
config = MHTabbyGPT2Config.from_pretrained("sonicc/tabby-distilgpt2-diabetes")
model = MHTabbyGPT2.from_pretrained("sonicc/tabby-distilgpt2-diabetes")
tokenizer = transformers.AutoTokenizer.from_pretrained("sonicc/tabby-distilgpt2-diabetes")

In [4]:
column_names_tokens = tokenizer(list(train.columns), add_special_tokens=False).input_ids
token_heads = list(range( len(train.columns) ))
model.set_generation_mode(token_heads=token_heads, column_names_tokens=column_names_tokens)

# Perform Synthesis

In [5]:
n_samples = 5

outputs = []
for _ in range(n_samples):
    inputs = torch.full((1, 1), tokenizer.bos_token_id).to(model.device)
    toks = model.generate(inputs, do_sample=True, num_beams=1, max_length=10, pad_token_id=tokenizer.pad_token_id)[...,1:]
    outputs.append(tokenizer.batch_decode(toks)[0]) 
    
# parse the lines output by model
def parse_line(l):
    entries = l.split('<EOC>')[:-1] # remove newline at end
    words = [c.split(' ') for c in entries] #'name', 'is', 'value'
    d = dict()
    for c in words:
        if c[0] in cols and len(c) == 3 and c[0] not in d: # keep only first occurence
            d[c[0]] = c[2]
            
    if set(d.keys()) == set(cols):
        return d 
    else:
        return None

print('\n\n**Synthetic dataset:**')
dicts = [parse_line(out) for out in outputs]
pd.DataFrame(dicts)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.




**Synthetic dataset:**


Unnamed: 0,pregnancies,glucose-plasma,blood-pressure,skin-thickness,insulin,BMI,pedigree,age,diagnosis
0,2.0,74.0,0.0,0.0,0.0,0.0,0.102,22.0,negative
1,1.0,90.0,62.0,18.0,59.0,25.1,1.268,25.0,negative
2,1.0,93.0,70.0,31.0,0.0,30.4,0.315,23.0,negative
3,7.0,184.0,84.0,33.0,0.0,35.5,0.355,41.0,positive
4,2.0,88.0,58.0,26.0,16.0,28.4,0.766,22.0,negative
