# Dataset

*New dataset training checks*
- num_heads
- max_column_len
- dataset

What is cpe? https://www.acunetix.com/blog/articles/common-platform-enumeration-cpe-explained/

In [2]:
import pandas as pd
import numpy as np
import json
import datasets
np.random.seed(42)

# Shodan

In [None]:
datapath = '/mnt/data/sonia/datasets/honeygan/data/processed/data.csv'
df = pd.read_csv(datapath)
print(df.isna().sum())
df.head()

In [2]:
dfs = df[(~df.single_cpe.isna())]
dfs['port_str'] = dfs.apply(lambda x: str(x['port']), axis=1)
dfs = dfs[['os_generic', 'ip_str', 'port_str',]] #'ip_str'
dfs.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs['port_str'] = dfs.apply(lambda x: str(x['port']), axis=1)


Unnamed: 0,os_generic,ip_str,port_str
1,ubuntu,43.205.13.243,22
2,ubuntu,43.205.13.243,80
3,ubuntu,43.205.13.243,443
4,windows,206.233.189.205,80
6,windows,206.233.189.205,8081


In [3]:
dfs['port_str'].unique() #longest is 4 digits

array(['22', '80', '443', '8081', '4433', '5000', '5001', '5985', '8080',
       '8443', '25', '3306', '23', '1433', '1723', '3389', '8291', '8728',
       '21', '53'], dtype=object)

In [4]:
# dfs['port_str'] = dfs['port_str'].apply(lambda x: (4-len(x))*'0' + x)
dfs['port_str'].head()

1      22
2      80
3     443
4      80
6    8081
Name: port_str, dtype: object

In [5]:
dfs.nunique()

os_generic        10
ip_str        130900
port_str          20
dtype: int64

In [6]:
dfs['os_generic'].unique()

array(['ubuntu', 'windows', 'windows server', 'debian',
       'synology diskstation manager (dsm)', 'sonicwall sonicos', 'qts',
       'linux', 'mikrotik routeros', 'synology router manager (srm)'],
      dtype=object)

In [7]:
prompt = datasets.Dataset.from_dict({'prompt': ['a', ' at IP', ', port', '',]})
prompt[:]

{'prompt': ['a', ' at IP', ', port', '']}

In [8]:
data = datasets.Dataset.from_pandas(dfs)
data = data.remove_columns("__index_level_0__")
data[:5]

{'os_generic': ['ubuntu', 'ubuntu', 'ubuntu', 'windows', 'windows'],
 'ip_str': ['43.205.13.243',
  '43.205.13.243',
  '43.205.13.243',
  '206.233.189.205',
  '206.233.189.205'],
 'port_str': ['22', '80', '443', '80', '8081']}

In [9]:
prompt_len = sum([len(chunk) for chunk in prompt['prompt']])
prompt_len

13

In [10]:
data = data.map(lambda x: {'length': sum([len(col) for col in x.values()]) + prompt_len})

Map:   0%|          | 0/219434 [00:00<?, ? examples/s]

In [11]:
split = data.train_test_split(test_size=0.1)
trainval, test = split['train'], split['test']
split = trainval.train_test_split(test_size=0.1)
train, val = split['train'], split['test']
print(len(train), len(val), len(test), len(train)/len(data), len(val)/len(data), len(test)/len(data))
ds = datasets.DatasetDict({
    'train': train,
    'eval': val,
    'test': test,
    'prompt': prompt})
ds

177741 19749 21944 0.809997539123381 0.08999972656926457 0.10000273430735437


DatasetDict({
    train: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'length'],
        num_rows: 177741
    })
    eval: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'length'],
        num_rows: 19749
    })
    test: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'length'],
        num_rows: 21944
    })
    prompt: Dataset({
        features: ['prompt'],
        num_rows: 4
    })
})

## Save

In [17]:
ds.save_to_disk('/mnt/data/sonia/datasets/honeygan/may6.dat')

Saving the dataset (0/1 shards):   0%|          | 0/177741 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19749 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21944 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

In [14]:
import os
p='/mnt/data/sonia/honeygan/may6.dat'

if 'dataset_dict.json' in os.listdir(p):
    dataset = datasets.DatasetDict({})
    for f in os.listdir(p):
        if f.endswith('.json'): continue
        dataset[f] = datasets.load_from_disk(os.path.join('/mnt/data/sonia/honeygan/may6.dat', f))
        
dataset

DatasetDict({
    eval: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'length'],
        num_rows: 19749
    })
    prompt: Dataset({
        features: ['prompt'],
        num_rows: 4
    })
    train: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'length'],
        num_rows: 177741
    })
    test: Dataset({
        features: ['os_generic', 'ip_str', 'port_str', 'length'],
        num_rows: 21944
    })
})

In [15]:
dataset['prompt']['prompt']

['a', ' at IP', ', port', '']

In [16]:
feats = list(dataset['train'].features)
{k:v for k,v in zip(list(range(len(feats))), feats)}

{0: 'os_generic', 1: 'ip_str', 2: 'port_str', 3: 'length'}

In [11]:
from transformers import AutoTokenizer
tok_path = '/mnt/data/zoo/llama2/llama2-7b-hf/'
tokenizer = AutoTokenizer.from_pretrained(
    tok_path,
    cache_dir=None,
    padding_side="right",
    use_fast=False, # Fast tokenizer giving issues.
    tokenizer_type='llama',
    trust_remote_code=True,
)

# Adult

In [1]:
import pandas as pd
import numpy as np
import json
import datasets
np.random.seed(42)

datapath = '/mnt/data/sonia/datasets/adult/adult.csv'
df = pd.read_csv(datapath)
print(df.isna().sum())
df.head()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
keep_cols = ['age', 'sex', 'native-country', 'education', 'occupation', 'income']
df = df[keep_cols]
df['native-country'] = df.apply(lambda x: ' '.join(x['native-country'].strip().split('-')), axis=1)
df['age'] = df.apply(lambda x: str(x['age']), axis=1)
df['sex'] = df.apply(lambda x: x['sex'].strip(), axis=1)
df['education'] = df.apply(lambda x: x['education'].strip(), axis=1)
df['occupation'] = df.apply(lambda x: ' '.join(x['occupation'].strip().split('-')), axis=1)
inc_map = {' <=50K': 'under 50K', ' >50K': 'over 50K'}
df['income'] = df.apply(lambda x: inc_map[x['income']], axis=1)
df.nunique(), df.head()

(age               74
 sex                2
 native-country    42
 education         16
 occupation        15
 income             2
 dtype: int64,
   age     sex native-country  education         occupation     income
 0  39    Male  United States  Bachelors       Adm clerical  under 50K
 1  50    Male  United States  Bachelors    Exec managerial  under 50K
 2  38    Male  United States    HS-grad  Handlers cleaners  under 50K
 3  53    Male  United States       11th  Handlers cleaners  under 50K
 4  28  Female           Cuba  Bachelors     Prof specialty  under 50K)

In [27]:
df['occupation'].unique()

array(['Adm clerical', 'Exec managerial', 'Handlers cleaners',
       'Prof specialty', 'Other service', 'Sales', 'Craft repair',
       'Transport moving', 'Farming fishing', 'Machine op inspct',
       'Tech support', '?', 'Protective serv', 'Armed Forces',
       'Priv house serv'], dtype=object)

In [3]:
prompt = [
    "This person's age is",
    " sex is",
    " and country is",
    '. Education level is',
    ' occupation is',
    ' and income is',
    ""
]
prompt = datasets.Dataset.from_dict({'prompt': prompt})
prompt[:]

{'prompt': ["This person's age is",
  ' sex is',
  ' and country is',
  '. Education level is',
  ' occupation is',
  ' and income is',
  '']}

In [4]:
data = datasets.Dataset.from_pandas(df)
prompt_len = sum([len(chunk) for chunk in prompt])
data = data.map(lambda x: {'length': sum([len(col) for col in x.values()]) + prompt_len})
data[:5]

Map:   0%|          | 0/48842 [00:00<?, ? examples/s]

{'age': ['39', '50', '38', '53', '28'],
 'sex': ['Male', 'Male', 'Male', 'Male', 'Female'],
 'native-country': ['United States',
  'United States',
  'United States',
  'United States',
  'Cuba'],
 'education': ['Bachelors', 'Bachelors', 'HS-grad', '11th', 'Bachelors'],
 'occupation': ['Adm clerical',
  'Exec managerial',
  'Handlers cleaners',
  'Handlers cleaners',
  'Prof specialty'],
 'income': ['under 50K', 'under 50K', 'under 50K', 'under 50K', 'under 50K'],
 'length': [56, 59, 59, 56, 51]}

In [5]:
from transformers import AutoTokenizer
vocab_masks = {}
tokenizer = AutoTokenizer.from_pretrained("/mnt/data/zoo/llama2/llama2-7b-hf")
for i, column in enumerate(keep_cols):
    column=df[column].unique().tolist()
    tokenized = [tokenizer(x, add_special_tokens=False)['input_ids'] for x in column]
    unique_toks = list(set(sum(tokenized, []))) + [0]

    vocab_size = 32000
    vec = np.zeros(vocab_size, dtype=int)
    vec[unique_toks] = 1
    vocab_masks[str(i+1)] = vec
vocab_masks = datasets.Dataset.from_dict(vocab_masks)

In [6]:
split = data.train_test_split(test_size=0.1)
trainval, test = split['train'], split['test']
split = trainval.train_test_split(test_size=0.1)
train, val = split['train'], split['test']
print(len(train), len(val), len(test), len(train)/len(data), len(val)/len(data), len(test)/len(data))
ds = datasets.DatasetDict({
    'train': train,
    'eval': val,
    'test': test,
    'prompt': prompt,
    'vocab_masks': vocab_masks})
ds

39561 4396 4885 0.8099791163343024 0.09000450432005241 0.10001637934564514


DatasetDict({
    train: Dataset({
        features: ['age', 'sex', 'native-country', 'education', 'occupation', 'income', 'length'],
        num_rows: 39561
    })
    eval: Dataset({
        features: ['age', 'sex', 'native-country', 'education', 'occupation', 'income', 'length'],
        num_rows: 4396
    })
    test: Dataset({
        features: ['age', 'sex', 'native-country', 'education', 'occupation', 'income', 'length'],
        num_rows: 4885
    })
    prompt: Dataset({
        features: ['prompt'],
        num_rows: 7
    })
    vocab_masks: Dataset({
        features: ['1', '2', '3', '4', '5', '6'],
        num_rows: 32000
    })
})

In [25]:
# pd.DataFrame(ds['vocab_masks'][:])['1'].to_numpy()
pd.DataFrame(ds['vocab_masks'][:]).to_numpy().T.shape

(6, 32000)

In [7]:
ds.save_to_disk('/mnt/data/sonia/datasets/adult/may8.dat')

Saving the dataset (0/1 shards):   0%|          | 0/39561 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4396 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4885 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32000 [00:00<?, ? examples/s]

# Diabetes

In [1]:
import pandas as pd
import numpy as np
import json
import datasets
np.random.seed(42)
datasethf = datasets.load_dataset("imodels/diabetes-readmission")
df_train = datasethf['train'].to_pandas()
df_test   = datasethf['test'].to_pandas()
df_train.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,change,diabetesMed,...,glyburide-metformin:Up,A1Cresult:>7,A1Cresult:>8,A1Cresult:None,A1Cresult:Norm,max_glu_serum:>200,max_glu_serum:>300,max_glu_serum:None,max_glu_serum:Norm,readmitted
0,2.0,38.0,3.0,27.0,0.0,1.0,2.0,7.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,4.0,48.0,0.0,11.0,0.0,0.0,0.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2,2.0,28.0,0.0,15.0,0.0,3.0,4.0,9.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
3,4.0,44.0,0.0,10.0,0.0,0.0,0.0,7.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
4,3.0,54.0,0.0,8.0,0.0,0.0,0.0,8.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0


In [2]:
cols = df_train.columns
one_hot = set([c.split(':')[0] for c in cols if ':' in c])
one_hot

{'A1Cresult',
 'acarbose',
 'admission_source_id',
 'admission_type_id',
 'age',
 'chlorpropamide',
 'diag_1',
 'diag_2',
 'diag_3',
 'discharge_disposition_id',
 'gender',
 'glimepiride',
 'glipizide',
 'glyburide',
 'glyburide-metformin',
 'insulin',
 'max_glu_serum',
 'medical_specialty',
 'metformin',
 'miglitol',
 'nateglinide',
 'pioglitazone',
 'race',
 'repaglinide',
 'rosiglitazone',
 'tolazamide'}

In [3]:
drop_cols = []
for name in sorted(one_hot):
    print(name)
    df_train[name] = pd.from_dummies(df_train[[c for c in df_train.columns if name+':' in c]], sep=":", default_category="unknown")[name]
    df_test[name]  = pd.from_dummies( df_test[[c for c in df_train.columns if name+':' in c]], sep=":", default_category="unknown")[name]
    drop_cols.extend([c for c in df_train.columns if name+':' in c])
    drop_cols.extend([c for c in  df_test.columns if name+':' in c])

df_train.drop(drop_cols, axis=1, inplace=True)
df_test.drop( drop_cols, axis=1, inplace=True)
df_train.head()

A1Cresult
acarbose
admission_source_id
admission_type_id
age
chlorpropamide
diag_1
diag_2
diag_3
discharge_disposition_id
gender
glimepiride
glipizide
glyburide
glyburide-metformin
insulin
max_glu_serum
medical_specialty
metformin
miglitol
nateglinide
pioglitazone
race
repaglinide
rosiglitazone
tolazamide


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,change,diabetesMed,...,max_glu_serum,medical_specialty,metformin,miglitol,nateglinide,pioglitazone,race,repaglinide,rosiglitazone,tolazamide
0,2.0,38.0,3.0,27.0,0.0,1.0,2.0,7.0,1.0,1.0,...,,Orthopedics,No,No,No,Steady,Caucasian,No,No,No
1,4.0,48.0,0.0,11.0,0.0,0.0,0.0,9.0,0.0,0.0,...,,InternalMedicine,No,No,No,No,Caucasian,No,No,No
2,2.0,28.0,0.0,15.0,0.0,3.0,4.0,9.0,0.0,1.0,...,,unknown,No,No,No,No,AfricanAmerican,No,No,No
3,4.0,44.0,0.0,10.0,0.0,0.0,0.0,7.0,0.0,1.0,...,,unknown,No,No,No,No,Caucasian,No,No,No
4,3.0,54.0,0.0,8.0,0.0,0.0,0.0,8.0,1.0,1.0,...,,unknown,No,No,No,No,Caucasian,No,No,No


In [4]:
keep_cols = ['readmitted', 'time_in_hospital', 'diag_1', 'num_procedures', 'num_medications', 
             'number_outpatient', 'number_emergency', 'age', 'medical_specialty']
df_train = df_train[keep_cols]
df_test = df_test[keep_cols]

In [5]:
readmitted = {0: 'not readmitted', 1:'readmitted'}
medical_specialty = {'InternalMedicine':'internal medicine doctor', 'Family/GeneralPractice': 'family doctor', 'Emergency/Trauma':'emergency doctor',
                     'Surgery-General':'surgeon', 'ObstetricsandGynecology':'ob-gyn', 'Surgery-Cardiovascular/Thoracic':'cardiovascular surgeon'}
diag_1 = {'Non-diabetes endocrine/metabolic': 'endocrine'}
age = {'[20-50)':'20 to 50', '[50-70)':'50 to 70', '[10-20)':'10 to 20', '[0-10)':'0 to 10'}

df_train['readmitted'] = df_train.apply(lambda x: readmitted[x['readmitted']], axis=1)
df_train['medical_specialty'] = df_train.apply(lambda x: medical_specialty[x['medical_specialty']] 
                                               if x['medical_specialty'] in medical_specialty else x['medical_specialty'], axis=1)
df_train['diag_1'] = df_train.apply(lambda x: diag_1[x['diag_1']] 
                                               if x['diag_1'] in diag_1 else x['diag_1'], axis=1)
df_train['age'] = df_train.apply(lambda x: age[x['age']] 
                                               if x['age'] in age else x['age'], axis=1)
df_train['time_in_hospital'] = df_train.apply(lambda x: str(int(x['time_in_hospital'])), axis=1)
df_train['num_procedures'] = df_train.apply(lambda x: str(int(x['num_procedures'])), axis=1)
df_train['num_medications'] = df_train.apply(lambda x: str(int(x['num_medications'])), axis=1)
df_train['number_outpatient'] = df_train.apply(lambda x: str(int(x['number_outpatient'])), axis=1)
df_train['number_emergency'] = df_train.apply(lambda x: str(int(x['number_emergency'])), axis=1)
# df_train['number_diagnoses'] = df_train.apply(lambda x: str(int(x['number_diagnoses'])), axis=1)

df_test['readmitted'] = df_test.apply(lambda x: readmitted[x['readmitted']], axis=1)
df_test['medical_specialty'] = df_test.apply(lambda x: medical_specialty[x['medical_specialty']] 
                                               if x['medical_specialty'] in medical_specialty else x['medical_specialty'], axis=1)
df_test['diag_1'] = df_test.apply(lambda x: diag_1[x['diag_1']] 
                                               if x['diag_1'] in diag_1 else x['diag_1'], axis=1)
df_test['age'] = df_test.apply(lambda x: age[x['age']] 
                                               if x['age'] in age else x['age'], axis=1)
df_test['time_in_hospital'] = df_test.apply(lambda x: str(int(x['time_in_hospital'])), axis=1)
df_test['num_procedures'] = df_test.apply(lambda x: str(int(x['num_procedures'])), axis=1)
df_test['num_medications'] = df_test.apply(lambda x: str(int(x['num_medications'])), axis=1)
df_test['number_outpatient'] = df_test.apply(lambda x: str(int(x['number_outpatient'])), axis=1)
df_test['number_emergency'] = df_test.apply(lambda x: str(int(x['number_emergency'])), axis=1)
# df_test['number_diagnoses'] = df_test.apply(lambda x: str(int(x['number_diagnoses'])), axis=1)
df_train.head()

Unnamed: 0,readmitted,time_in_hospital,diag_1,num_procedures,num_medications,number_outpatient,number_emergency,age,medical_specialty
0,not readmitted,2,Injury,3,27,0,1,70+,Orthopedics
1,not readmitted,4,endocrine,0,11,0,0,70+,internal medicine doctor
2,readmitted,2,Diabetes,0,15,0,3,20 to 50,unknown
3,not readmitted,4,Genitourinary,0,10,0,0,70+,unknown
4,not readmitted,3,Other,0,8,0,0,70+,unknown


In [6]:
keep_cols = ['age', 'diag_1', 'time_in_hospital', 'medical_specialty', 'num_procedures', 'num_medications', 'number_emergency', 'number_outpatient', 'readmitted']
df_train=df_train[keep_cols]
df_test =df_test[keep_cols]
prompt = [
    'The patient aged', #age
    ' received a', #diag_1
    ' diagnosis after ', #time_in_hospital
    ' days in hospital with a', #medical_specialty
    ' where they underwent', #num_procedures
    ' procedures and were prescribed', #num_medications
    ' medications. In the past year they had', #number_emergency
    ' emergency room visits', #number_outpatient
    ' outpatient appointments and were ultimately', #readmitted
    ''
]
prompt = datasets.Dataset.from_dict({'prompt': prompt})
prompt[:]

{'prompt': ['The patient aged',
  ' received a',
  ' diagnosis after ',
  ' days in hospital with a',
  ' where they underwent',
  ' procedures and were prescribed',
  ' medications. In the past year they had',
  ' emergency room visits',
  ' outpatient appointments and were ultimately',
  '']}

In [7]:
prompt_len = sum([len(chunk) for chunk in prompt])
train = datasets.Dataset.from_pandas(df_train)
train = train.map(lambda x: {'length': sum([len(col) for col in x.values()]) + prompt_len})
test = datasets.Dataset.from_pandas(df_test)
test = test.map(lambda x: {'length': sum([len(col) for col in x.values()]) + prompt_len})
train[:5]

Map:   0%|          | 0/81410 [00:00<?, ? examples/s]

Map:   0%|          | 0/20353 [00:00<?, ? examples/s]

{'age': ['70+', '70+', '20 to 50', '70+', '70+'],
 'diag_1': ['Injury', 'endocrine', 'Diabetes', 'Genitourinary', 'Other'],
 'time_in_hospital': ['2', '4', '2', '4', '3'],
 'medical_specialty': ['Orthopedics',
  'internal medicine doctor',
  'unknown',
  'unknown',
  'unknown'],
 'num_procedures': ['3', '0', '0', '0', '0'],
 'num_medications': ['27', '11', '15', '10', '8'],
 'number_emergency': ['1', '0', '3', '0', '0'],
 'number_outpatient': ['0', '0', '0', '0', '0'],
 'readmitted': ['not readmitted',
  'not readmitted',
  'readmitted',
  'not readmitted',
  'not readmitted'],
 'length': [50, 66, 49, 53, 44]}

In [8]:
from transformers import AutoTokenizer
vocab_masks = {}
for i, column in enumerate(keep_cols):
    tokenizer = AutoTokenizer.from_pretrained("/mnt/data/zoo/llama2/llama2-7b-hf")
    column=df_train[column].unique().tolist()
    tokenized = [tokenizer(x, add_special_tokens=False)['input_ids'] for x in column]
    unique_toks = list(set(sum(tokenized, []))) + [0] # so the head can add padding too

    vocab_size = 32000
    vec = np.zeros(vocab_size, dtype=int)
    vec[unique_toks] = 1
    vocab_masks[str(i+1)] = vec
vocab_masks = datasets.Dataset.from_dict(vocab_masks)

In [9]:
split = train.train_test_split(test_size=0.1)
train, val = split['train'], split['test']
print(len(train), len(val), len(test))
ds = datasets.DatasetDict({
    'train': train,
    'eval': val,
    'test': test,
    'prompt': prompt,
    'vocab_masks': vocab_masks})
ds

73269 8141 20353


DatasetDict({
    train: Dataset({
        features: ['age', 'diag_1', 'time_in_hospital', 'medical_specialty', 'num_procedures', 'num_medications', 'number_emergency', 'number_outpatient', 'readmitted', 'length'],
        num_rows: 73269
    })
    eval: Dataset({
        features: ['age', 'diag_1', 'time_in_hospital', 'medical_specialty', 'num_procedures', 'num_medications', 'number_emergency', 'number_outpatient', 'readmitted', 'length'],
        num_rows: 8141
    })
    test: Dataset({
        features: ['age', 'diag_1', 'time_in_hospital', 'medical_specialty', 'num_procedures', 'num_medications', 'number_emergency', 'number_outpatient', 'readmitted', 'length'],
        num_rows: 20353
    })
    prompt: Dataset({
        features: ['prompt'],
        num_rows: 10
    })
    vocab_masks: Dataset({
        features: ['1', '2', '3', '4', '5', '6', '7', '8', '9'],
        num_rows: 32000
    })
})

In [10]:
ds.save_to_disk('/mnt/data/sonia/datasets/diabetes/may10.dat')

Saving the dataset (0/1 shards):   0%|          | 0/73269 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32000 [00:00<?, ? examples/s]