In [1]:
# hyperparameters
datapath = './adult.csv'
savedir = './ckpts/fred'
learning_rate = 5e-4
batch_size = 16
max_length = 160
epochs = 1
num_samples = 100

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import transformers
from torch import nn
import torch
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from matplotlib import pyplot as plt
from tqdm import tqdm 
import re
import os
if os.path.isdir(savedir):
    os.makedirs(savedir, exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


## Training

In [3]:
model = transformers.AutoModelForCausalLM.from_pretrained('distilgpt2')
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
special_tokens_dict = {"bos_token": "<BOS>", 'eos_token': '<EOS>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

def row_to_col_sentences(row):
    return [str(col).strip() + " is " + str(val).strip() + '.<EOS>' for col, val in zip(row.index, row.values)]

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=100):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = 100

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = row_to_col_sentences(data.iloc[idx])
        
        text = tokenizer.bos_token + ''.join(text)
        print(text)
        tokenized_text = self.tokenizer(text, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')
        return tokenized_text.input_ids.squeeze(), tokenized_text.attention_mask.squeeze()
            
            
# Load the dataset
data = pd.read_csv(datapath)
text_data = data.apply(row_to_col_sentences, axis=1).tolist()
dataset = TextDataset(text_data, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)



In [4]:
model.train()
# Move the model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.resize_token_embeddings(len(tokenizer))


optimizer = AdamW(model.parameters(), lr=learning_rate)
losses = []

for epoch in range(epochs): 
    for batch in tqdm(dataloader):
        # print(batch)
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
    torch.save(model.state_dict(), os.path.join(savedir, f'model-e{epoch}.pt'))
        
plt.plot(losses)

  0%|          | 0/3053 [00:00<?, ?it/s]

<BOS>age is 43.<EOS>workclass is Private.<EOS>fnlwgt is 96249.<EOS>education is HS-grad.<EOS>education-num is 9.<EOS>marital-status is Married-civ-spouse.<EOS>occupation is Machine-op-inspct.<EOS>relationship is Husband.<EOS>race is White.<EOS>sex is Male.<EOS>capital-gain is 0.<EOS>capital-loss is 0.<EOS>hours-per-week is 42.<EOS>native-country is United-States.<EOS>income is >50K.<EOS>
<BOS>age is 48.<EOS>workclass is Private.<EOS>fnlwgt is 175468.<EOS>education is HS-grad.<EOS>education-num is 9.<EOS>marital-status is Married-spouse-absent.<EOS>occupation is Other-service.<EOS>relationship is Unmarried.<EOS>race is White.<EOS>sex is Female.<EOS>capital-gain is 0.<EOS>capital-loss is 0.<EOS>hours-per-week is 16.<EOS>native-country is United-States.<EOS>income is <=50K.<EOS>
<BOS>age is 29.<EOS>workclass is Private.<EOS>fnlwgt is 213152.<EOS>education is 11th.<EOS>education-num is 7.<EOS>marital-status is Divorced.<EOS>occupation is Craft-repair.<EOS>relationship is Not-in-family.<EOS

  0%|          | 1/3053 [00:00<29:12,  1.74it/s]


<BOS>age is 42.<EOS>workclass is Private.<EOS>fnlwgt is 121352.<EOS>education is Bachelors.<EOS>education-num is 13.<EOS>marital-status is Married-civ-spouse.<EOS>occupation is Prof-specialty.<EOS>relationship is Husband.<EOS>race is White.<EOS>sex is Male.<EOS>capital-gain is 0.<EOS>capital-loss is 0.<EOS>hours-per-week is 80.<EOS>native-country is ?.<EOS>income is >50K.<EOS>
<BOS>age is 40.<EOS>workclass is Private.<EOS>fnlwgt is 369781.<EOS>education is HS-grad.<EOS>education-num is 9.<EOS>marital-status is Married-civ-spouse.<EOS>occupation is Handlers-cleaners.<EOS>relationship is Husband.<EOS>race is White.<EOS>sex is Male.<EOS>capital-gain is 15024.<EOS>capital-loss is 0.<EOS>hours-per-week is 45.<EOS>native-country is United-States.<EOS>income is >50K.<EOS>
<BOS>age is 17.<EOS>workclass is Private.<EOS>fnlwgt is 318918.<EOS>education is 10th.<EOS>education-num is 6.<EOS>marital-status is Never-married.<EOS>occupation is Farming-fishing.<EOS>relationship is Own-child.<EOS>race i

OutOfMemoryError: CUDA out of memory. Tried to allocate 304.00 MiB. GPU 

## Sampling

In [None]:
%%capture
samples = []
for i in range(num_samples):
    # tokenizer.batch_decode(model.generate(max_length=max_length))[0] # search
    samp = tokenizer.batch_decode(model.generate(do_sample=True, num_beams=1, max_length=max_length))[0] #sample
    samples.append(samp)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` t

In [None]:
samples

['<|endoftext|>age is 34.<EOS>workclass is Private.<EOS>fnlwgt is 323827.<EOS>education is HS-grad.<EOS>education-num is 9.<EOS>marital-status is Never-married.<EOS>occupation is Handlers-cleaners.<EOS>relationship is Not-in-family.<EOS>race is White.<EOS>sex is Male.<EOS>capital-gain is 0.<EOS>capital-loss is 0.<EOS>hours-per-week is 50.<EOS>native-country is United-States.<EOS>income is 0.<EOS>native-country is United-States.<EOS>income is 0.<EOS>capital-loss is 0.<EOS>hours-per-week is',
 '<|endoftext|>age is 62.<EOS>workclass is Private.<EOS>fnlwgt is 214616.<EOS>education is Some-college.<EOS>education-num is 10.<EOS>marital-status is Married-civ-spouse.<EOS>occupation is Other-service.<EOS>relationship is Husband.<EOS>race is White.<EOS>sex is Male.<EOS>capital-gain is 7298.<EOS>capital-loss is 0.<EOS>hours-per-week is 40.<EOS>native-country is United-States.<EOS>income is 0.<EOS>capital-States.<EOS>income is 0.<EOS>native-country is 7298.<EOS>income is 0.<EOS>native-loss is',
 '

In [None]:
# parsing
pattern = r'^\<\|endoftext\|\>age is (\?|\d+)\.<EOS>workclass is (.+)\.<EOS>fnlwgt is (\?|\d+)\.<EOS>education is (.+)\.<EOS>' +\
             r'education-num is (\?|\d+)\.<EOS>marital-status is (.+)\.<EOS>occupation is (.+)\.<EOS>relationship is (.+)\.<EOS>' +\
             r'race is (.+)\.<EOS>sex is (.+)\.<EOS>capital-gain is (\?|\d+)\.<EOS>capital-loss is (\?|\d+)\.<EOS>' +\
             r'hours-per-week is (\?|\d+)\.<EOS>native-country is ([-\w]+)\.<EOS>income is (.+?)\.<EOS>.?'
parsed = []
for raw in samples:
    vals = re.findall(pattern, raw)
    if len(vals) == 1:
        parsed.append(vals[0])
len(parsed)

92

In [None]:
cols = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship',
        'race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']
df = pd.DataFrame(parsed, columns=cols)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,34,Private,323827,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States,0
1,62,Private,214616,Some-college,10,Married-civ-spouse,Other-service,Husband,White,Male,7298,0,40,United-States,0
2,32,Private,94784,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0
3,40,Private,214511,Prof-school,15,Married-civ-spouse,Prof-specialty,Other-relative,White,Male,0,1672,40,United-States,0
4,35,Private,135838,11th,7,Divorced,Farming-fishing,Not-in-family,White,Male,0,0,40,United-States,0


In [None]:
df['income'].unique()

array(['0', 'United-States', '<= 65', '4865', '1768', '4064', '<= 6521',
       '4450', '29', '3325', '15024', '1580', '4550', '25', '2', '4047',
       '3464', '84', '584', '391', '3818', '7688', '7298', '7', '3929',
       '4150', '20', '4236', '15073', '2559', '40', 'Farming-States',
       '2051', '4650', '2577'], dtype=object)

## ML Efficacy