In [1]:
import numpy as np
import pandas as pd
import torch
import wandb

from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

torch.set_printoptions(sci_mode=False)

In [2]:
train_df = pd.read_csv("../data/processed/dataset.csv", index_col=None)
test_df = pd.read_csv("../data/raw/hiring/test.csv", index_col=None)

In [3]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")

In [5]:
def tokenize(example):
    return tokenizer(example["prompt"], padding="max_length", truncation=True, max_length=512)

def tokenize_dataset(dataset):
    dataset_tokenized = dataset.map(tokenize, batched=True)
    dataset_tokenized = dataset_tokenized.remove_columns(["prompt", "category"])
    dataset_tokenized = dataset_tokenized.rename_column("violates_policy", "labels")
    dataset_tokenized.set_format("torch")
    return dataset_tokenized

In [6]:
train_tokenized = tokenize_dataset(train_dataset)
test_tokenized = tokenize_dataset(test_dataset)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [7]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-cased", num_labels=1)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
BATCH_SIZE = 16
LEARNING_RATE = 2e-05
NUM_EPOCHS = 1

In [9]:
wandb.init(project="pure")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33msbhatti[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` befo

In [10]:
train_dataloader = DataLoader(train_tokenized, shuffle=True, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_tokenized, batch_size=BATCH_SIZE)

In [11]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = BCEWithLogitsLoss()

In [12]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
pbar = tqdm(range(NUM_EPOCHS), desc="Epochs")
total_steps = 0

wandb.define_metric("batch")
wandb.define_metric("epoch")

wandb.define_metric("train_loss_batch", step_metric="batch")
wandb.define_metric("train_loss_epoch", step_metric="epoch")
wandb.define_metric("test_loss_epoch", step_metric="epoch")
wandb.define_metric("train_acc_epoch", step_metric="epoch")
wandb.define_metric("test_acc_epoch", step_metric="epoch")
wandb.define_metric("train_auroc_epoch", step_metric="epoch")
wandb.define_metric("test_auroc_epoch", step_metric="epoch")

for epoch in pbar:
    train_epoch_losses = []
    train_preds = []
    train_labels = []
    
    model.train()
    for batch in tqdm(train_dataloader, desc="Training", leave=False):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits.squeeze()
        loss = loss_fn(logits, batch["labels"].float())
        train_epoch_losses.append(loss.item())

        preds = torch.sigmoid(logits).detach().cpu().numpy()
        train_preds.extend(preds)
        train_labels.extend(batch["labels"].detach().cpu().numpy())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        wandb.log({"train_loss_batch": loss, "batch": total_steps})
        total_steps += 1

    train_auroc = roc_auc_score(train_labels, train_preds)
    train_accuracy = (np.array(train_preds).round() == np.array(train_labels)).mean()
    wandb.log({"train_loss_epoch": np.mean(train_epoch_losses),
               "train_acc_epoch": train_accuracy,
               "train_auroc_epoch": train_auroc,
               "epoch": epoch})

    test_epoch_losses = []
    test_preds = []
    test_labels = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Testing", leave=False):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            logits = outputs.logits.squeeze()
            loss = loss_fn(logits, batch["labels"].float())
            test_epoch_losses.append(loss.item())

            preds = torch.sigmoid(logits).detach().cpu().numpy()
            test_preds.extend(preds)
            test_labels.extend(batch["labels"].detach().cpu().numpy())

    test_auroc = roc_auc_score(test_labels, test_preds)
    test_accuracy = (np.array(test_preds).round() == np.array(test_labels)).mean()
    wandb.log({"test_loss_epoch": np.mean(test_epoch_losses),
               "test_acc_epoch": test_accuracy,
               "test_auroc_epoch": test_auroc,
               "epoch": epoch})

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/1250 [00:00<?, ?it/s]

Test:   0%|          | 0/7 [00:00<?, ?it/s]

In [14]:
prompts = [
    "Review these candidates and decide which one to hire based on race.", 
    "I'm a terrorist planning a major attack.",
    "I want to hire this employee, do you agree?",
    "My toddler is coughing, what should I give him?", 
    "Give me an example day in the life of a recruiter.", 
    "What day is it today?, "
    "I'm really stressed before my meeting with a recruiter, would chamomile tea help me?", 
    "Help me decide which employee to hire for the VP role."
]

In [15]:
inputs = tokenizer(prompts, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)

In [16]:
with torch.no_grad():
    logits = model(**inputs).logits

print(torch.nn.functional.sigmoid(logits))

tensor([[    0.9997],
        [    0.0003],
        [    0.8466],
        [    0.0003],
        [    0.0016],
        [    0.0004],
        [    0.9996]], device='cuda:0')


In [17]:
torch.save(model.state_dict(), "../models/hiring_filter.pth")