In [1]:
import os
# Expose only GPU 0 to this process. This is set in the first cell, before
# torch is imported, so that it is in place before CUDA is initialised.
VISIBLE_GPUS = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = VISIBLE_GPUS

In [2]:
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
# To force CPU execution, set `dev = "cpu"` instead.
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

print(device)

cuda


In [3]:
from sklearn.preprocessing import LabelEncoder

# Class names; `len(class_keys)` later sets the number of output labels of the model.
class_keys = ["Garnett", "McDuff", "PV", "Katz", "Hogarth"]

# Fit the encoder on the class names and print the integer id given to each one
# (in the order the names are listed above).
le = LabelEncoder().fit(class_keys)
print(le.transform(class_keys))

[0 3 4 2 1]


In [4]:
import pandas as pd 

# Directory holding the pickled train/val/test splits. Defined once so the
# notebook can be pointed at another copy of the data by editing a single line.
DATA_DIR = "/home/kkatsy/litMT/experiment_dataset"

# NOTE: unpickling can execute code embedded in the file, so only load pickles
# that come from a trusted source.
train_df = pd.read_pickle(f"{DATA_DIR}/aligned_train_df.pickle")
test_df = pd.read_pickle(f"{DATA_DIR}/experiment_test_df.pickle")
val_df = pd.read_pickle(f"{DATA_DIR}/experiment_val_df.pickle")

def _to_records(df):
    """Return one ``{'label': ..., 'text': ...}`` dict per row of ``df``.

    ``label`` is taken from the ``labels`` column and ``text`` from the
    ``concat`` column. Selecting the two columns and converting them in one
    call avoids building a Series for every row, as ``iterrows()`` does.
    """
    return (
        df[['labels', 'concat']]
        .rename(columns={'labels': 'label', 'concat': 'text'})
        .to_dict('records')
    )


# One list of records per split, keyed by split name.
sentences = {
    'train': _to_records(train_df),
    'test': _to_records(test_df),
    'val': _to_records(val_df),
}


print('train size: ', len(sentences['train']))
print('val size: ', len(sentences['val']))
print('test size: ', len(sentences['test']))

train size:  (17735, 8)
val size:  (1175, 8)
test size:  (1175, 8)
train size:  17735
val size:  1175
test size:  1175


In [27]:
import datasets

# Wrap each split's records in a DataFrame and convert it to a `datasets.Dataset`.
# The splits are built in the order train, val, test.
train_dataset, val_dataset, test_dataset = (
    datasets.Dataset.from_pandas(pd.DataFrame(data=sentences[split_name]))
    for split_name in ('train', 'val', 'test')
)

In [28]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint. Kept in one constant so the name is
# written only once instead of being repeated as a string literal.
BERT_MODEL = "bert-base-multilingual-cased"

# Tokenizer matching the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

In [29]:
def preprocess_function(datum, sep=' <SEP> ', max_length=512):
    """Tokenize one example as a (source, target) text pair.

    Args:
        datum: Mapping whose ``'text'`` entry holds two segments joined by ``sep``.
        sep: Separator between the two segments. Defaults to ``' <SEP> '``.
        max_length: Length every encoding is padded or truncated to.
            Defaults to 512.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, padded to
        ``max_length`` and truncated with the ``'longest_first'`` strategy.

    Raises:
        ValueError: If ``datum['text']`` does not contain exactly one ``sep``.
            The original tuple-unpacking raised the same exception type, but
            with an opaque "values to unpack" message.
    """
    parts = datum['text'].split(sep)
    if len(parts) != 2:
        raise ValueError(
            f"expected exactly one {sep!r} separator in 'text', "
            f"found {len(parts) - 1}"
        )
    src, tgt = parts
    return tokenizer(src, tgt,
                     padding='max_length',
                     max_length=max_length,
                     truncation='longest_first',
                     add_special_tokens=True
                     )

# Tokenize every example of each split (train, then val, then test).
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/17735 [00:00<?, ? examples/s]

Map:   0%|          | 0/1175 [00:00<?, ? examples/s]

Map:   0%|          | 0/1175 [00:00<?, ? examples/s]

In [None]:
# Columns exposed as torch tensors; every other column is hidden from batches.
MODEL_COLUMNS = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']


def _as_torch_dataset(ds):
    """Rename ``label`` to ``labels`` and expose ``MODEL_COLUMNS`` as torch tensors.

    The rename is skipped when ``label`` is no longer present, so re-running
    this cell on already-converted splits does not fail. A new formatted
    dataset is returned; ``ds`` itself is not modified.
    """
    if 'label' in ds.column_names:
        ds = ds.rename_column('label', 'labels')
    return ds.with_format(type='torch', columns=MODEL_COLUMNS)


tokenized_train = _as_torch_dataset(tokenized_train)
tokenized_val = _as_torch_dataset(tokenized_val)
tokenized_test = _as_torch_dataset(tokenized_test)


In [66]:
from torch.utils.data import DataLoader

# Same batch size for every split; only the training loader is shuffled.
BATCH_SIZE = 12

train_dataloader = DataLoader(tokenized_train, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(tokenized_val, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(tokenized_test, batch_size=BATCH_SIZE)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the checkpoint named by BERT_MODEL (defined in the tokenizer cell) with
# one output label per entry of `class_keys`, then move it to the chosen device.
# Using the constant rather than a repeated literal keeps the checkpoint name
# in a single place.
model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=len(class_keys))
model.to(device)

In [None]:
# Bare expression: the notebook displays the model's repr as this cell's output.
model

In [42]:
from torch.optim import AdamW

# Learning rate; also recorded in the W&B run config.
lr = 5e-6

# AdamW hyperparameters (original notes: args.learning_rate default is 5e-5,
# args.adam_epsilon default is 1e-8).
adamw_kwargs = dict(lr=lr, eps=1e-8, weight_decay=0.1)
optimizer = AdamW(model.parameters(), **adamw_kwargs)

In [44]:
from transformers import get_scheduler

num_epochs = 10
# The scheduler is stepped once per batch, so the total is batches-per-epoch × epochs.
num_training_steps = num_epochs * len(train_dataloader)

# Linear schedule with no warm-up, spanning the whole training run.
scheduler_kwargs = dict(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
lr_scheduler = get_scheduler(name="linear", **scheduler_kwargs)

In [None]:
# START-UP WANDB

import wandb

# Hyperparameters recorded with the run.
wandb_config = {
    "learning_rate": lr,
    "epochs": num_epochs,
}

# `project` (where the run is logged) and `name` are left blank in this notebook.
run = wandb.init(project='', name='', config=wandb_config)

In [45]:
from tqdm.auto import tqdm
import wandb  # also imported in the W&B start-up cell; repeated so this cell runs on its own


progress_bar = tqdm(range(num_training_steps))

# Freeze every parameter under `model.bert`, so only the parameters outside it
# stay trainable.
for param in model.bert.parameters():
    param.requires_grad = False

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor in the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        # Record the training loss in the active W&B run, if there is one, so
        # the run holds more than its hyperparameter config.
        if wandb.run is not None:
            wandb.log({"train/loss": loss.item(), "epoch": epoch})

  0%|          | 0/2956 [00:00<?, ?it/s]

In [68]:
import evaluate

# Accuracy over the test split, accumulated batch by batch.
metric = evaluate.load("accuracy")

model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.6}

In [None]:
# Optional: save the trained model. `PATH` is not defined anywhere in this
# notebook, so set it before uncommenting the next line.
# torch.save(model, PATH)

# Close the W&B run.
wandb.finish()