In [1]:
import os

# Limit the CUDA devices visible to this process to index 0. This cell runs
# before the cell that imports torch.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [2]:
import torch, datasets, transformers
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm


2024-04-16 13:47:35.145136: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

# To force CPU execution, override with: device = torch.device('cpu')
print(device)

cuda


In [4]:
# Fit the encoder on the five class names and print the integer id that
# each name is assigned.
class_keys = ["Garnett", "McDuff", "PV", "Katz", "Hogarth"]
le = LabelEncoder().fit(class_keys)
print(le.transform(class_keys))

[0 3 4 2 1]


In [5]:
# NOTE(review): absolute path on the original author's machine; point DATA_DIR
# somewhere else to run the notebook on another host.
DATA_DIR = "/home/kkatsy/litMT/experiment_dataset"


def _to_records(df):
    """Return one ``{'label': ..., 'text': ...}`` dict per row of ``df``.

    ``label`` is taken from the ``labels`` column and ``text`` from the
    ``concat`` column. The conversion is done column-wise rather than with
    ``iterrows()``, which builds a Series per row and is slow on large frames.
    """
    return (
        df[['labels', 'concat']]
        .rename(columns={'labels': 'label', 'concat': 'text'})
        .to_dict('records')
    )


# pd.read_pickle can execute arbitrary code while unpickling: only load
# files that come from a trusted source.
train_df = pd.read_pickle(f"{DATA_DIR}/aligned_train_df.pickle")
test_df = pd.read_pickle(f"{DATA_DIR}/experiment_test_df.pickle")
val_df = pd.read_pickle(f"{DATA_DIR}/experiment_val_df.pickle")

sentences = {
    'train': _to_records(train_df),
    'test': _to_records(test_df),
    'val': _to_records(val_df),
}


print('train size: ', len(sentences['train']))
print('val size: ', len(sentences['val']))
print('test size: ', len(sentences['test']))

train size:  17735
val size:  1175
test size:  1175


In [6]:
# Wrap each split's list of {'label', 'text'} records in a datasets.Dataset,
# going through a DataFrame as the intermediate container.
train_dataset, val_dataset, test_dataset = (
    datasets.Dataset.from_pandas(pd.DataFrame(data=sentences[split]))
    for split in ('train', 'val', 'test')
)

In [7]:
# Name of the pretrained checkpoint used by this notebook.
BERT_MODEL = "bert-base-multilingual-cased"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

In [8]:
# Set the transformers library's logging verbosity to the error level so the
# tokenization below is not interleaved with lower-severity messages.
transformers.logging.set_verbosity_error()

def preprocess_function(datum):
    """Tokenize one example's source/target pair as a two-segment input.

    ``datum['text']`` is expected to hold the two segments joined by the
    literal marker ``' <SEP> '``. The text is split on the FIRST occurrence
    only, so a marker that reappears later stays inside the second segment
    instead of breaking the two-way split.

    Args:
        datum: mapping with a ``'text'`` string entry.

    Returns:
        Whatever ``tokenizer`` returns for the pair, called with padding to
        ``max_length=512`` and ``'longest_first'`` truncation.

    Raises:
        ValueError: if the marker does not occur in ``datum['text']``.
    """
    src, marker, tgt = datum['text'].partition(' <SEP> ')
    if not marker:
        # Same exception type the original two-way unpacking produced, but
        # with a message that says what is actually wrong.
        raise ValueError("expected ' <SEP> ' between source and target in datum['text']")
    return tokenizer(src, tgt,
                     padding='max_length',
                     max_length=512,
                     truncation='longest_first',
                     add_special_tokens=True
                     )

# Tokenize every split example-by-example with the same preprocessing.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/17735 [00:00<?, ? examples/s]

Map:   0%|          | 0/1175 [00:00<?, ? examples/s]

Map:   0%|          | 0/1175 [00:00<?, ? examples/s]

In [9]:
def _to_torch_format(dataset):
    """Rename ``label`` to ``labels`` and expose the model columns as torch tensors."""
    dataset = dataset.rename_column('label', 'labels')
    dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    return dataset


tokenized_train = _to_torch_format(tokenized_train)
tokenized_val = _to_torch_format(tokenized_val)
tokenized_test = _to_torch_format(tokenized_test)

In [10]:
# All three loaders share one batch size; only the training loader shuffles.
_BATCH_SIZE = 12
train_dataloader = DataLoader(tokenized_train, batch_size=_BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(tokenized_val, batch_size=_BATCH_SIZE)
test_dataloader = DataLoader(tokenized_test, batch_size=_BATCH_SIZE)

In [11]:
# Sequence classifier with one output per entry in class_keys, loaded from
# the same checkpoint as the tokenizer.
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=len(class_keys),
)
# Kept as the cell's last expression so the module summary is still displayed.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [12]:
# Fine-tuning hyperparameters: number of passes over the training loader and
# the optimizer's learning rate.
epochs, lr = 5, 2e-5

In [15]:
# Set up the optimizer. torch.optim.AdamW is used instead of the AdamW class
# exported by transformers, which that library deprecated in favour of the
# PyTorch implementation. Hyperparameters are unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=lr,
                              eps=1e-8,
                              weight_decay=0.01)

# Number of optimizer steps over the whole run (batches per epoch * epochs).
# NOTE(review): no learning-rate scheduler is created in the cells shown
# here, so this value is currently unused.
total_steps = len(train_dataloader) * epochs

In [16]:
import wandb

# Start a Weights & Biases run; the project and run name decide where it is
# filed, and the config records the hyperparameters alongside it.
run = wandb.init(
    project='pytorch-classification',
    name='run1',
    config=dict(learning_rate=lr, epochs=epochs),
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkkatsy[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [19]:
def get_eval(model, the_dataloader):
    """Compute average loss and accuracy of ``model`` over ``the_dataloader``.

    The model is switched to eval mode (and left in eval mode on return) and
    every batch is run through a forward pass with gradients disabled.

    NOTE(review): ``token_type_ids`` is passed as ``None``, matching the
    training loop, so the segment ids produced by the tokenizer are not used.
    If that is changed, change it in both places.

    Args:
        model: callable that, given input ids plus ``attention_mask`` and
            ``labels``, returns an object with ``.loss`` and ``.logits``.
        the_dataloader: iterable of dict batches holding at least the
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        ``(avg_loss, avg_acc)``: the mean of the per-batch losses, and the
        fraction of samples whose arg-max logit equals the label. Both are
        NaN when the dataloader yields no samples.
    """
    model.eval()

    total_eval_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = 0

    with torch.no_grad():
        for batch in tqdm(the_dataloader):
            # One transfer per tensor is enough; everything the model needs
            # is read straight from the moved batch.
            batch = {k: v.to(device) for k, v in batch.items()}
            b_labels = batch['labels']

            # Forward pass: supplying labels makes the model return a loss.
            outputs = model(batch['input_ids'],
                            token_type_ids=None,
                            attention_mask=batch['attention_mask'],
                            labels=b_labels)

            total_eval_loss += outputs.loss.item()

            # Count correct predictions on-device; only the scalar count is
            # copied back to Python.
            predictions = outputs.logits.argmax(dim=1)
            total_correct += (predictions == b_labels.reshape(-1)).sum().item()
            total_samples += b_labels.size(0)
            num_batches += 1

    # An empty loader would otherwise divide by zero.
    if num_batches == 0 or total_samples == 0:
        return float('nan'), float('nan')

    avg_acc = total_correct / total_samples
    avg_loss = total_eval_loss / num_batches

    return avg_loss, avg_acc

In [20]:
# Baseline ("epoch 0") metrics, measured before any parameter update.
print('Epoch 0: ')
train_loss, train_acc = get_eval(model, train_dataloader)
print(f"Train accuracy: {train_acc:.4f}, Train loss: {train_loss:.4f}")
val_loss, val_acc = get_eval(model, val_dataloader)
print(f"Validation accuracy: {val_acc:.4f}, Validation loss: {val_loss:.4f}")
# A single log call so the train and validation metrics of one epoch share
# one wandb step instead of landing on two consecutive steps.
run.log({"train_accuracy": train_acc, 'train_loss': train_loss,
         "val_accuracy": val_acc, 'val_loss': val_loss})
print("")

for epoch in range(epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    # Running totals for this epoch.
    total_train_loss = 0
    total_correct = 0
    total_samples = 0

    # get_eval() leaves the model in eval mode, so training mode has to be
    # re-enabled at the start of every epoch.
    model.train()

    for batch in tqdm(train_dataloader):
        # One transfer per tensor; the model inputs are read from the moved batch.
        batch = {k: v.to(device) for k, v in batch.items()}
        b_labels = batch['labels']

        # Clear the gradients left over from the previous step.
        model.zero_grad()

        # Forward pass. NOTE(review): token_type_ids is None here and in
        # get_eval, so the tokenizer's segment ids are not used; change both
        # together if that is revisited.
        outputs = model(batch['input_ids'],
                        token_type_ids=None,
                        attention_mask=batch['attention_mask'],
                        labels=b_labels)

        loss = outputs.loss

        # Running accuracy, computed on-device from detached logits so that
        # no autograd graph is kept alive for the bookkeeping.
        predictions = outputs.logits.detach().argmax(dim=1)
        total_correct += (predictions == b_labels.reshape(-1)).sum().item()
        total_samples += b_labels.size(0)

        total_train_loss += loss.item()

        # Backward pass, then one optimizer update.
        loss.backward()
        optimizer.step()

    train_acc = total_correct / total_samples
    train_loss = total_train_loss / len(train_dataloader)
    print(f"Train accuracy: {train_acc:.4f}, Train loss: {train_loss:.4f}")
    val_loss, val_acc = get_eval(model, val_dataloader)
    print(f"Validation accuracy: {val_acc:.4f}, Validation loss: {val_loss:.4f}")

    # One log call per epoch keeps all four metrics on the same wandb step.
    run.log({"train_accuracy": train_acc, 'train_loss': train_loss,
             "val_accuracy": val_acc, 'val_loss': val_loss})

Epoch 0: 


  0%|          | 0/1478 [00:00<?, ?it/s]

Train accuracy: 0.2001, Train loss: 1.6263


  0%|          | 0/98 [00:00<?, ?it/s]

Validation accuracy: 0.1974, Validation loss: 1.6425


Training...


  0%|          | 0/1478 [00:00<?, ?it/s]

Train accuracy: 0.5464, Train loss: 1.0607


  0%|          | 0/98 [00:00<?, ?it/s]

Validation accuracy: 0.2417, Validation loss: 1.5565

Training...


  0%|          | 0/1478 [00:00<?, ?it/s]

Train accuracy: 0.7638, Train loss: 0.5635


  0%|          | 0/98 [00:00<?, ?it/s]

Validation accuracy: 0.3753, Validation loss: 1.4614

Training...


  0%|          | 0/1478 [00:00<?, ?it/s]

Train accuracy: 0.8169, Train loss: 0.4380


  0%|          | 0/98 [00:00<?, ?it/s]

Validation accuracy: 0.4315, Validation loss: 1.4572

Training...


  0%|          | 0/1478 [00:00<?, ?it/s]

Train accuracy: 0.8539, Train loss: 0.3473


  0%|          | 0/98 [00:00<?, ?it/s]

Validation accuracy: 0.4383, Validation loss: 1.5134

Training...


  0%|          | 0/1478 [00:00<?, ?it/s]

Train accuracy: 0.8785, Train loss: 0.2918


  0%|          | 0/98 [00:00<?, ?it/s]

Validation accuracy: 0.4783, Validation loss: 1.4967


In [21]:
# Evaluate on the test split, print the result and record it on the run.
avg_test_loss, avg_test_accuracy = get_eval(model, test_dataloader)
print(f'Test Accuracy: {avg_test_accuracy:.4f}, Test Loss: {avg_test_loss:.4f}')
run.log({
    "test_accuracy": avg_test_accuracy,
    'test_loss': avg_test_loss,
})

  0%|          | 0/98 [00:00<?, ?it/s]

Test Accuracy: 0.4562, Test Loss: 1.5773


In [22]:
# Close the active Weights & Biases run; the run history and summary tables
# shown below are the output of this call.
wandb.finish()



VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_accuracy,▁
test_loss,▁
train_accuracy,▁▆▇▇█
train_loss,█▃▂▂▁
val_accuracy,▁▅▇▇█
val_loss,█▁▁▅▄

0,1
test_accuracy,0.45617
test_loss,1.57732
train_accuracy,0.87855
train_loss,431.30149
val_accuracy,0.4783
val_loss,1.49674
