In [None]:
# NOTE(review): versions are unpinned, so a fresh run may pull different releases.
# Consider `%pip install -q transformers==X accelerate==Y` (the %pip magic targets the kernel's env).
!pip install -q transformers accelerate

Table of Contents:
1. Setup Environment
2. Imports
3. Constants
4. Read Data
5. EDA
6. Dataset and Dataloaders
7. Model
8. Training loop
9. Validation
10. Inference
11. Submission Generation

In [None]:
import os
import copy
import time
from datetime import timedelta
import random
import string
import wandb
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import AutoTokenizer, DataCollatorWithPadding, get_scheduler, \
    AutoModelForSequenceClassification

# Notebook-wide colour palette (hex codes): preview the swatches once, then
# register it as seaborn's default palette for every later plot.
colors = ['#9B5DE5', '#F15BB5', '#FEE440', '#00BBF9', '#00F5D4']
sns.palplot(sns.color_palette(colors))
sns.set_palette(colors)

class color:
    '''ANSI escape sequences for highlighting text printed to the console.

    S -- start of the highlight (bold, then bright yellow).
    E -- end of the highlight (resets all attributes).
    '''
    E = '\033[0m'
    S = '\033[1m\033[93m'

# Silence UserWarning-category warnings for the rest of the notebook to keep
# cell output readable; other warning categories are still shown.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Single source of truth for paths and hyper-parameters used by later cells.
config = {
    # data paths (resolved to absolute paths relative to the working directory)
    'TRAIN_DIR': os.path.abspath('../input/feedback-prize-effectiveness/train'),
    'TEST_DIR': os.path.abspath('../input/feedback-prize-effectiveness/test'),
    'TRAIN_DATA_PATH': os.path.abspath('../input/feedback-prize-effectiveness/train.csv'),
    'TEST_DATA_PATH': os.path.abspath('../input/feedback-prize-effectiveness/test.csv'),
    # Correctly spelled key; the misspelled one below is kept so existing
    # lookups of 'SAMPLE_SUMISSION_PATH' keep working.
    'SAMPLE_SUBMISSION_PATH': os.path.abspath('../input/feedback-prize-effectiveness/sample_submission.csv'),
    'SAMPLE_SUMISSION_PATH': os.path.abspath('../input/feedback-prize-effectiveness/sample_submission.csv'),

    # pre-processing / model
    'checkpoint': 'microsoft/deberta-v3-base',
    'max_length': 512,   # tokenizer truncation limit, in tokens
    'num_classes': 3,
    'device': torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),

    # training
    'batch_size': 8,
    'seed': 42,
    'num_epochs': 5,

    # feature switches
    'use_kfold': False,
    'use_wandb': False,
    # Random 12-character run identifier. It is drawn from the OS entropy source,
    # so it is NOT affected by any seed and differs on every run.
    'hash_name': "".join(random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(12))
}

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU and all
    CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    '''
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not only the current one; safe to call without CUDA.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. The running interpreter read this
    # variable at start-up, so this only influences processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Apply the notebook-wide seed from `config` once, before any data is read or split.
set_seed(config['seed'])


In [None]:
# Optional Weights & Biases login. The API key is read from a Kaggle secret
# named "WANDB_API_KEY" rather than being hard-coded in the notebook.
if config['use_wandb']:
    # Imported here because it is only needed when W&B logging is enabled.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("WANDB_API_KEY")
    wandb.login(key=api_key)

In [None]:
# Load the train and test CSVs named in `config`, then preview the first rows of each.
df, test_df = (
    pd.read_csv(config[path_key])
    for path_key in ('TRAIN_DATA_PATH', 'TEST_DATA_PATH')
)
display(df.head(), test_df.head())

In [None]:
# Build integer encodings (and their inverses) for the target column and for
# the discourse type. Ids follow the order in which values first appear in `df`.
# NOTE: the columns are overwritten below, so re-running this cell would rebuild
# the maps from the already-encoded integers.
label_to_id = {
    label: idx
    for idx, label in enumerate(df['discourse_effectiveness'].unique())
}
id_to_label = {idx: label for label, idx in label_to_id.items()}

type_to_id = {
    kind: idx
    for idx, kind in enumerate(df['discourse_type'].unique())
}
id_to_type = {idx: kind for kind, idx in type_to_id.items()}

print(label_to_id, type_to_id, sep='\n')

# Encode the string columns in place; the test set reuses the training type map.
df['discourse_effectiveness'] = df['discourse_effectiveness'].replace(label_to_id)
for frame in (df, test_df):
    frame['discourse_type'] = frame['discourse_type'].replace(type_to_id)

display(df.head(10))
display(test_df.head())

In [None]:
# Class balance of the target. value_counts() orders its rows by frequency, not
# by id, so each bar's name is looked up from the id that was actually counted;
# pairing the counts with id_to_label.values() would mislabel the bars.
type_distribution = df['discourse_effectiveness'].value_counts()
sns.barplot(
    x=type_distribution.values,
    y=[id_to_label[label_id] for label_id in type_distribution.index],
    palette=colors, saturation=0.7
)
plt.title('Discourse Effectiveness Distribution')
plt.xlabel('Frequency')
plt.ylabel('Discourse Effectiveness')
plt.show()

In [None]:
# Frequency of each discourse type. value_counts() orders its rows by frequency,
# not by id, so tick labels are resolved from the counted ids to keep every bar
# attached to the right type name.
discourse_type_distribution = df['discourse_type'].value_counts()
sns.barplot(
    x=discourse_type_distribution.values,
    y=[id_to_type[type_id] for type_id in discourse_type_distribution.index],
    palette=colors, saturation=0.7
)
plt.title('Discourse Type Distribution')
plt.xlabel('Frequency')
plt.ylabel('Discourse Type')
plt.show()

In [None]:
class FeedbackDataset(Dataset):
    '''Tokenised `discourse_text` rows, optionally paired with effectiveness labels.

    All texts are tokenised once in the constructor; items are then served by
    positional index.

    Args:
        df: DataFrame with a `discourse_text` column and, unless `is_test` is
            True, an integer-encoded `discourse_effectiveness` column.
        tokenizer: callable applied to the list of texts; its result must be
            indexable by "input_ids" and "attention_mask".
        is_test: when True no labels are read or returned.
        max_length: truncation length passed to the tokenizer. Defaults to
            `config['max_length']` when omitted.
    '''

    def __init__(self, df, tokenizer, is_test=False, max_length=None):
        self.df = df
        self.is_test = is_test
        if max_length is None:
            max_length = config['max_length']
        # NOTE(review): padding=True pads every row to the longest text in `df`;
        # dropping it in favour of per-batch padding in the collator would save
        # compute, but would change the item type from tensors to lists.
        self.preprocessed_text = tokenizer(
            df['discourse_text'].values.tolist(),
            padding=True,
            truncation=True,
            add_special_tokens=True,
            max_length=max_length,
            return_tensors='pt'
        )
        # Build the label tensor once instead of on every __getitem__ call.
        self.labels = None
        if not is_test:
            self.labels = torch.tensor(df['discourse_effectiveness'].to_numpy(dtype='int64'))

    def __len__(self):
        # Number of rows. len() of the tokenizer output would count its fields
        # (input_ids, attention_mask, ...) rather than the examples.
        return len(self.df)

    def __getitem__(self, idx):
        item = {
            "attention_mask": self.preprocessed_text["attention_mask"][idx],
            "input_ids": self.preprocessed_text["input_ids"][idx],
        }
        if not self.is_test:
            item["labels"] = self.labels[idx]
        return item

In [None]:
# for batch in dataloader:
#     break
# {k: v.shape for k, v in batch.items()}

In [None]:
# The classification head predicts discourse *effectiveness*, so its width must
# equal the number of effectiveness labels (id_to_label), not the number of
# discourse types (id_to_type).
model = AutoModelForSequenceClassification.from_pretrained(
    config['checkpoint'],
    num_labels=len(id_to_label),
).to(config['device'])

# The former `model(**batch)` smoke test was removed: no `batch` is defined at
# this point on a fresh kernel, so the cell raised NameError on Run All.

In [None]:
def step(model, dataset, dataloader, criterion, optimizer, scheduler=None, mode='train'):
    '''Run one full pass over `dataloader` and report mean loss and accuracy.

    Args:
        model: network called with `input_ids` / `attention_mask`; its output
            must expose `.logits`.
        dataset: unused; kept so existing calls keep working.
        dataloader: yields dicts with 'input_ids', 'attention_mask' and 'labels'.
        criterion: loss taking (logits, targets). Assumed to return the mean
            over the batch (the `nn.CrossEntropyLoss` default).
        optimizer: stepped after every batch when `mode == 'train'`.
        scheduler: optional LR scheduler, stepped after every optimizer step.
        mode: 'train' to update weights; any other value only evaluates.

    Returns:
        (loss, accuracy): per-sample mean loss and accuracy in percent.

    Raises:
        ValueError: if the dataloader yields no samples.
    '''
    is_train = mode == 'train'
    model.train(is_train)  # train(False) is equivalent to eval()

    running_loss = 0.0
    running_correct = 0
    total = 0

    for data in tqdm(dataloader, total=len(dataloader)):
        input_ids = data['input_ids'].to(config['device'])
        attention_masks = data['attention_mask'].to(config['device'])
        targets = data['labels'].to(config['device'])
        batch_size = targets.size(0)

        # Track gradients only while training; evaluation needs no autograd graph.
        with torch.set_grad_enabled(is_train):
            outputs = model(input_ids=input_ids, attention_mask=attention_masks)
            loss = criterion(outputs.logits, targets)

        # adjust parameters based on the calculated gradients
        if is_train:
            optimizer.zero_grad()
            # Never back-propagate in evaluation: leftover validation gradients
            # would be added to the next training step.
            loss.backward()
            optimizer.step()

            if scheduler is not None:
                scheduler.step()

        # Weight each batch mean by its size so a smaller final batch is
        # averaged correctly and the result is a true per-sample mean.
        running_loss += loss.item() * batch_size
        preds = outputs.logits.argmax(dim=1)
        running_correct += (preds == targets).sum().item()
        total += batch_size

    if total == 0:
        raise ValueError('step() received a dataloader that yielded no samples')

    loss = running_loss / total
    accuracy = 100. * running_correct / total

    return loss, accuracy

In [None]:
# Start a Weights & Biases run only when logging is enabled. `run` is therefore
# undefined when config['use_wandb'] is False.
if config['use_wandb']:
    run = wandb.init(
        project='Feedback Effectiveness Competition',
        entity="raghavprabhakar",  # NOTE(review): hard-coded account; other users must change this
        config=config,
        job_type='Train'
    )

In [None]:
# Hold-out split. K-fold is not implemented in this notebook, so fail with a
# clear message instead of a NameError on `train_df` further down.
if config['use_kfold']:
    raise NotImplementedError(
        "k-fold training is not implemented; set config['use_kfold'] = False"
    )

train_df, valid_df = train_test_split(df, test_size=0.2, random_state=config['seed'])
print(train_df.shape)
print(valid_df.shape)

tokenizer = AutoTokenizer.from_pretrained(config['checkpoint'])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataset = FeedbackDataset(train_df, tokenizer)
valid_dataset = FeedbackDataset(valid_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, collate_fn=data_collator)
# Evaluation does not depend on sample order, so the validation loader is left
# unshuffled to keep runs comparable.
valid_dataloader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, collate_fn=data_collator)

criterion = nn.CrossEntropyLoss()

# The scheduler is stepped once per batch, so it needs the total batch count.
num_training_steps = config['num_epochs'] * len(train_dataloader)

# NOTE(review): lr=3e-4 is high for fine-tuning a pretrained transformer; values
# around 1e-5 to 5e-5 are more common. Left unchanged here - confirm it is intentional.
optimizer = AdamW(model.parameters(), lr=3e-4)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Per-epoch metrics, appended to once per epoch.
history = {
    'train_loss': [],
    'train_acc': [],
    'valid_loss': [],
    'valid_acc': []
}

training_start_time = time.time()
best_model_weights = copy.deepcopy(model.state_dict())
best_epoch_loss = np.inf

# Register the model with W&B once, not on every epoch.
if config['use_wandb']:
    wandb.watch(model)

for epoch in range(config['num_epochs']):
    train_epoch_loss, train_epoch_acc = step(model, train_dataset, train_dataloader, criterion, optimizer, lr_scheduler, mode='train')
    valid_epoch_loss, valid_epoch_acc = step(model, valid_dataset, valid_dataloader, criterion, optimizer, lr_scheduler, mode='valid')

    print('Train Loss: {:.5f} | Train Accuracy: {:.5f}'.format(train_epoch_loss, train_epoch_acc))
    print('Valid Loss: {:.5f} | Valid Accuracy: {:.5f}'.format(valid_epoch_loss, valid_epoch_acc))

    history['train_loss'].append(train_epoch_loss)
    history['train_acc'].append(train_epoch_acc)
    history['valid_loss'].append(valid_epoch_loss)
    history['valid_acc'].append(valid_epoch_acc)

    if config['use_wandb']:
        wandb.log({
            "Train Loss": train_epoch_loss,
            "Valid Loss": valid_epoch_loss,
            "Train Accuracy": train_epoch_acc,
            "Valid Accuracy": valid_epoch_acc
        })

    # Checkpoint whenever the validation loss matches or beats the best so far.
    if valid_epoch_loss <= best_epoch_loss:
        print(f"Validation Loss Improved ({best_epoch_loss} ---> {valid_epoch_loss})")
        best_epoch_loss = valid_epoch_loss
        # `run` only exists when W&B logging is enabled.
        if config['use_wandb']:
            run.summary["Best Loss"] = best_epoch_loss
        best_model_weights = copy.deepcopy(model.state_dict())
        best_model_wts = best_model_weights  # old name kept for any later cell that uses it
        PATH = f"deberta-epoch-{epoch}.pt"
        torch.save(model.state_dict(), PATH)
        print(f"Model Saved\n")

training_end_time = time.time()
training_duration = training_end_time - training_start_time

# timedelta's first positional argument is days, so seconds must be passed by keyword.
print(f'Training complete in {timedelta(seconds=training_duration)}')
print("Best Loss: {:.4f}".format(best_epoch_loss))

if config['use_wandb']: run.finish()