In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.optim.lr_scheduler import StepLR
import pandas as pd
import json
import time
import os

# Load dataset from jsonl files
def load_jsonl_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return pd.DataFrame(data)

# Load datasets
train_data = load_jsonl_data('/kaggle/input/mgt-classification/en_train.jsonl')
dev_data = load_jsonl_data('/kaggle/input/mgt-classification/en_dev.jsonl')

# Combine train and dev data
df = pd.concat([train_data, dev_data])

# Select the first 30,000 samples for training
df = df.sample(n=30000, random_state=42)

# Prepare Dataset class for PyTorch
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',  
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Custom collate function to dynamically pad the batches
def custom_collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    labels = torch.stack([item['labels'] for item in batch])
    
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

# Split the first 30,000 samples into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].values, df['label'].values, test_size=0.2, random_state=42
)

# Set up tokenizer and model - Using DistilBERT for faster performance
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Set dataset parameters
MAX_LEN = 128  # Reduced max length for faster training
BATCH_SIZE = 8  # Smaller batch size
accumulation_steps = 4  # Gradient accumulation

# Prepare Dataloaders
train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)

# Class imbalance handling (optional)
class_weights = torch.tensor([0.7, 1.3]).to(model.device)

# Training function
def train_model(model, train_loader, val_loader, epochs=1, learning_rate=2e-5, weight_decay=0.01):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = StepLR(optimizer, step_size=2, gamma=0.1)  # Learning rate scheduler

    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

    best_f1 = 0
    best_model = None

    for epoch in range(epochs):
        model.train()
        total_loss_train = 0
        start_time = time.time()

        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0] / accumulation_steps  # Scale loss for gradient accumulation

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            total_loss_train += loss.item()

        scheduler.step()

        epoch_time = time.time() - start_time
        print(f'Epoch {epoch+1} completed in {epoch_time // 60:.0f} minutes {epoch_time % 60:.0f} seconds.')
        print(f'Epoch {epoch+1}, Train loss: {total_loss_train/len(train_loader)}')

        # Validation phase
        model.eval()
        predictions, true_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs[0]
                preds = torch.argmax(logits, dim=1)

                predictions.extend(preds.cpu().numpy())
                true_labels.extend(labels.cpu().numpy())

        val_acc = accuracy_score(true_labels, predictions)
        val_f1 = f1_score(true_labels, predictions)
        print(f'Epoch {epoch+1}, Validation Accuracy: {val_acc}, F1 Score: {val_f1}')

        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model = model.state_dict()
            print(f"Best F1 Score improved to {best_f1}, saving model...")

    # Load best model weights
    model.load_state_dict(best_model)
    return model

# Train the model on the first 30k samples
model = train_model(model, train_loader, val_loader, epochs=1)

# Ensure the save directory exists
save_directory = 'distilbert_model_phase_1'
os.makedirs(save_directory, exist_ok=True)

# Save the trained model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print("Model saved after training on the first 30k samples.")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…


Epoch 1 completed in 149 minutes 51 seconds.
Epoch 1, Train loss: 0.12503045760001988
Epoch 1, Validation Accuracy: 0.8115, F1 Score: 0.8563809523809525
Best F1 Score improved to 0.8563809523809525, saving model...
Model saved after training on the first 30k samples.
