In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, classification_report

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load data
data_df = pd.read_csv('/home/da23c014/ISEC/dataset/combined_data.csv')

# Basic preprocessing
data_df['cleaned_title'] = data_df['cleaned_title'].fillna('')
data_df['cleaned_description'] = data_df['cleaned_description'].fillna('')
data_df['Component'] = data_df['Component'].astype(str)
data_df['Status'] = data_df['Status'].astype(str)
data_df['Resolution'] = data_df['Resolution'].astype(str)

# Separate train/valid based on Usage
train_df = data_df[data_df['Usage'] == 'Train'].copy()
valid_df = data_df[data_df['Usage'] == 'Valid'].copy()

print("Train size:", len(train_df), "Validation size:", len(valid_df))

# Class distribution
print("Class Distribution (Train):")
print(train_df['Priority'].value_counts())


Using device: cuda
Train size: 42395 Validation size: 10599
Class Distribution (Train):
Priority
2.0    24923
1.0     7530
0.0     7154
3.0     1830
4.0      958
Name: count, dtype: int64


In [2]:
# Prepare tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large', do_lower_case=True)
max_len = 512

def combine_text(row):
    # We focus on text: use title + [SEP] + description
    return row['cleaned_title'] + " [SEP] " + row['cleaned_description']

train_df['text_input'] = train_df.apply(combine_text, axis=1)
valid_df['text_input'] = valid_df.apply(combine_text, axis=1)

# Tokenize
def encode_texts(texts):
    input_ids = []
    attention_masks = []
    for sent in texts:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

train_input_ids, train_attention_masks = encode_texts(train_df['text_input'].tolist())
valid_input_ids, valid_attention_masks = encode_texts(valid_df['text_input'].tolist())

# Labels
train_labels = torch.tensor(train_df['Priority'].values, dtype=torch.long)
valid_labels = torch.tensor(valid_df['Priority'].values, dtype=torch.long)

# Categorical features
components = list(data_df['Component'].unique())
status_vals = list(data_df['Status'].unique())
resolution_vals = list(data_df['Resolution'].unique())

component2idx = {c: i for i, c in enumerate(components)}
status2idx = {s: i for i, s in enumerate(status_vals)}
resolution2idx = {r: i for i, r in enumerate(resolution_vals)}

def encode_cat(df):
    cat_component = df['Component'].map(component2idx).values
    cat_status = df['Status'].map(status2idx).values
    cat_resolution = df['Resolution'].map(resolution2idx).values
    return torch.tensor(np.stack([cat_component, cat_status, cat_resolution], axis=1), dtype=torch.long)

train_cat_feats = encode_cat(train_df)
valid_cat_feats = encode_cat(valid_df)

# Numeric features
# Using word_count and description_word_count
train_num = train_df[['word_count', 'description_word_count']].fillna(0).values.astype(np.float32)
valid_num = valid_df[['word_count', 'description_word_count']].fillna(0).values.astype(np.float32)

# Normalize numeric features based on train stats
num_mean = train_num.mean(axis=0)
num_std = train_num.std(axis=0) + 1e-8
train_num = (train_num - num_mean) / num_std
valid_num = (valid_num - num_mean) / num_std

train_num_feats = torch.tensor(train_num, dtype=torch.float32)
valid_num_feats = torch.tensor(valid_num, dtype=torch.float32)

# Create Datasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_cat_feats, train_num_feats, train_labels)
valid_dataset = TensorDataset(valid_input_ids, valid_attention_masks, valid_cat_feats, valid_num_feats, valid_labels)


In [3]:
# Class distribution (train) for weighting
# Frequencies provided:
# 2:31154, 1:9412, 0:8943, 3:2287, 4:1198
freqs = {0:8943, 1:9412, 2:31154, 3:2287, 4:1198}
# Let's do inverse frequency normalized:
class_weights = []
for c in range(5):
    class_weights.append(1.0 / freqs[c])
class_weights = np.array(class_weights)
class_weights = class_weights / class_weights.sum()  # normalize
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

import torch.nn as nn
from transformers import RobertaModel

class MultiModalRoberta(nn.Module):
    def __init__(self, 
                 roberta_model_name='roberta-large', 
                 num_labels=5, 
                 cat_vocab_sizes=[len(components), len(status_vals), len(resolution_vals)],
                 cat_emb_dim=32,
                 num_numeric=2, 
                 hidden_size=1024, # roberta-large hidden size
                 dropout_prob=0.1):
        super(MultiModalRoberta, self).__init__()
        
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        
        # Embeddings for categorical features
        self.cat_embeddings = nn.ModuleList([nn.Embedding(vocab_size, cat_emb_dim) for vocab_size in cat_vocab_sizes])
        cat_total_dim = cat_emb_dim * len(cat_vocab_sizes)
        
        # A small MLP for numeric features
        self.num_mlp = nn.Sequential(
            nn.Linear(num_numeric, 32),
            nn.ReLU(),
            nn.Dropout(dropout_prob)
        )
        
        # Final classifier layer
        # Input: CLS (1024) + cat_embs (cat_total_dim) + num_embs (32)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_size + cat_total_dim + 32, num_labels)
        )
        
    def forward(self, input_ids, attention_mask, cat_feats, num_feats):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:,0,:]  # [batch, hidden_size]
        
        cat_embs = []
        for i, emb_layer in enumerate(self.cat_embeddings):
            cat_embs.append(emb_layer(cat_feats[:,i]))
        cat_embs = torch.cat(cat_embs, dim=1) # [batch, cat_total_dim]
        
        num_embs = self.num_mlp(num_feats) # [batch, 32]
        
        combined = torch.cat([cls_emb, cat_embs, num_embs], dim=1)
        logits = self.classifier(combined)
        return logits

class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction
        
    def forward(self, logits, targets):
        logits = logits.float()
        targets = targets.long()
        
        ce_loss = F.cross_entropy(logits, targets, reduction='none', weight=self.alpha)
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt)**self.gamma * ce_loss
        
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss


In [None]:
batch_size = 20

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    valid_dataset,
    sampler=SequentialSampler(valid_dataset),
    batch_size=batch_size
)

model = MultiModalRoberta()
model.to(device)

# Define focal loss with class weights
loss_fn = FocalLoss(gamma=2.0, alpha=class_weights, reduction='mean')

# Stage 1 Hyperparams
epochs_stage1 = 2
learning_rate_stage1 = 2e-5

# Only roberta + classifier trainable, freeze cat and num
for param in model.cat_embeddings.parameters():
    param.requires_grad = False
for param in model.num_mlp.parameters():
    param.requires_grad = False

optimizer = AdamW(model.parameters(), lr=learning_rate_stage1, eps=1e-8)
total_steps_stage1 = len(train_dataloader) * epochs_stage1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps_stage1)

# Early stopping
patience = 3
best_val_loss = float('inf')
patience_counter = 0

def evaluate(model, dataloader):
    model.eval()
    preds, true_labels = [], []
    eval_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_cat, b_num, b_labels = tuple(t.to(device) for t in batch)
            logits = model(b_input_ids, b_input_mask, b_cat, b_num)
            loss = loss_fn(logits, b_labels)
            eval_loss += loss.item()
            preds.append(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.append(b_labels.cpu().numpy())
    avg_val_loss = eval_loss / len(dataloader)
    preds = np.concatenate(preds)
    true_labels = np.concatenate(true_labels)
    val_f1 = f1_score(true_labels, preds, average='macro')
    return avg_val_loss, val_f1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model.load_state_dict(torch.load('best_model_stage1.pt'))
model.to(device)

  model.load_state_dict(torch.load('best_model_stage1.pt'))


MultiModalRoberta(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
            

In [None]:
print("***** STAGE 1 TRAINING *****")

for epoch_i in range(epochs_stage1):
    model.train()
    total_train_loss = 0
    
    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Stage 1 Epoch {epoch_i+1}/{epochs_stage1}")):
        b_input_ids, b_input_mask, b_cat, b_num, b_labels = tuple(t.to(device) for t in batch)
        
        optimizer.zero_grad()
        logits = model(b_input_ids, b_input_mask, b_cat, b_num)
        loss = loss_fn(logits, b_labels)
        total_train_loss += loss.item()
        
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    val_loss, val_f1 = evaluate(model, validation_dataloader)
    print(f"Stage 1 Epoch {epoch_i+1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")
    
    # Early stopping based on val loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model_stage1.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping during Stage 1.")
            break
            
# Load best model from stage 1
model.load_state_dict(torch.load('best_model_stage1.pt'))


***** STAGE 1 TRAINING *****


Stage 1 Epoch 1/2:   0%|          | 0/1928 [00:00<?, ?it/s]

In [None]:
# Freeze RoBERTa now
for param in model.roberta.parameters():
    param.requires_grad = False

# Unfreeze cat and numeric layers
for param in model.cat_embeddings.parameters():
    param.requires_grad = True
for param in model.num_mlp.parameters():
    param.requires_grad = True

# You can also keep the classifier head trainable.
# New optimizer & scheduler for stage 2
epochs_stage2 = 4
learning_rate_stage2 = 3e-4

optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate_stage2, eps=1e-8)
total_steps_stage2 = len(train_dataloader) * epochs_stage2
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps_stage2)

best_val_loss_stage2 = float('inf')
patience_counter = 0

print("***** STAGE 2 TRAINING *****")
for epoch_i in range(epochs_stage2):
    model.train()
    total_train_loss = 0
    
    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Stage 2 Epoch {epoch_i+1}/{epochs_stage2}")):
        b_input_ids, b_input_mask, b_cat, b_num, b_labels = tuple(t.to(device) for t in batch)
        
        optimizer.zero_grad()
        logits = model(b_input_ids, b_input_mask, b_cat, b_num)
        loss = loss_fn(logits, b_labels)
        total_train_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    val_loss, val_f1 = evaluate(model, validation_dataloader)
    print(f"Stage 2 Epoch {epoch_i+1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")
    
    # Early stopping based on val loss
    if val_loss < best_val_loss_stage2:
        best_val_loss_stage2 = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model_final.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping during Stage 2.")
            break

# Load best model from stage 2
model.load_state_dict(torch.load('best_model_final.pt'))


In [None]:
model.eval()
preds, true = [], []
with torch.no_grad():
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_cat, b_num, b_labels = tuple(t.to(device) for t in batch)
        logits = model(b_input_ids, b_input_mask, b_cat, b_num)
        p = torch.argmax(logits, dim=1).cpu().numpy()
        t = b_labels.cpu().numpy()
        preds.append(p)
        true.append(t)

preds = np.concatenate(preds)
true = np.concatenate(true)
print("Final Classification Report (Validation):")
print(classification_report(true, preds))


In [None]:
# Extract test data
test_df = data_df[data_df['Usage'] == 'Test'].copy()
test_df['cleaned_title'] = test_df['cleaned_title'].fillna('')
test_df['cleaned_description'] = test_df['cleaned_description'].fillna('')
test_df['Component'] = test_df['Component'].astype(str)
test_df['Status'] = test_df['Status'].astype(str)
test_df['Resolution'] = test_df['Resolution'].astype(str)

# Combine text
test_df['text_input'] = test_df.apply(lambda row: row['cleaned_title'] + " [SEP] " + row['cleaned_description'], axis=1)

# Tokenize test text
test_input_ids, test_attention_masks = [], []
for sent in test_df['text_input']:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])

test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

# Categorical features
test_cat_component = test_df['Component'].map(component2idx).values
test_cat_status = test_df['Status'].map(status2idx).values
test_cat_resolution = test_df['Resolution'].map(resolution2idx).values
test_cat_feats = torch.tensor(np.stack([test_cat_component, test_cat_status, test_cat_resolution], axis=1), dtype=torch.long)

# Numeric features
test_num = test_df[['word_count', 'description_word_count']].fillna(0).values.astype(np.float32)
# Normalize with same mean and std from training
test_num = (test_num - num_mean) / num_std
test_num_feats = torch.tensor(test_num, dtype=torch.float32)

# Create Test Dataset and DataLoader
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_cat_feats, test_num_feats)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Load best model
model.load_state_dict(torch.load('best_model_final.pt'))
model.eval()

all_preds = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Predicting on Test"):
        b_input_ids, b_input_mask, b_cat, b_num = tuple(t.to(device) for t in batch)
        logits = model(b_input_ids, b_input_mask, b_cat, b_num)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.append(preds)

all_preds = np.concatenate(all_preds)

# Create submission file
submission_df = test_df[['Issue_id']].copy()
submission_df['Priority'] = all_preds
submission_df.to_csv('submission_roberta.csv', index=False)
print("Submission file 'submission.csv' created successfully!")


In [4]:
!pip install xformers

Collecting xformers
  Downloading xformers-0.0.28.post3-cp39-cp39-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting torch==2.5.1 (from xformers)
  Using cached torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->xformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->xformers)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->xformers)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.1->xformers)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.1->xformers)
  Using cached nv

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("dunzhang/stella_en_400M_v5", trust_remote_code=True)

sentences = [
    "That is a happy person",
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [4, 4]

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

torch.Size([4, 4])


In [None]:
embeddings.shape

(4, 1024)

: 