In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import pandas as pd
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_dataset = pd.read_csv('data/dataset/train.csv')
test_dataset = pd.read_csv('data/dataset/test.csv')
val_dataset = pd.read_csv('data/dataset/val.csv')

In [3]:
num_class = len(train_dataset.Category.unique())

In [4]:
num_class

24

In [5]:

label_mapping = {'INFORMATION-TECHNOLOGY': 0,\
                 'ENGINEERING':1, \
                 'BUSINESS-DEVELOPMENT':2, \
                 'SALES':3,\
                 'HR':4, \
                 'FITNESS': 5 , \
                 'ARTS':6,\
                 'ADVOCATE':7,\
                 'CONSTRUCTION':8,\
                 'AVIATION':9,\
                 'FINANCE':10,\
                 'CHEF':11,\
                 'ACCOUNTANT':12,\
                 'BANKING':13,\
                 'HEALTHCARE':14,\
                 'CONSULTANT':15,\
                 'PUBLIC-RELATIONS':16,\
                 'DESIGNER':17, \
                 'TEACHER':18, \
                 'APPAREL':19, \
                 'DIGITAL-MEDIA':20,\
                 'AGRICULTURE':21, \
                 'AUTOMOBILE':22,\
                 'BPO':23
                 }

In [6]:

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = label_mapping[self.labels[idx]]  # Map string label to numerical label
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label)  # Convert numerical label to tensor
        }


In [7]:
# Hyperparameters
batch_size = 16
num_epochs = 10
learning_rate = 2e-5
max_length = 514


In [8]:
# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_class)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
class_counts = train_dataset.Category.value_counts().to_list()
total_samples = sum(class_counts)
class_weights = [total_samples / count for count in class_counts]
class_weights = torch.tensor(class_weights, dtype=torch.float)


In [10]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights)  # Specify class weights for imbalanced data
optimizer = AdamW(model.parameters(), lr=learning_rate)



In [11]:
train_texts = train_dataset['Resume_clean'].values
train_labels = train_dataset['Category'].values

val_texts = val_dataset['Resume_clean'].values
val_labels = val_dataset['Category'].values

# train loader
train_data = CustomDataset(train_texts, train_labels, tokenizer, max_length)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# val loader
val_data = CustomDataset(val_texts, val_labels, tokenizer, max_length)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)


In [12]:
train_texts = train_dataset['Resume_clean'].values
train_texts

array(['director marketing communications core omplishments initiated student leadership development program student leadership development program resulting graduates position decision making responsibility cahs student leadership apprentice program big boom business planning institute host outlook agriculture weekly radio show kpvu fm sirius xm hbcu channel experience current director marketing communications company name city state develop strategies based knowledge policy nature market trend projections facilitate growth well aid resiliency planning college triad encompassing cooperative extension program cooperative agricultural research center academics manage college internal external communications systems conjunction university public relations press marketing communications units public private sector assess translate materials print online social media create systems procedures maintain manage contact lists associates prospective associates implement event registration proce

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [21]:
device = torch.device("cuda")  # Use GPU 0

In [22]:
device

device(type='cuda')

In [23]:
train_dataset.Category.value_counts()

Category
INFORMATION-TECHNOLOGY    136
ENGINEERING               130
BUSINESS-DEVELOPMENT      128
SALES                     125
HR                        123
FITNESS                   117
ARTS                      111
ADVOCATE                  110
CONSTRUCTION              109
AVIATION                   94
FINANCE                    94
CHEF                       94
ACCOUNTANT                 94
BANKING                    92
HEALTHCARE                 92
CONSULTANT                 92
PUBLIC-RELATIONS           89
DESIGNER                   86
TEACHER                    82
APPAREL                    78
DIGITAL-MEDIA              77
AGRICULTURE                50
AUTOMOBILE                 29
BPO                        18
Name: count, dtype: int64

In [24]:

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    model.eval()
    with torch.no_grad():
        all_predictions = []
        all_labels = []

        total_loss = 0.0
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            predictions = outputs.logits.argmax(dim=1).cpu().numpy()
            all_predictions.extend(predictions)
            all_labels.extend(labels.cpu().numpy())

        accuracy = accuracy_score(all_labels, all_predictions)
        average_loss = total_loss / len(val_loader)

        print(f"Epoch [{epoch+1}/{num_epochs}] - "
              f"Accuracy: {accuracy:.4f}, Loss: {average_loss:.4f}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model.state_dict()
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= 5:
                print(f"Early stopping at epoch {epoch+1}")
                break

if best_model is not None:
    torch.save(best_model, "best_model.pth")
# This code will train the model, print accuracy and loss per epoch based on the test data, and implement early stopping when the accuracy doesn't improve for a specified number of epochs. The best model based on validation accuracy will be saved to "best_model.pth" at the end. Make sure to replace placeholders like train_texts, train_labels, test_texts, test_labels, and adjust the num_classes and class_weights based on your problem.







RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
