In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup

In [9]:
def labeling(label):
    return label_mapping[label]

In [10]:

label_mapping = {'INFORMATION-TECHNOLOGY': 0,\
                 'ENGINEERING':1, \
                 'BUSINESS-DEVELOPMENT':2, \
                 'SALES':3,\
                 'HR':4, \
                 'FITNESS': 5 , \
                 'ARTS':6,\
                 'ADVOCATE':7,\
                 'CONSTRUCTION':8,\
                 'AVIATION':9,\
                 'FINANCE':10,\
                 'CHEF':11,\
                 'ACCOUNTANT':12,\
                 'BANKING':13,\
                 'HEALTHCARE':14,\
                 'CONSULTANT':15,\
                 'PUBLIC-RELATIONS':16,\
                 'DESIGNER':17, \
                 'TEACHER':18, \
                 'APPAREL':19, \
                 'DIGITAL-MEDIA':20,\
                 'AGRICULTURE':21, \
                 'AUTOMOBILE':22,\
                 'BPO':23
                 }

In [11]:
train_data = pd.read_csv("data/dataset/train.csv")
val_data = pd.read_csv("data/dataset/val.csv")
test_data = pd.read_csv("data/dataset/test.csv")


train_data.dropna(inplace= True)
val_data.dropna(inplace= True)
test_data.dropna(inplace= True)

train_data["Category"] = train_data["Category"].apply(labeling)
test_data["Category"] = test_data["Category"].apply(labeling)
val_data["Category"] = val_data["Category"].apply(labeling) 

In [12]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
def generate_dataloader(df, tokenizer, max_length , batch_size):
    ids = np.zeros((len(df), max_length))
    masks = np.zeros((len(df), max_length))
    Y_labels = df['Category'].values

    for i, text in tqdm(enumerate(df['Resume_clean'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='pt'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    
    X_ids =  torch.tensor(ids, dtype=torch.long)
    X_masks = torch.tensor(masks, dtype=torch.long)
    Y_labels = torch.tensor(Y_labels, dtype=torch.long)
    dataset = TensorDataset(X_ids, X_masks, Y_labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

In [14]:
train_loader = generate_dataloader(train_data, tokenizer,max_length =256,batch_size = 64)
val_loader = generate_dataloader(val_data, tokenizer,max_length =256,batch_size = 64)


0it [00:00, ?it/s]

2249it [00:08, 265.97it/s]
281it [00:01, 271.13it/s]


In [15]:
# # Define the model architecture
# class TextModel(nn.Module):
#     def __init__(self, num_classes):
#         super(TextModel, self).__init__()
#         self.bert = model
#         self.intermediate_layer = nn.Linear(768, 512)
#         self.output_layer = nn.Linear(512, num_classes)
        
#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids, attention_mask=attention_mask)[1]
#         intermediate = self.intermediate_layer(outputs)
#         logits = self.output_layer(intermediate)
#         return logits

In [16]:
# # Define the model architecture with dropout and L2 regularization
# class TextModel(nn.Module):
#     def __init__(self, num_classes, dropout_prob=0.3, l2_reg=1e-5):
#         super(TextModel, self).__init__()
#         self.bert = model
#         self.intermediate_layer = nn.Linear(768, 512)
#         self.dropout = nn.Dropout(dropout_prob)  # Dropout layer added
#         self.output_layer = nn.Linear(512, num_classes)
        
#         # L2 regularization added to linear layers
#         self.intermediate_layer.weight.data = nn.init.kaiming_normal_(self.intermediate_layer.weight.data)
#         self.intermediate_layer.bias.data.fill_(0)
#         self.output_layer.weight.data = nn.init.kaiming_normal_(self.output_layer.weight.data)
#         self.output_layer.bias.data.fill_(0)
        
#         self.l2_reg = l2_reg
    
#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids, attention_mask=attention_mask)[1]
#         intermediate = self.intermediate_layer(outputs)
#         intermediate = self.dropout(intermediate)  # Apply dropout
#         logits = self.output_layer(intermediate)
#         return logits

In [17]:
class TextModel(nn.Module):
    def __init__(self, num_classes, dropout_prob=0.3, l2_reg=1e-5, l1_reg=1e-5):
        super(TextModel, self).__init__()
        self.bert = model
        self.intermediate_layer = nn.Linear(768, 512)
        self.dropout = nn.Dropout(dropout_prob)  # Dropout layer added
        self.output_layer = nn.Linear(512, num_classes)
        
        # L2 regularization added to linear layers
        self.intermediate_layer.weight.data = nn.init.kaiming_normal_(self.intermediate_layer.weight.data)
        self.intermediate_layer.bias.data.fill_(0)
        self.output_layer.weight.data = nn.init.kaiming_normal_(self.output_layer.weight.data)
        self.output_layer.bias.data.fill_(0)
        
        self.l2_reg = l2_reg
        self.l1_reg = l1_reg
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)[1]
        intermediate = self.intermediate_layer(outputs)
        intermediate = self.dropout(intermediate)  # Apply dropout
        logits = self.output_layer(intermediate)
        return logits

    def l1_loss(self):
        l1_loss = torch.tensor(0.).to(device)
        for param in self.parameters():
            if param.dim() > 1:  # Only apply L1 regularization to linear layers
                l1_loss += torch.norm(param, p=1)  # L1 norm
        return self.l1_reg * l1_loss

In [18]:
# num_classes = 24
# dropout_prob = 0.3  # Adjust the dropout probability as needed
# l2_reg = 1e-5  # Adjust the regularization strength as needed
# model = TextModel(num_classes, dropout_prob=dropout_prob, l2_reg=l2_reg)

In [19]:
# Instantiate the model with dropout, L2, and L1 regularization
num_classes = 24
dropout_prob = 0.3  # Adjust the dropout probability as needed
l2_reg = 1e-5  # Adjust the L2 regularization strength as needed
l1_reg = 1e-5  # Adjust the L1 regularization strength as needed
model = TextModel(num_classes, dropout_prob=dropout_prob, l2_reg=l2_reg, l1_reg=l1_reg)


In [20]:
# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

TextModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [21]:
# Define the number of training steps
num_epochs = 25
num_train_steps = len(train_loader) * num_epochs

In [22]:
# Calculate class weights
Y_labels = train_data['Category'].values
class_weights = torch.tensor([1.0 / count for count in np.bincount(Y_labels)], dtype=torch.float)
class_weights = class_weights.to(device)

In [23]:
# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-6)
criterion = nn.CrossEntropyLoss(weight=class_weights) 
# criterion = nn.CrossEntropyLoss() 

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)


In [24]:
# # Define early stopping and model checkpointing
# from sklearn.metrics import accuracy_score
# train_acc =[]
# valid_acc = []
# train_loss =[]


# best_val_accuracy = 0.0
# early_stopping_counter = 0

# # Training loop with early stopping and model checkpointing
# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0.0
#     predictions = []
#     targets = []
#     print(f"---------Epoch:{epoch}----------")
#     for batch in tqdm(train_loader):
#         optimizer.zero_grad()
#         input_ids, attn_masks, labels = batch
#         input_ids, attn_masks, labels = input_ids.to(device), attn_masks.to(device), labels.to(device)
        
#         outputs = model(input_ids, attn_masks)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()
#         predictions.extend(torch.argmax(outputs, dim=1).tolist())
#         targets.extend(labels.tolist())
#     avg_loss = total_loss / len(train_loader)
#     acc = accuracy_score(targets, predictions)

#     # Validation step
#     model.eval()
#     val_predictions = []
#     val_targets = []

#     with torch.no_grad():
#         for val_batch in tqdm(val_loader):
#             val_input_ids, val_attn_masks, val_labels = val_batch
#             val_input_ids, val_attn_masks, val_labels = val_input_ids.to(device), val_attn_masks.to(device), val_labels.to(device)
            
#             val_outputs = model(val_input_ids, val_attn_masks)
#             val_predictions.extend(torch.argmax(val_outputs, dim=1).tolist())
#             val_targets.extend(val_labels.tolist())

#     val_acc = accuracy_score(val_targets, val_predictions)

#     print(f"Epoch {epoch+1}/{num_epochs} - Avg. Loss: {avg_loss:.4f} - Accuracy: {acc:.4f} - Val Accuracy: {val_acc:.4f}")

#     # Check for early stopping
#     if acc > best_val_accuracy:
#         best_val_accuracy = acc
#         early_stopping_counter = 0
#         torch.save(model.state_dict(), f"model_ckpt/best_model_epoch_{epoch+1}_{val_acc}.pt")
#     else:
#         early_stopping_counter += 1
#         if early_stopping_counter >= 8:
#             print("Early stopping triggered.")
#             break
#     train_acc.append(acc)
#     train_loss.append(avg_loss)
#     valid_acc.append(val_acc)

In [25]:
# # Define early stopping and model checkpointing
# from sklearn.metrics import accuracy_score
# train_acc =[]
# valid_acc = []
# train_loss =[]


# best_val_accuracy = 0.0
# early_stopping_counter = 0

# # Training loop with early stopping and model checkpointing
# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0.0
#     predictions = []
#     targets = []
#     print(f"---------Epoch:{epoch}----------")
#     for batch in tqdm(train_loader):
#         optimizer.zero_grad()
#         input_ids, attn_masks, labels = batch
#         input_ids, attn_masks, labels = input_ids.to(device), attn_masks.to(device), labels.to(device)
        
#         outputs = model(input_ids, attn_masks)
#         loss = criterion(outputs, labels)
        
#         # Apply L2 regularization to linear layers
#         l2_loss = torch.tensor(0.).to(device)
#         for param in model.parameters():
#             if param.dim() > 1:  # Only apply regularization to linear layers
#                 l2_loss += torch.norm(param, p=2)  # L2 norm
#         loss += l2_reg * l2_loss
        
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()
#         predictions.extend(torch.argmax(outputs, dim=1).tolist())
#         targets.extend(labels.tolist())
#     avg_loss = total_loss / len(train_loader)
#     acc = accuracy_score(targets, predictions)

#     # Validation step
#     model.eval()
#     val_predictions = []
#     val_targets = []

#     with torch.no_grad():
#         for val_batch in tqdm(val_loader):
#             val_input_ids, val_attn_masks, val_labels = val_batch
#             val_input_ids, val_attn_masks, val_labels = val_input_ids.to(device), val_attn_masks.to(device), val_labels.to(device)
            
#             val_outputs = model(val_input_ids, val_attn_masks)
#             val_predictions.extend(torch.argmax(val_outputs, dim=1).tolist())
#             val_targets.extend(val_labels.tolist())

#     val_acc = accuracy_score(val_targets, val_predictions)

#     print(f"Epoch {epoch+1}/{num_epochs} - Avg. Loss: {avg_loss:.4f} - Accuracy: {acc:.4f} - Val Accuracy: {val_acc:.4f}")

#     # Check for early stopping
#     if acc > best_val_accuracy:
#         best_val_accuracy = acc
#         early_stopping_counter = 0
#         torch.save(model.state_dict(), f"model_ckpt/best_model_epoch_{epoch+1}_{val_acc}.pt")
#     else:
#         early_stopping_counter += 1
#         if early_stopping_counter >= 8:
#             print("Early stopping triggered.")
#             break
#     train_acc.append(acc)
#     train_loss.append(avg_loss)
#     valid_acc.append(val_acc)

In [26]:
# Define early stopping and model checkpointing
from sklearn.metrics import accuracy_score
train_acc =[]
valid_acc = []
train_loss =[]


best_val_accuracy = 0.0
early_stopping_counter = 0

# Training loop with early stopping and model checkpointing
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    predictions = []
    targets = []
    print(f"---------Epoch:{epoch}----------")
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attn_masks, labels = batch
        input_ids, attn_masks, labels = input_ids.to(device), attn_masks.to(device), labels.to(device)
        
        outputs = model(input_ids, attn_masks)
        loss = criterion(outputs, labels)
        
        # Apply L2 and L1 regularization
        l2_loss = torch.tensor(0.).to(device)
        l1_loss = model.l1_loss()
        for param in model.parameters():
            if param.dim() > 1:
                l2_loss += torch.norm(param, p=2)
        loss += l2_reg * l2_loss + l1_loss
        
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predictions.extend(torch.argmax(outputs, dim=1).tolist())
        targets.extend(labels.tolist())
    avg_loss = total_loss / len(train_loader)
    acc = accuracy_score(targets, predictions)

    # Validation step
    model.eval()
    val_predictions = []
    val_targets = []

    with torch.no_grad():
        for val_batch in tqdm(val_loader):
            val_input_ids, val_attn_masks, val_labels = val_batch
            val_input_ids, val_attn_masks, val_labels = val_input_ids.to(device), val_attn_masks.to(device), val_labels.to(device)
            
            val_outputs = model(val_input_ids, val_attn_masks)
            val_predictions.extend(torch.argmax(val_outputs, dim=1).tolist())
            val_targets.extend(val_labels.tolist())

    val_acc = accuracy_score(val_targets, val_predictions)

    print(f"Epoch {epoch+1}/{num_epochs} - Avg. Loss: {avg_loss:.4f} - Accuracy: {acc:.4f} - Val Accuracy: {val_acc:.4f}")

    # Check for early stopping
    if acc > best_val_accuracy:
        best_val_accuracy = acc
        early_stopping_counter = 0
        torch.save(model.state_dict(), f"model_ckpt/best_model_epoch_{epoch+1}_{val_acc}.pt")
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= 8:
            print("Early stopping triggered.")
            break
    train_acc.append(acc)
    train_loss.append(avg_loss)
    valid_acc.append(val_acc)

---------Epoch:0----------


100%|██████████| 36/36 [00:37<00:00,  1.05s/it]
100%|██████████| 5/5 [00:01<00:00,  2.80it/s]


Epoch 1/25 - Avg. Loss: 36.0068 - Accuracy: 0.0551 - Val Accuracy: 0.0854
---------Epoch:1----------


100%|██████████| 36/36 [00:38<00:00,  1.07s/it]
100%|██████████| 5/5 [00:01<00:00,  2.77it/s]


Epoch 2/25 - Avg. Loss: 35.4055 - Accuracy: 0.1174 - Val Accuracy: 0.3310
---------Epoch:2----------


100%|██████████| 36/36 [00:38<00:00,  1.08s/it]
100%|██████████| 5/5 [00:01<00:00,  2.73it/s]


Epoch 3/25 - Avg. Loss: 34.8306 - Accuracy: 0.3130 - Val Accuracy: 0.5587
---------Epoch:3----------


100%|██████████| 36/36 [00:39<00:00,  1.09s/it]
100%|██████████| 5/5 [00:01<00:00,  2.70it/s]


Epoch 4/25 - Avg. Loss: 34.1246 - Accuracy: 0.5389 - Val Accuracy: 0.6833
---------Epoch:4----------


100%|██████████| 36/36 [00:39<00:00,  1.09s/it]
100%|██████████| 5/5 [00:01<00:00,  2.69it/s]


Epoch 5/25 - Avg. Loss: 33.5369 - Accuracy: 0.6963 - Val Accuracy: 0.7972
---------Epoch:5----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 6/25 - Avg. Loss: 33.0899 - Accuracy: 0.7946 - Val Accuracy: 0.7936
---------Epoch:6----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 7/25 - Avg. Loss: 32.8475 - Accuracy: 0.8341 - Val Accuracy: 0.8149
---------Epoch:7----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.69it/s]


Epoch 8/25 - Avg. Loss: 32.6691 - Accuracy: 0.8604 - Val Accuracy: 0.8327
---------Epoch:8----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 9/25 - Avg. Loss: 32.4802 - Accuracy: 0.8844 - Val Accuracy: 0.8256
---------Epoch:9----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


Epoch 10/25 - Avg. Loss: 32.3941 - Accuracy: 0.8915 - Val Accuracy: 0.8327
---------Epoch:10----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 11/25 - Avg. Loss: 32.2425 - Accuracy: 0.9071 - Val Accuracy: 0.8292
---------Epoch:11----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


Epoch 12/25 - Avg. Loss: 32.1398 - Accuracy: 0.9186 - Val Accuracy: 0.8399
---------Epoch:12----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


Epoch 13/25 - Avg. Loss: 32.0577 - Accuracy: 0.9306 - Val Accuracy: 0.8327
---------Epoch:13----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


Epoch 14/25 - Avg. Loss: 31.9421 - Accuracy: 0.9386 - Val Accuracy: 0.8292
---------Epoch:14----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.69it/s]


Epoch 15/25 - Avg. Loss: 31.8689 - Accuracy: 0.9440 - Val Accuracy: 0.8363
---------Epoch:15----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.70it/s]


Epoch 16/25 - Avg. Loss: 31.7849 - Accuracy: 0.9515 - Val Accuracy: 0.8327
---------Epoch:16----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.69it/s]


Epoch 17/25 - Avg. Loss: 31.7083 - Accuracy: 0.9480 - Val Accuracy: 0.8363
---------Epoch:17----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.69it/s]


Epoch 18/25 - Avg. Loss: 31.6254 - Accuracy: 0.9586 - Val Accuracy: 0.8221
---------Epoch:18----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 19/25 - Avg. Loss: 31.5468 - Accuracy: 0.9622 - Val Accuracy: 0.8363
---------Epoch:19----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 20/25 - Avg. Loss: 31.4859 - Accuracy: 0.9609 - Val Accuracy: 0.8292
---------Epoch:20----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 21/25 - Avg. Loss: 31.3983 - Accuracy: 0.9667 - Val Accuracy: 0.8399
---------Epoch:21----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 22/25 - Avg. Loss: 31.3256 - Accuracy: 0.9715 - Val Accuracy: 0.8292
---------Epoch:22----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 23/25 - Avg. Loss: 31.2631 - Accuracy: 0.9755 - Val Accuracy: 0.8292
---------Epoch:23----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 24/25 - Avg. Loss: 31.1940 - Accuracy: 0.9747 - Val Accuracy: 0.8221
---------Epoch:24----------


100%|██████████| 36/36 [00:39<00:00,  1.10s/it]
100%|██████████| 5/5 [00:01<00:00,  2.68it/s]


Epoch 25/25 - Avg. Loss: 31.1245 - Accuracy: 0.9791 - Val Accuracy: 0.8327
