In [None]:
### IMPORTS

import numpy as np
import pandas as pd
import json
import re
from transformers import BertTokenizer
import torch
import numpy as np
import random

# Set seed for reproducibility
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)


In [16]:
# Load the datasets
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

# Display the first few rows of the training dataset
print("Training Data:")
train_df.head(3)

Training Data:


Unnamed: 0,candidateAId,candidateBId,winnerId,candidateATranscript,candidateBTranscript,candidateAResume,candidateBResume,role
0,8ab47434-09a9-44e6-8c77-f9fd20c57765,d7cbd002-5423-4dae-82d9-3a629ec361bb,8ab47434-09a9-44e6-8c77-f9fd20c57765,{'pairs': [['Interviewer: Hello and welcome to...,{'pairs': [['Interviewer: Hello and welcome to...,"{""data"": {""awards"": [], ""certifications"": [], ...","{""data"": {""awards"": [], ""certifications"": [], ...",communications
1,53c11bf9-3ec7-4909-a9d1-487692e72415,e957aff1-583b-11ef-8a84-4201ac164110,e957aff1-583b-11ef-8a84-4201ac164110,{'pairs': [['Interviewer: Hello and welcome to...,{'pairs': [['Interviewer: Hello! This is a sho...,"{""data"": {""awards"": [], ""certifications"": [], ...","{""data"": {""awards"": [], ""certifications"": [], ...",ops-or-gtm
2,4617b14d-ca26-11ee-a4ba-42010a400021,a2d2933e-c5bb-11ee-a4ba-42010a400021,4617b14d-ca26-11ee-a4ba-42010a400021,{'pairs': [['Interviewer: Hello and welcome to...,{'pairs': [['Interviewer: Hello and welcome to...,"{""data"": {""awards"": [""1st (Winner) AIR 8"", ""Am...","{""data"": {""awards"": [], ""certifications"": [""In...",has-scraping-experience-a


In [17]:
### Data Preprocessing and Tokenization 

import pandas as pd
import re
import json
from transformers import BertTokenizer

# Function to clean text (Transcript)
def clean_text(text):
    # Remove unnecessary special characters
    return re.sub(r'[^A-Za-z0-9\s.,!?]', '', text)

# Function to convert Resume dictionary to text
def dict_to_text(Resume):
    if isinstance(Resume, str):
        try:
            # If Resume is a string, try converting it to a dictionary
            Resume = eval(Resume)  # Be cautious with eval; json.loads is safer for JSON input
        except:
            pass  # If conversion fails, we'll keep it as a string
    if isinstance(Resume, dict):
        Resume_text = ""
        for key, value in Resume.items():
            if isinstance(value, list):
                value = ", ".join(value)
            Resume_text += f"{key}: {value}. "
        return Resume_text
    return str(Resume)  # Return the string if it's not a dict

# Function to preprocess a single row
def preprocess_row(row):
    # Clean the Transcript text
    candidateATranscript_clean = clean_text(row['candidateATranscript'])
    candidateBTranscript_clean = clean_text(row['candidateBTranscript'])

    # Convert the Resume dictionaries to text
    candidateAResume_text = dict_to_text(row['candidateAResume'])
    candidateBResume_text = dict_to_text(row['candidateBResume'])

    # Concatenate Transcript and Resume for both candidates
    candidateA_data = candidateATranscript_clean + " " + candidateAResume_text
    candidateB_data = candidateBTranscript_clean + " " + candidateBResume_text

    # Combine into a tuple for further processing
    return candidateA_data, candidateB_data, row['role']

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize the preprocessed data
def preprocess_data(dataframe):
    inputs = []
    labels = []

    for _, row in dataframe.iterrows():
        # Preprocess each row
        candidateA_data, candidateB_data, role = preprocess_row(row)

        # Combine both candidates' text with a separator token and the role as context
        combined_text = role + " [SEP] " + candidateA_data + " [SEP] " + candidateB_data

        # Tokenize the combined input
        encoded_input = tokenizer.encode_plus(
            combined_text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        inputs.append(encoded_input)
        labels.append(1 if row['winnerId'] == row['candidateAId'] else 0)

    return inputs, labels

# Apply the preprocessing to the training and test datasets
train_inputs, train_labels = preprocess_data(train_df)
test_inputs, test_labels = preprocess_data(test_df)




In [18]:
### Creating Data Loaders

import torch
from transformers import BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
if torch.backends.mps.is_available():  # Checks for AMD GPU with ROCm support
        device = torch.device("mps")
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Convert inputs to TensorDataset
def create_tensor_dataset(inputs, labels):
    input_ids = []
    attention_masks = []

    for encoded_input in inputs:
        input_ids.append(encoded_input['input_ids'])
        attention_masks.append(encoded_input['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return TensorDataset(input_ids, attention_masks, labels)

train_dataset = create_tensor_dataset(train_inputs, train_labels)
test_dataset = create_tensor_dataset(test_inputs, test_labels)


# Create DataLoader for training and testing
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Set up the training parameters
epochs = 9

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from tqdm import tqdm

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()

    total_loss = 0
    correct_predictions = 0

    for batch in tqdm(train_dataloader):
        input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)

        # Clear gradients
        model.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Calculate accuracy
        preds = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)

    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions.double() / len(train_dataset)

    print(f"Training loss: {avg_train_loss}")
    print(f"Training accuracy: {train_accuracy}")

    # Evaluate on test set
    model.eval()
    correct_predictions = 0

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)

            outputs = model(input_ids, attention_mask=attention_masks)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

    test_accuracy = correct_predictions.double() / len(test_dataset)
    print(f"Test accuracy: {test_accuracy}")


Epoch 1/10


100%|██████████| 3/3 [00:28<00:00,  9.57s/it]


Training loss: 0.7021889090538025
Training accuracy: 0.4
Test accuracy: 0.5853658536585366
Epoch 2/10


100%|██████████| 3/3 [00:29<00:00,  9.75s/it]


Training loss: 0.6723439693450928
Training accuracy: 0.6
Test accuracy: 0.5914634146341463
Epoch 3/10


100%|██████████| 3/3 [00:28<00:00,  9.42s/it]


Training loss: 0.6514999270439148
Training accuracy: 0.7
Test accuracy: 0.4695121951219512
Epoch 4/10


100%|██████████| 3/3 [00:28<00:00,  9.41s/it]


Training loss: 0.6580226421356201
Training accuracy: 0.6
Test accuracy: 0.5060975609756098
Epoch 5/10


100%|██████████| 3/3 [00:28<00:00,  9.41s/it]


Training loss: 0.6133987108866373
Training accuracy: 0.75
Test accuracy: 0.6097560975609756
Epoch 6/10


100%|██████████| 3/3 [00:28<00:00,  9.42s/it]


Training loss: 0.6637960076332092
Training accuracy: 0.65
Test accuracy: 0.573170731707317
Epoch 7/10


100%|██████████| 3/3 [00:30<00:00, 10.05s/it]


Training loss: 0.6260787049929301
Training accuracy: 0.75
Test accuracy: 0.6158536585365854
Epoch 8/10


100%|██████████| 3/3 [00:28<00:00,  9.61s/it]


Training loss: 0.5559169252713522
Training accuracy: 0.75
Test accuracy: 0.5
Epoch 9/10


100%|██████████| 3/3 [00:27<00:00,  9.23s/it]


Training loss: 0.5738959312438965
Training accuracy: 0.75
Test accuracy: 0.6158536585365854
Epoch 10/10


100%|██████████| 3/3 [00:28<00:00,  9.54s/it]


Training loss: 0.5122632086277008
Training accuracy: 0.8
Test accuracy: 0.6036585365853658
