**Text Classification using BERT Embeddings and XGBoost for Performance Prediction**

In [8]:
# Imports
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Load your dataset
data_path = 'updated_dataset.csv'  # Replace with your dataset file path
df = pd.read_csv(data_path)

# Update these column names to match your dataset
text_column = 'Transcript'  # Column containing textual data
label_column = 'Performance (select/reject)'  # Column containing labels

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Function to get BERT embeddings in batches
def get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32):
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size
    print(f"Total Batches: {total_batches}")
    
    for i in range(total_batches):
        print(f"Processing batch {i + 1}/{total_batches}...")
        batch = texts[i * batch_size:(i + 1) * batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    
    return embeddings

# Generate BERT embeddings
df['bert_embeddings'] = get_bert_embeddings_batch(df[text_column].astype(str).tolist(), tokenizer, model)

# Expand embeddings into separate columns
embeddings_expanded = pd.DataFrame(df['bert_embeddings'].tolist(), index=df.index)
embeddings_expanded.columns = [f'emb_{i}' for i in range(embeddings_expanded.shape[1])]

# Combine original data and embeddings
df_expanded = pd.concat([df, embeddings_expanded], axis=1)

# Encode labels
label_encoder = LabelEncoder()
df_expanded['label_encoded'] = label_encoder.fit_transform(df_expanded[label_column])

# Select features and target
embedding_columns = [col for col in df_expanded.columns if col.startswith('emb_')]
X = df_expanded[embedding_columns]
y = df_expanded['label_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train XGBoost classifier
model = XGBClassifier(use_label_encoder=False, max_depth=5, n_estimators=200, learning_rate=0.1, eval_metric='logloss')
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred))

Total Batches: 16
Processing batch 1/16...
Processing batch 2/16...
Processing batch 3/16...
Processing batch 4/16...
Processing batch 5/16...
Processing batch 6/16...
Processing batch 7/16...
Processing batch 8/16...
Processing batch 9/16...
Processing batch 10/16...
Processing batch 11/16...
Processing batch 12/16...
Processing batch 13/16...
Processing batch 14/16...
Processing batch 15/16...
Processing batch 16/16...


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.98
ROC AUC: 0.98


1.Text Preprocessing: Text data is transformed into numerical BERT embeddings using a pre-trained DistilBERT model.

2.Feature Engineering: The embeddings are expanded into separate columns for use in machine learning models.

3.Modeling: XGBoost classifier is employed to predict the performance of candidates based on text features.

4.Evaluation: Accuracy and ROC AUC score are used to assess the model's performance.

**Text Classification with BERT Embeddings and a Custom Neural Network Using PyTorch**

In [2]:
# Imports
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Load your dataset
data_path = 'updated_dataset.csv'  # Replace with your dataset file path
df = pd.read_csv(data_path)

# Update these column names to match your dataset
text_column = 'Transcript'  # Column containing textual data
label_column = 'Performance (select/reject)'  # Column containing labels

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Function to get BERT embeddings in batches
def get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32):
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size
    print(f"Total Batches: {total_batches}")
    
    for i in range(total_batches):
        print(f"Processing batch {i + 1}/{total_batches}...")
        batch = texts[i * batch_size:(i + 1) * batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    
    return embeddings

# Generate BERT embeddings
df['bert_embeddings'] = get_bert_embeddings_batch(df[text_column].astype(str).tolist(), tokenizer, model)

# Expand embeddings into separate columns
embeddings_expanded = pd.DataFrame(df['bert_embeddings'].tolist(), index=df.index)
embeddings_expanded.columns = [f'emb_{i}' for i in range(embeddings_expanded.shape[1])]

# Combine original data and embeddings
df_expanded = pd.concat([df, embeddings_expanded], axis=1)

# Encode labels
label_encoder = LabelEncoder()
df_expanded['label_encoded'] = label_encoder.fit_transform(df_expanded[label_column])

# Select features and target
embedding_columns = [col for col in df_expanded.columns if col.startswith('emb_')]
X = df_expanded[embedding_columns].values
y = df_expanded['label_encoded'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create PyTorch Dataset
class TextDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define Neural Network
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return self.softmax(x)

# Model, Loss, and Optimizer
input_size = len(embedding_columns)
num_classes = len(np.unique(y))
model = NeuralNetwork(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the Model
def train_model(model, dataloader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for features, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

train_model(model, train_loader, criterion, optimizer)

# Evaluate the Model
def evaluate_model(model, dataloader):
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for features, labels in dataloader:
            outputs = model(features)
            _, preds = torch.max(outputs, 1)
            predictions.extend(preds.numpy())
            actuals.extend(labels.numpy())
    return predictions, actuals

predictions, actuals = evaluate_model(model, test_loader)

# Calculate Accuracy
accuracy = np.mean(np.array(predictions) == np.array(actuals))
print("Test Accuracy:", accuracy)


Total Batches: 16
Processing batch 1/16...
Processing batch 2/16...
Processing batch 3/16...
Processing batch 4/16...
Processing batch 5/16...
Processing batch 6/16...
Processing batch 7/16...
Processing batch 8/16...
Processing batch 9/16...
Processing batch 10/16...
Processing batch 11/16...
Processing batch 12/16...
Processing batch 13/16...
Processing batch 14/16...
Processing batch 15/16...
Processing batch 16/16...
Epoch 1/10, Loss: 0.6535
Epoch 2/10, Loss: 0.5094
Epoch 3/10, Loss: 0.3828
Epoch 4/10, Loss: 0.3350
Epoch 5/10, Loss: 0.3253
Epoch 6/10, Loss: 0.3210
Epoch 7/10, Loss: 0.3190
Epoch 8/10, Loss: 0.3186
Epoch 9/10, Loss: 0.3170
Epoch 10/10, Loss: 0.3164
Test Accuracy: 1.0


1.Text Embedding: The BERT model is used to generate embeddings from text, which are then used as features for classification.

2.Custom Neural Network: A simple neural network with two hidden layers is defined to classify the text data based on BERT embeddings.

3.DataLoader: PyTorch's DataLoader is used to handle the dataset in batches for training and evaluation.

4.Training and Evaluation: The model is trained for 10 epochs and evaluated using accuracy as the performance metric.

**Comparing Multiple Classifiers for Text Classification Using BERT Embeddings**

In [3]:
# Install required libraries
!pip install transformers torch scikit-learn xgboost pandas catboost

# Imports
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Load your dataset
data_path = 'updated_dataset.csv'  # Replace with your dataset file path
df = pd.read_csv(data_path)

# Update these column names to match your dataset
text_column = 'Transcript'  # Column containing textual data
label_column = 'Performance (select/reject)'  # Column containing labels

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Function to get BERT embeddings in batches
def get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32):
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size
    print(f"Total Batches: {total_batches}")
    
    for i in range(total_batches):
        print(f"Processing batch {i + 1}/{total_batches}...")
        batch = texts[i * batch_size:(i + 1) * batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    
    return embeddings

# Generate BERT embeddings
df['bert_embeddings'] = get_bert_embeddings_batch(df[text_column].astype(str).tolist(), tokenizer, model)

# Expand embeddings into separate columns
embeddings_expanded = pd.DataFrame(df['bert_embeddings'].tolist(), index=df.index)
embeddings_expanded.columns = [f'emb_{i}' for i in range(embeddings_expanded.shape[1])]

# Combine original data and embeddings
df_expanded = pd.concat([df, embeddings_expanded], axis=1)

# Encode labels
label_encoder = LabelEncoder()
df_expanded['label_encoded'] = label_encoder.fit_transform(df_expanded[label_column])

# Select features and target
embedding_columns = [col for col in df_expanded.columns if col.startswith('emb_')]
X = df_expanded[embedding_columns]
y = df_expanded['label_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 1. XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, max_depth=5, n_estimators=200, learning_rate=0.1, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
print("\n[XGBoost Results]")
print("Accuracy:", accuracy_score(y_test, xgb_pred))
print("ROC AUC:", roc_auc_score(y_test, xgb_pred))

# 2. CatBoost Classifier
catboost_model = CatBoostClassifier(depth=6, learning_rate=0.1, iterations=200, verbose=0)
catboost_model.fit(X_train, y_train)
catboost_pred = catboost_model.predict(X_test)
print("\n[CatBoost Results]")
print("Accuracy:", accuracy_score(y_test, catboost_pred))
print("ROC AUC:", roc_auc_score(y_test, catboost_pred))

# 3. RandomForest Classifier
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("\n[RandomForest Results]")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print("ROC AUC:", roc_auc_score(y_test, rf_pred))


Collecting catboost
  Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.5/101.7 MB 645.7 kB/s eta 0:02:37
   ---------------------------------------- 0.8/101.7 MB 763.2 kB/s eta 0:02:13
   ---------------------------------------- 1.0/101.7 MB 825.2 kB/s eta 0:02:02
    --------------------------------------- 1.3/101.7 MB 919.8 kB/s eta 0:01:50
    --------------------------------------- 1.6/101.7 MB 987.0 kB/s eta 0:01:42
    --------------------------------------- 1.8/101.7 MB 1

Parameters: { "use_label_encoder" } are not used.




[XGBoost Results]
Accuracy: 0.98
ROC AUC: 0.98

[CatBoost Results]
Accuracy: 0.99
ROC AUC: 0.99

[RandomForest Results]
Accuracy: 1.0
ROC AUC: 1.0


1.Text Embedding: BERT is used to generate embeddings for the text data to capture meaningful information for classification.

2Multiple Classifiers: XGBoost, CatBoost, and RandomForest are used for classification to compare their performance.

3.Model Evaluation: Accuracy and ROC AUC scores are calculated to evaluate the models' performance.

4.Comparing Models: The results help determine which classifier performs the best for the given dataset.