Import Libraries


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
import torch
import xgboost as xgb



Load and Preprocess Data

In [None]:
# Load your dataset
df = pd.read_csv('train_dataset.csv')

# Columns that must keep their raw string values:
#   * the transcripts are tokenized and embedded with BERT further down, so
#     they have to remain text;
#   * the target is compared against the literal 'candidateA' below.
# Integer-encoding them (as a blanket "encode every object column" step would)
# turns every label into 0 and hands integers to the tokenizer.
text_columns = ['candidateATranscript', 'candidateBTranscript']
target_column = 'winnerId'

# Convert labels to binary (assuming winnerId is categorical with two possible
# values). This is done on the raw column, before any encoding touches it.
y = (df[target_column] == 'candidateA').astype(int)  # Example conversion

# Example preprocessing: integer-encode the remaining string-valued columns
# (e.g. 'role'). Codes follow the sorted order of each column's distinct
# values; missing values become -1.
protected_columns = set(text_columns) | {target_column}
object_columns = [
    col for col in df.columns
    if col not in protected_columns and df[col].dtype == 'object'
]
if object_columns:
    df[object_columns] = df[object_columns].apply(lambda x: x.astype('category').cat.codes)

# The tokenizer only accepts strings, so replace missing transcripts with ''.
df[text_columns] = df[text_columns].fillna('').astype(str)

# Split data into features and target
X = df[['candidateATranscript', 'candidateBTranscript', 'role', 'candidateAResumeData', 'candidateBResumeData']]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Tokenize and Embed Text Data

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint so the
# vocabulary used for tokenization matches the one the encoder was trained on.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def embed_text(texts, batch_size=32):
    """Return one mean-pooled BERT embedding per input text.

    Texts are run through the module-level ``tokenizer`` and ``model`` in
    mini-batches. Each embedding is the average of the final hidden states
    over the text's real tokens only: padding positions are excluded through
    the attention mask, so a text's vector does not depend on how long the
    other texts in its batch happen to be.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.
            Bounds peak memory; must be at least 1.

    Returns:
        A 2-D NumPy array with one row per text and ``model.config.hidden_size``
        columns. An empty input yields an array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # The tokenizer rejects an empty batch; return a correctly shaped result.
        return torch.empty((0, model.config.hidden_size)).numpy()

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; trailing axis added so it
        # broadcasts across the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for a fully masked row.
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append(token_sum / token_count)

    return torch.cat(pooled_batches, dim=0).numpy()

# Embed the candidate-A transcript of every row, separately for each split.
train_transcripts = X_train['candidateATranscript'].tolist()
test_transcripts = X_test['candidateATranscript'].tolist()
X_train_embeddings = embed_text(train_transcripts)
X_test_embeddings = embed_text(test_transcripts)


Train the Model

In [None]:
# Build the feature matrices: embedding columns first, then the 'role' column.
# The split frames keep their shuffled row labels, so the index is reset to
# line the rows up positionally with the freshly created embedding frames.
train_embedding_frame = pd.DataFrame(X_train_embeddings)
train_role_frame = X_train[['role']].reset_index(drop=True)
X_train_features = pd.concat([train_embedding_frame, train_role_frame], axis=1)

test_embedding_frame = pd.DataFrame(X_test_embeddings)
test_role_frame = X_test[['role']].reset_index(drop=True)
X_test_features = pd.concat([test_embedding_frame, test_role_frame], axis=1)

# Fit an XGBoost classifier with its default hyperparameters.
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train_features, y_train)


Evaluate the Model

In [None]:
# Score the held-out split: predict labels, then report plain accuracy.
y_pred = model_xgb.predict(X_test_features)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.4f}')


For Making Predictions on New Data

In [None]:
# Example function to preprocess and embed new text data
def preprocess_and_embed(new_data, role_categories=None):
    """Build the model's feature frame for unseen rows.

    Args:
        new_data: DataFrame containing at least the 'candidateATranscript'
            and 'role' columns.
        role_categories: Optional ordered collection of the role values seen
            at training time. When given, 'role' is replaced by each value's
            position in this collection (values not present become -1), which
            reproduces the category codes used for training. When omitted,
            'role' is passed through unchanged.

    Returns:
        DataFrame with the transcript embedding columns followed by 'role',
        indexed 0..n-1.
    """
    # The tokenizer only accepts strings, so missing transcripts become ''.
    transcripts = new_data['candidateATranscript'].fillna('').astype(str).tolist()
    new_embeddings = embed_text(transcripts)

    # reset_index returns a new frame, so the caller's data is not modified.
    role_frame = new_data[['role']].reset_index(drop=True)
    if role_categories is not None:
        role_frame['role'] = pd.Categorical(
            role_frame['role'], categories=role_categories
        ).codes

    new_features = pd.concat([pd.DataFrame(new_embeddings), role_frame], axis=1)
    return new_features

# Load new data
new_data = pd.read_csv('test_dataset.csv')

# Training integer-encoded string-valued columns via category codes, so the
# same value -> code mapping has to be applied here; feeding raw role strings
# (or codes derived from this file alone) would not match what the model saw.
# The mapping is rebuilt from the training file's 'role' column.
train_roles = pd.read_csv('train_dataset.csv', usecols=['role'])['role']
if train_roles.dtype == 'object':
    role_categories = train_roles.astype('category').cat.categories
else:
    # Non-string roles were not encoded during training; pass them through.
    role_categories = None

# Preprocess and embed new data
new_data_features = preprocess_and_embed(new_data, role_categories=role_categories)


# Predict with the trained model
new_predictions = model_xgb.predict(new_data_features)
print(new_predictions)


Saving and Loading Models

In [None]:
# Location the classifier is written to and read back from.
MODEL_PATH = 'path'

# Persist the fitted classifier.
model_xgb.save_model(MODEL_PATH)

# Restore it into a fresh estimator instance.
model_xgb = xgb.XGBClassifier()
model_xgb.load_model(MODEL_PATH)
