In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [37]:
# Load the training resumes; later cells read the 'resume' text column and the
# 'role' column from this frame when building the embedding dataset.
resume_df = pd.read_csv('data/resumes_train.csv')

In [38]:
def generate_embeddings(texts, timeout=120):
    """Embed texts via the Hugging Face hosted feature-extraction pipeline.

    Posts ``texts`` to the ``sentence-transformers/all-MiniLM-L6-v2``
    feature-extraction endpoint, authenticating with the ``HF_TOKEN``
    environment variable (a local ``.env`` file is loaded first).

    Args:
        texts: The strings to embed, sent as the request's ``inputs`` field.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON payload as a list. Callers in this notebook treat it
        as one embedding row per input text -- TODO confirm against the API
        response format. An empty list/tuple input returns ``[]`` without
        touching the network.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is not set, or the API answers with a
            payload that is not a list (e.g. an error object).
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    # Nothing to embed: skip the network round-trip (and the lazy imports).
    if isinstance(texts, (list, tuple)) and not texts:
        return []

    import os
    import dotenv
    import requests
    dotenv.load_dotenv()

    checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        # Without this check the request would be sent with "Bearer None".
        raise RuntimeError(
            "HF_TOKEN is not set; add it to the environment or a .env file"
        )

    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{checkpoint}"
    headers = {"Authorization": f"Bearer {hf_token}"}

    response = requests.post(
        api_url,
        headers=headers,
        # wait_for_model asks the API to hold the request while the model loads.
        json={"inputs": texts, "options": {"wait_for_model": True}},
        timeout=timeout,
    )
    # Surface HTTP failures here instead of returning an error body as "embeddings".
    response.raise_for_status()

    output = response.json()
    if not isinstance(output, list):
        raise RuntimeError(f"Unexpected response from embedding API: {output!r}")
    return output

In [39]:
def get_embedding_dataset(resume_df, embed_fn=None):
    """Build a feature table of text embeddings plus a binary target column.

    Args:
        resume_df: DataFrame with a 'resume' column (text to embed) and a
            'role' column (compared against 'Data Scientist' for the label).
        embed_fn: Optional callable mapping a list of texts to a list of
            embedding rows. Defaults to ``generate_embeddings``; pass a
            different callable to cache results or to stub the API out.

    Returns:
        A new DataFrame with columns ``embedding_0 .. embedding_{d-1}`` and a
        boolean ``is_data_scientist`` column, one row per row of ``resume_df``
        in the same order, on a fresh 0..n-1 index.

    Raises:
        ValueError: If ``resume_df`` has no rows, or the embedder returns a
            different number of rows than texts it was given.
    """
    if embed_fn is None:
        embed_fn = generate_embeddings

    input_texts = resume_df['resume'].tolist()
    if not input_texts:
        raise ValueError("resume_df has no rows to embed")

    text_embeddings = embed_fn(input_texts)
    if len(text_embeddings) != len(input_texts):
        raise ValueError(
            f"Expected {len(input_texts)} embeddings, got {len(text_embeddings)}"
        )

    column_names = [f"embedding_{i}" for i in range(len(text_embeddings[0]))]
    dataset = pd.DataFrame(text_embeddings, columns=column_names)

    # Assign labels positionally. Assigning the Series directly would align on
    # index, so a filtered/shuffled resume_df (non 0..n-1 index) would yield
    # NaN or mismatched labels against the freshly indexed embedding frame.
    dataset['is_data_scientist'] = (resume_df['role'] == 'Data Scientist').to_numpy()

    return dataset

In [41]:
# Fit a random forest on the embedded training resumes, then score it on the
# very same rows (a sanity check on the fit, not a held-out estimate).
train_df = get_embedding_dataset(resume_df)
y = train_df['is_data_scientist']
X = train_df.drop('is_data_scientist', axis=1)

clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X, y)

print("Accuracy and ROC AUC score on the training set", "-" * 50, sep="\n")
print(clf.score(X, y))
# predict_proba column 1 is the probability of the positive (True) class.
print(roc_auc_score(y, clf.predict_proba(X)[:, 1]))

Accuracy and ROC AUC score on the training set
--------------------------------------------------
1.0
1.0


In [42]:
# Embed the held-out resumes and evaluate the classifier fitted above on them.
test_df = get_embedding_dataset(pd.read_csv('data/resumes_test.csv'))
y_test = test_df['is_data_scientist']
X_test = test_df.drop('is_data_scientist', axis=1)

print("Accuracy and ROC AUC score on the test set", "-" * 50, sep="\n")
print(clf.score(X_test, y_test))
# predict_proba column 1 is the probability of the positive (True) class.
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

Accuracy and ROC AUC score on the test set
--------------------------------------------------
0.96
0.995


The dataset used here is __synthetic__, so the model may be overfitting to its artificial patterns and might not perform well on real-world data. Additionally, the dataset is very small __(100 records)__, so the model might not generalize well.