In [74]:
# library imports
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score

In [52]:
# Load the resume/job-description dataset from data.csv (path is relative to
# the working directory). Later cells read its 'resume_text',
# 'job_description_text' and 'label' columns.
resume_match_df = pd.read_csv("data.csv")

In [17]:
# Load the spaCy "en_core_web_sm" pipeline once; tokenize_text() reuses this
# shared `nlp` object for every document.
nlp = spacy.load("en_core_web_sm")

In [18]:
def tokenize_text(text):
    """
    tokenize_text(text)
    Tokenize a resume or job-description text into lemmas.
    The text is lower-cased and run through the module-level spaCy pipeline
    `nlp`; only alphabetic tokens that are not stop words are kept, and each
    kept token is replaced by its lemma for a standardized set of words.
    text(string): the text to be tokenized; non-string values (e.g. None, or
        the NaN pandas uses for an empty CSV cell) are treated as empty text
    returns: list of lemma strings generated from the text passed in as an arg
        (an empty list for missing or blank text)
    """
    # Missing cells arrive as float NaN / None, which have no .lower(); treat
    # them like an empty document instead of crashing part-way through a long
    # tokenization run. Blank strings produce no alphabetic tokens anyway, so
    # skipping the pipeline for them does not change the result.
    if not isinstance(text, str) or not text.strip():
        return []
    doc = nlp(text.lower())
    return [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

In [None]:
# Tokenize every resume and its paired job description. Each list receives one
# token list per row of resume_match_df, in row order.
# The two columns are iterated directly: indexing with .iloc[i][...] would
# build a throwaway row Series twice per iteration.
row_pairs = zip(resume_match_df['resume_text'], resume_match_df['job_description_text'])
preprocessed_resume_text = []
preprocessed_jd_text = []
for resume_text, jd_text in tqdm(row_pairs, total=len(resume_match_df)):
    preprocessed_resume_text.append(tokenize_text(resume_text))
    preprocessed_jd_text.append(tokenize_text(jd_text))

 99%|████████████████████████████████████████████████████████████████████████████████████████████▋ | 7883/8000 [19:31<00:23,  5.09it/s]

In [34]:
# Re-join each document's token list into one space-separated string, which is
# the input form the vectorizer in the next section consumes.
resume_texts = list(map(" ".join, tqdm(preprocessed_resume_text)))
jd_texts = list(map(" ".join, tqdm(preprocessed_jd_text)))
'''
combine the resume and job description texts before count vectorizing
In each row we represent the tokens as values counts for the number of 
times they appear in the document
'''
# Resumes come first, then job descriptions; later slicing relies on this order.
corpus = [*resume_texts, *jd_texts]

100%|███████████████████████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 26771.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 126181.01it/s]


## Vectorization using the Bag of Words Vectorizer

In [40]:
# Fit a single vocabulary over resumes and job descriptions together so both
# halves of the matrix share the same columns.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)

# corpus holds the resumes first, so the first len(resume_texts) rows are the
# resumes and the remaining rows are the job descriptions.
# NOTE(review): .toarray() materialises dense arrays; memory use grows with
# rows x vocabulary size.
n_resumes = len(resume_texts)
resume_bow = bow_matrix[:n_resumes].toarray()
jd_bow = bow_matrix[n_resumes:].toarray()

In [42]:
# Put each resume's word counts side by side with those of its paired job
# description, so every row is the numeric representation of one resume/JD pair.
X = np.concatenate((resume_bow, jd_bow), axis=1)
# The target for each pair is the job-match decision in the 'label' column.
y = resume_match_df['label'].values

### Run classification of the data using various algorithms

In [80]:
def classify_k_fold_x_validation(model, x_data, y_data, folds, shuffle=False, random_state=None):
    """
    classify_k_fold_x_validation(model, x_data, y_data, folds, shuffle=False, random_state=None)
    Scores a classifier with k-fold cross validation.
    model(callable): zero-argument callable (e.g. an estimator class) that
        returns an object exposing fit(X, y) and score(X, y)
    x_data: feature rows; must support indexing with an integer index array
    y_data: targets aligned row-for-row with x_data; same indexing requirement
    folds(int): number of folds to split the data into
    shuffle(bool): shuffle the rows before splitting; the default False keeps
        the original behaviour of contiguous folds in the existing row order
    random_state: seed for the shuffle; only used when shuffle is True
    returns: list with one classifier.score(...) value per fold, in fold order
        (for scikit-learn classifiers this is the mean accuracy on the fold)
    """
    # KFold raises if a random_state is given while shuffle is off, so only
    # forward the seed when it actually applies.
    k_fold = KFold(n_splits=folds, shuffle=shuffle,
                   random_state=random_state if shuffle else None)
    accuracy_scores = []
    for train_idx, test_idx in k_fold.split(x_data):
        # Fresh, unfitted estimator per fold so that no state from a previous
        # split can carry over into this one.
        classifier = model()
        classifier.fit(x_data[train_idx], y_data[train_idx])
        accuracy_scores.append(classifier.score(x_data[test_idx], y_data[test_idx]))

    return accuracy_scores

In [82]:
def model_validation(x_data, y_data, folds=10):
    """
    model_validation(x_data, y_data, folds=10)
    Validates performance of various models.
    Runs classify_k_fold_x_validation for each of five classifiers
    (K-Nearest Neighbors, Decision Tree, Random Forest, Support Vector
    Classifier, Gaussian Naive Bayes), each built with its default settings,
    then draws one box plot of the per-fold scores per classifier.
    x_data: feature rows passed through to classify_k_fold_x_validation
    y_data: targets passed through to classify_k_fold_x_validation
    folds(int): number of cross-validation folds (default 10)
    returns: None; the result is the displayed figure
    """
    # Map display names straight to the estimator classes. Looking the classes
    # up through eval() on their names adds nothing and hides the dependency.
    models = {'K-Nearest Neighbors': KNeighborsClassifier,
              'Decision Tree': DecisionTreeClassifier,
              'Random Forest': RandomForestClassifier,
              'Support Vector Classifier': SVC,
              'Gaussian Naive Bayes': GaussianNB}
    model_performance = {}
    for model_name, model_class in models.items():
        model_performance[model_name] = classify_k_fold_x_validation(
            model_class, x_data, y_data, folds)

    # NOTE(review): `plt` is not imported in the notebook's import cell; make
    # sure `import matplotlib.pyplot as plt` has run before calling this,
    # otherwise the NameError only surfaces after all the cross validation.
    plt.figure(figsize=(12, 6))
    plt.boxplot(list(model_performance.values()), labels=list(model_performance.keys()))
    plt.title('Classification Performance Comparison with K-Fold Cross Validation')
    plt.xlabel('Classifier Model')
    plt.ylabel('Accuracy Scores')
    plt.show()

In [None]:
# Compare the five classifiers with 10-fold cross validation on the stacked
# resume/JD features and show the box plot of their per-fold scores.
model_validation(X, y, folds=10)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a 100-tree random forest on the training portion (fit returns the estimator itself).
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = clf.predict(X_test)

# Report the share of held-out predictions that match the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")