In [None]:
from datasets import load_dataset

dataset = load_dataset("AzharAli05/Resume-Screening-Dataset")

dataset.save_to_disk("Dataset/Resume_screening_dataset")


In [45]:

import pandas as pd

df = pd.read_parquet(
    path="Dataset/resume-screening-dataset.parquet"
)

df['Decision'].unique()

array(['reject', 'select'], dtype=object)

In [46]:
df.head()

Unnamed: 0,Role,Resume,Decision,Reason_for_decision,Job_Description
0,E-commerce Specialist,Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...
1,Game Developer,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...
2,Human Resources Specialist,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...
3,E-commerce Specialist,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...
4,E-commerce Specialist,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...


In [47]:
# Take only relevant columns 'Decision -> target' , 'Resume' , 'Job_Description'

df = df[['Resume' , 'Decision' , 'Job_Description']]

df.head()

Unnamed: 0,Resume,Decision,Job_Description
0,Here's a professional resume for Jason Jones:\...,reject,Be part of a passionate team at the forefront ...
1,Here's a professional resume for Ann Marshall:...,select,Help us build the next-generation products as ...
2,Here's a professional resume for Patrick Mccla...,reject,We need a Human Resources Specialist to enhanc...
3,Here's a professional resume for Patricia Gray...,select,Be part of a passionate team at the forefront ...
4,Here's a professional resume for Amanda Gross:...,reject,We are looking for an experienced E-commerce S...


In [None]:
# import nltk
# nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[

True

In [49]:



import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import STOPWORDS

lemmatizer = WordNetLemmatizer()

punctuations = string.punctuation
translator = str.maketrans('', '', string.punctuation)

def preprocess_text(text :str):
    text = text.lower()    

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove punctuations
    text = text.translate(translator)

    # Tokenize the sentence
    tokenized_words = word_tokenize(text)


    cleaned_words = [lemmatizer.lemmatize(word) for word in tokenized_words if word not in STOPWORDS]


    return ' '.join(cleaned_words)


df[['Resume', 'Job_Description']] = df[['Resume', 'Job_Description']].map(preprocess_text)


df['Decision'] = df['Decision'].apply(lambda x : str(x).lower())

In [50]:
# Remove the most frequent words

from collections import Counter




resume_words = []
job_description_words = []

for resume_sentence , job_description_sentence in zip(df['Resume'] , df['Job_Description']):

    resume_words.extend(word_tokenize(resume_sentence))
    job_description_words.extend(word_tokenize(job_description_sentence))


resume_common_words = Counter(resume_words).most_common(n=10)
job_description_common_words = Counter(job_description_words).most_common(n=10)


def remove_common_words(text, common_words):
    # Tokenize the text and remove common words
    tokenized_words = word_tokenize(text.lower())
    cleaned_words = [word for word in tokenized_words if word not in common_words and word.isalpha()]
    return ' '.join(cleaned_words)


df['Resume'] = df['Resume'].apply(lambda x: remove_common_words(x, resume_common_words))
df['Job_Description'] = df['Job_Description'].apply(lambda x: remove_common_words(x, job_description_common_words))

In [None]:
# Convert the target column decision to numbers 


from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

df['Decision'] = label_encoder.fit_transform(df['Decision'])

print(df[['Decision']].head())  # Check the encoded labels


   Decision
0         0
1         1
2         0
3         1
4         0


In [53]:
cleaned_df = df.copy()

cleaned_df.head()

Unnamed: 0,Resume,Decision,Job_Description
0,here professional resume jason jones jason jon...,0,part passionate team forefront machine learnin...
1,here professional resume ann marshall ann mars...,1,help u build nextgeneration product game devel...
2,here professional resume patrick mcclain patri...,0,need human resource specialist enhance team te...
3,here professional resume patricia gray patrici...,1,part passionate team forefront cloud computing...
4,here professional resume amanda gross amanda g...,0,looking experienced ecommerce specialist join ...


In [54]:
cleaned_df.to_csv("Dataset/cleaned_resume_screening.csv" , index=False)

In [55]:
from sklearn.model_selection import train_test_split


X = cleaned_df.drop("Decision" , axis=1)
y = cleaned_df['Decision']



X_train , X_test , y_train , y_test = train_test_split(X , y , test_size= 0.3)


print(X_train.shape)
print(y_train.shape)

(7121, 2)
(7121,)


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(
    max_features=1000,
    strip_accents="unicode", # Strips accents like Ã© -> e
)


X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)


In [44]:
from sklearn.linear_model import SGDClassifier


sgd_classifier = SGDClassifier(
    loss='log_loss',  # Logistic regression (log loss)
    penalty='l2',  # Regularization
    max_iter=1000,  # Max number of iterations
    tol=1e-3,  # Stopping criterion
    random_state=42,
)

sgd_classifier.partial_fit(X_train_vectors, y_train, classes=y.unique())  # Initial fit on the training data



ValueError: Found input variables with inconsistent numbers of samples: [2, 7121]