Imports + Load Files

In [12]:
import pandas as pd
import re

# Labeled resume/JD pairs; each row already carries a similarity score and a label
df = pd.read_csv("../data/labeled_resume_jd_dataset.csv")

# Source tables that hold the cleaned resume text and the job-description text
resumes = pd.read_csv("../data/resumes_cleaned.csv")
jds = pd.read_csv("../data/job_descriptions.csv")

# Lookup: first 100 characters of the raw resume followed by "..." -> cleaned resume text.
# When two resumes produce the same key, the later row overwrites the earlier one.
resumes_map = {
    raw_resume[:100] + "...": cleaned_resume
    for raw_resume, cleaned_resume in zip(resumes['Resume'], resumes['Cleaned_Resume'])
}

# Lookup: "<Category>_<row index>" -> job-description text
jds_map = dict(zip(
    jds['Category'] + "_" + jds.index.astype(str),
    jds['JD_Text'],
))

# Attach the full texts to every labeled pair; keys absent from a lookup yield NaN
df['Resume_Text'] = df['Resume Preview'].map(resumes_map)
df['JD_Text'] = df['JD_Role'].map(jds_map)

df.head(2)



Unnamed: 0,Resume Preview,JD_Role,Similarity_Score,Label,Resume_Text,JD_Text
0,"TECHNICAL SKILLS Skills: Java, SQL, PL/SQL, C,...",Java Developer_0,0.6398,1,technical skill skill java sql sql bootstr...,**Job Title**: Java Backend Developer\n**Locat...
1,"TECHNICAL SKILLS Skills: Java, SQL, PL/SQL, C,...",Java Developer_0,0.6398,1,technical skill skill java sql sql bootstr...,**Job Title**: Java Backend Developer\n**Locat...


Add resume_length Feature

In [13]:
# Resume word count (number of whitespace-separated tokens).
# A missing Resume_Text is scored as 0 words: str(NaN) is the literal "nan",
# which would otherwise be counted as a one-word resume.
df['resume_length'] = df['Resume_Text'].apply(
    lambda text: 0 if pd.isna(text) else len(str(text).split())
)


Add skill_match_ratio and num_skills_matched

In [14]:
def skill_match_stats(jd_text, resume_text):
    """Measure how much of a job description's vocabulary appears in a resume.

    Both texts are lower-cased and reduced to sets of unique tokens. A token
    starts with a letter or digit and may continue with letters, digits, ``+``
    or ``#`` (so ``c++`` and ``c#`` survive). Extracting tokens this way drops
    markdown markers and punctuation (``**Job``, ``Title**:``, ``java,``) that
    plain whitespace splitting leaves glued to the word, where they can never
    equal a bare resume token.

    Args:
        jd_text: Job-description text. ``None``/NaN is treated as empty text.
        resume_text: Resume text. ``None``/NaN is treated as empty text.

    Returns:
        tuple: ``(ratio, count)``. ``count`` is the number of distinct JD
        tokens that also occur in the resume; ``ratio`` is ``count`` divided
        by the number of distinct JD tokens. ``(0.0, 0)`` is returned when the
        job description yields no tokens.
    """
    def _tokens(text):
        # Missing values must not become the literal keyword "nan".
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return set()
        # [^\W_] = any Unicode letter or digit (word character minus underscore).
        return set(re.findall(r"[^\W_](?:[^\W_]|[+#])*", str(text).lower()))

    jd_keywords = _tokens(jd_text)
    if not jd_keywords:
        return (0.0, 0)

    matched = jd_keywords & _tokens(resume_text)
    return (len(matched) / len(jd_keywords), len(matched))

# Score every (JD, resume) pair once, then attach both statistics as columns.
# Building a single DataFrame from the list of result tuples avoids creating a
# pd.Series per row and keeps num_skills_matched as an integer count instead of
# coercing it to float; index=df.index keeps the values aligned with df's rows.
skill_stats = pd.DataFrame(
    [
        skill_match_stats(jd_text, resume_text)
        for jd_text, resume_text in zip(df['JD_Text'], df['Resume_Text'])
    ],
    columns=['skill_match_ratio', 'num_skills_matched'],
    index=df.index,
)
df[['skill_match_ratio', 'num_skills_matched']] = skill_stats


Extract years_of_experience (simple regex-based)

In [15]:
def extract_experience(text, max_years=50):
    """Return the largest plausible "N years" figure mentioned in ``text``.

    Recognised forms include ``5 years``, ``5 year``, ``5+ years``, ``5-year``,
    ``5years`` and ``5 yrs``. Decimal figures such as ``1.5 years`` are read as
    a whole number and truncated (``1``) rather than picking up only the
    digits after the decimal point. The unit must end at a word boundary, so
    words like ``yearly`` are not counted.

    Args:
        text: Text to scan. It is converted with ``str()``, so ``None``/NaN
            simply produce no match.
        max_years: Exclusive upper bound; figures at or above it are ignored
            to avoid false hits like "100 years".

    Returns:
        int: The maximum accepted number of years, or ``0`` when none is found.
    """
    # (?<![\d.]) stops a match from starting in the middle of a number.
    # The optional "+" and "-" groups are delimited so they cannot overlap.
    pattern = r'(?<![\d.])(\d+(?:\.\d+)?)\s*(?:\+\s*)?(?:-\s*)?(?:years?|yrs?)\b'
    found = re.findall(pattern, str(text).lower())
    whole_years = (int(float(figure)) for figure in found)
    return max((y for y in whole_years if y < max_years), default=0)

# Derive the experience feature by running the extractor over each resume's text
df['years_of_experience'] = df['Resume_Text'].map(extract_experience)




Final Feature Set + Save

In [17]:

# Columns written to the output dataset: the engineered features and Label
selected_columns = [
    'Similarity_Score',
    'skill_match_ratio',
    'resume_length',
    'years_of_experience',
    'Label',
]
final_features = df[selected_columns]

# Persist without the row index so the CSV contains only the selected columns
final_features.to_csv("../data/final_features_dataset.csv", index=False)
print("✅ Final feature dataset saved.")



✅ Final feature dataset saved.
