In [1]:
import pandas as pd
import re

# Load the resume dataset from 'Resume.csv' (a path relative to the current
# working directory) into the DataFrame used by the cells below.
df = pd.read_csv('Resume.csv')

def calculate_quality_score(resume_text):
    """Score a resume's raw text from 0 to 100 using rule-based heuristics.

    Points are awarded as follows (maximum 100):
      * Word count: 20 for 300-600 words, 10 for more than 600, 0 otherwise.
      * Contact details: 15 for an email address, 15 for a phone number.
      * Links: 10 each for a mention of "linkedin" and "github".
      * Sections: 10 each for "education", "experience" and "skills".

    Link and section checks are case-insensitive substring matches.

    Args:
        resume_text: The resume content. Non-string values are converted
            with ``str``; ``None`` and float NaN are treated as empty text.

    Returns:
        int: The total score, capped at 100.
    """
    # Treat missing values as empty text instead of scoring the literal
    # strings "None" / "nan". NaN is the only float not equal to itself.
    is_missing = resume_text is None or (
        isinstance(resume_text, float) and resume_text != resume_text
    )
    raw_text = "" if is_missing else str(resume_text)
    lowered = raw_text.lower()  # computed once for all keyword checks
    score = 0

    # 1. Word count (optimal: 300-600 words; longer resumes get partial credit)
    word_count = len(raw_text.split())
    if 300 <= word_count <= 600:
        score += 20
    elif word_count > 600:
        score += 10

    # 2. Email. The TLD class is [A-Za-z]: inside a character class '|' is a
    #    literal pipe, not alternation, so it must not appear there.
    if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', raw_text):
        score += 15

    # 3. Phone: optional "+" country code, then 3-3-4 digit groups that may be
    #    separated by '-', '.' or whitespace, with optional parentheses around
    #    the first group.
    if re.search(r'(\+\d{1,3})?\s?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', raw_text):
        score += 15

    # 4. Links and 5. standard sections: each keyword found is worth 10 points.
    keywords = ('linkedin', 'github', 'education', 'experience', 'skills')
    score += 10 * sum(keyword in lowered for keyword in keywords)

    return min(score, 100)

In [2]:
# Preview the first five rows (the default for head()) to check what was loaded.
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [3]:
# Score every resume: apply calculate_quality_score to each value of the
# 'Resume_str' column and store the result in a new 'quality_score' column.
df['quality_score'] = df['Resume_str'].apply(calculate_quality_score)

# Assign a label based on the score
def assign_label(score):
    """Map a numeric quality score to "Good", "Average" or "Poor".

    Scores of 75 or more are "Good", scores of 45 or more are "Average",
    and everything else is "Poor".
    """
    # Cut-offs are checked from highest to lowest; the first one met wins.
    cutoffs = ((75, "Good"), (45, "Average"))
    for minimum, name in cutoffs:
        if score >= minimum:
            return name
    return "Poor"

# Derive the categorical label for each row by passing its quality score
# through assign_label, storing the result in a new 'label' column.
df['label'] = df['quality_score'].apply(assign_label)

# Write the scored and labelled frame to 'labeled_resumes.csv';
# index=False leaves the DataFrame index out of the file.
df.to_csv('labeled_resumes.csv', index=False)
print("Success! 'labeled_resumes.csv' is ready for Day 9.")

Success! 'labeled_resumes.csv' is ready for Day 9.


In [None]:
# Annotated reference copy of the labeling pipeline, kept disabled as an inert
# string literal. It is wrapped in a raw '''...''' literal because the inner
# docstring uses """, which would otherwise terminate the outer string early;
# the raw prefix keeps the regex backslashes from being read as escapes.
r'''
import pandas as pd # Used to handle the dataset in a table format
import re # Regular Expression library to search for patterns like emails/phones

# 1. LOAD THE DATA
# We load the original dataset to start the labeling process
df = pd.read_csv('Resume.csv')

def calculate_quality_score(resume_text):
    """
    This function acts as the 'Teacher'. It looks for specific features 
    and assigns a score from 0 to 100.
    """
    score = 0
    text = str(resume_text).lower() # Convert to lowercase for easier searching
    
    # 2. WORD COUNT RULE (Max 20 pts)
    # Why: Recruiters prefer resumes between 300-600 words. Too short is 
    # lazy; too long is hard to read.
    word_count = len(text.split())
    if 300 <= word_count <= 600:
        score += 20 # Perfect length
    elif word_count > 600:
        score += 10 # A bit too long
    
    # 3. CONTACT INFORMATION (Max 30 pts)
    # Why: If a recruiter can't contact you, the resume is useless.
    # We use regex to find email and phone patterns.
    has_email = 1 if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text) else 0
    score += has_email * 15
    
    has_phone = 1 if re.search(r'(\+\d{1,3})?\s?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text) else 0
    score += has_phone * 15
    
    # 4. PROFESSIONAL LINKS (Max 20 pts)
    # Why: LinkedIn and GitHub prove your professional identity and work.
    if 'linkedin' in text: score += 10
    if 'github' in text: score += 10
    
    # 5. SECTION CHECK (Max 30 pts)
    # Why: Standard ATS (Applicant Tracking Systems) look for these headers 
    # to organize your data.
    sections = ['education', 'experience', 'skills']
    for section in sections:
        if section in text:
            score += 10 # +10 for each standard section found
            
    return min(score, 100) # Ensure the score never exceeds 100

# 6. APPLY AND CATEGORIZE
# We create the numerical score first
df['quality_score'] = df['Resume_str'].apply(calculate_quality_score)

# Why: Machine Learning models work best with categories (Classification).
# We group the scores into 3 simple labels.
def assign_label(score):
    if score >= 75: return "Good"     # High quality, ready for hire
    elif score >= 45: return "Average"  # Needs some improvement
    else: return "Poor"                # Missing too much information

# Create the final label column the AI will learn from
df['label'] = df['quality_score'].apply(assign_label)

# 7. SAVE THE WORK
# We save this as a new file so we don't overwrite our original data.
df.to_csv('labeled_resumes.csv', index=False)

print("Process finished! Your AI Training Data is ready in 'labeled_resumes.csv'.")
'''