In [1]:
# Import necessary libraries for feature extraction
import pandas as pd  # For loading CSV dataset
import re  # Regular expressions for pattern matching (emails, phones, URLs)
from collections import Counter  # For counting word frequencies

# Load the resume dataset
df = pd.read_csv('Resume.csv')  # Load our dataset from Day 4

print("‚úÖ Libraries imported!")
print(f"‚úÖ Dataset loaded: {len(df)} resumes")

‚úÖ Libraries imported!
‚úÖ Dataset loaded: 2484 resumes


In [2]:
# FUNCTION 1: Count total words in resume
def count_words(text):
    """
    Counts total number of words in resume text
    Simple method: split text by spaces and count
    """
    # Split text by whitespace into list of words
    words = text.split()
    
    # Return the count of words in the list
    return len(words)

# Test the function on first resume
sample = df['Resume_str'][0]  # Get first resume
word_count = count_words(sample)  # Count words

print("Sample resume (first 200 chars):")
print(sample[:200])
print(f"\nTotal words: {word_count}")

Sample resume (first 200 chars):
         HR ADMINISTRATOR/MARKETING ASSOCIATE

HR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Resp

Total words: 674


In [3]:
# FUNCTION 2: Extract email address using regex (pattern matching)
def extract_email(text):
    """
    Finds and extracts email address from resume text
    Uses regex pattern to match email format: something@something.com
    """
    # Regex pattern for email: word characters + @ + word characters + . + domain
    # \w+ means one or more word characters (letters, numbers, underscore)
    # \. means literal dot (escaped because . has special meaning in regex)
    # Example matches: test@gmail.com, john.doe@company.co.in
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    
    # Search for pattern in text
    match = re.search(email_pattern, text)
    
    # If email found, return it; otherwise return None
    if match:
        return match.group()  # Return the matched email string
    else:
        return None  # No email found

# Test on sample resume
sample = df['Resume_str'][0]
email = extract_email(sample)

print("Testing email extraction:")
print(f"Email found: {email}")

# Test on another resume to see if it finds different email
sample2 = df['Resume_str'][10]
email2 = extract_email(sample2)
print(f"Email from resume 10: {email2}")

Testing email extraction:
Email found: None
Email from resume 10: None


In [4]:
# Test email extraction with sample text that has an email
test_text = "Contact me at john.doe@gmail.com or call 123-456-7890"
email = extract_email(test_text)

print("Test text:", test_text)
print(f"Extracted email: {email}")
print("\n‚úÖ Email extraction function WORKS!")
print("(The actual resumes just don't have emails - probably removed for privacy)")

Test text: Contact me at john.doe@gmail.com or call 123-456-7890
Extracted email: john.doe@gmail.com

‚úÖ Email extraction function WORKS!
(The actual resumes just don't have emails - probably removed for privacy)


In [5]:
# FUNCTION 3: Extract phone number using regex
def extract_phone(text):
    """
    Finds and extracts phone number from resume text
    Matches common phone formats:
    - 123-456-7890
    - (123) 456-7890
    - 123.456.7890
    - 1234567890
    """
    # Regex pattern for phone numbers (US format)
    # \d means digit (0-9)
    # {3} means exactly 3 digits
    # [-.\s]? means optional separator (dash, dot, or space)
    phone_pattern = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    
    # Search for pattern in text
    match = re.search(phone_pattern, text)
    
    # If phone found, return it; otherwise return None
    if match:
        return match.group()  # Return the matched phone string
    else:
        return None

# Test with sample text that has phone number
test_text = "Contact me at john.doe@gmail.com or call 123-456-7890"
phone = extract_phone(test_text)

print("Test text:", test_text)
print(f"Extracted phone: {phone}")
print("\n‚úÖ Phone extraction function WORKS!")

# Also test other formats
test_formats = [
    "Call me at (555) 123-4567",
    "Phone: 555.123.4567",
    "Mobile 5551234567"
]

for text in test_formats:
    phone = extract_phone(text)
    print(f"{text} ‚Üí {phone}")

Test text: Contact me at john.doe@gmail.com or call 123-456-7890
Extracted phone: 123-456-7890

‚úÖ Phone extraction function WORKS!
Call me at (555) 123-4567 ‚Üí (555) 123-4567
Phone: 555.123.4567 ‚Üí 555.123.4567
Mobile 5551234567 ‚Üí 5551234567


In [6]:
# FUNCTION 4: Check for LinkedIn and GitHub URLs
def extract_urls(text):
    """
    Finds LinkedIn and GitHub profile URLs in resume
    Returns a dictionary with both URLs (or None if not found)
    """
    # Convert text to lowercase for easier matching
    text_lower = text.lower()
    
    # Check for LinkedIn URL
    # Pattern matches: linkedin.com/in/username or linkedin.com/profile/...
    linkedin_pattern = r'linkedin\.com/\S+'
    linkedin_match = re.search(linkedin_pattern, text_lower)
    linkedin = linkedin_match.group() if linkedin_match else None
    
    # Check for GitHub URL
    # Pattern matches: github.com/username
    github_pattern = r'github\.com/\S+'
    github_match = re.search(github_pattern, text_lower)
    github = github_match.group() if github_match else None
    
    # Return dictionary with both URLs
    return {
        'linkedin': linkedin,
        'github': github
    }

# Test with sample text containing URLs
test_text = """
John Doe
Software Engineer
LinkedIn: linkedin.com/in/johndoe
GitHub: github.com/johndoe
Email: john@example.com
"""

urls = extract_urls(test_text)
print("Test text has:")
print(f"LinkedIn: {urls['linkedin']}")
print(f"GitHub: {urls['github']}")

# Test on actual resume
sample = df['Resume_str'][0]
urls_real = extract_urls(sample)
print(f"\nActual resume has:")
print(f"LinkedIn: {urls_real['linkedin']}")
print(f"GitHub: {urls_real['github']}")

Test text has:
LinkedIn: linkedin.com/in/johndoe
GitHub: github.com/johndoe

Actual resume has:
LinkedIn: None
GitHub: None


In [7]:
# FUNCTION 5: Count resume sections (Education, Experience, Skills, etc.)
def count_sections(text):
    """
    Counts how many common resume sections are present
    Looks for keywords like: Education, Experience, Skills, Projects, etc.
    More sections = more complete resume
    """
    # Convert to lowercase for easier matching
    text_lower = text.lower()
    
    # List of common resume section keywords
    # These are section headers people use in resumes
    sections = [
        'education',       # Education section
        'experience',      # Work experience
        'skills',          # Technical/soft skills
        'projects',        # Projects section
        'summary',         # Professional summary
        'objective',       # Career objective
        'certifications',  # Certifications/licenses
        'achievements',    # Achievements/awards
        'languages',       # Programming/spoken languages
        'publications'     # Research publications
    ]
    
    # Count how many sections are found in the resume
    section_count = 0
    found_sections = []  # Keep track of which sections were found
    
    for section in sections:
        if section in text_lower:  # Check if section keyword exists in text
            section_count += 1
            found_sections.append(section)  # Add to found list
    
    return {
        'count': section_count,  # Total number of sections found
        'sections': found_sections  # List of which sections were found
    }

# Test on first resume
sample = df['Resume_str'][0]
sections = count_sections(sample)

print("Resume 0 Analysis:")
print(f"Total sections found: {sections['count']}")
print(f"Sections present: {sections['sections']}")

# Test on another resume
sample2 = df['Resume_str'][5]
sections2 = count_sections(sample2)

print(f"\nResume 5 Analysis:")
print(f"Total sections found: {sections2['count']}")
print(f"Sections present: {sections2['sections']}")

Resume 0 Analysis:
Total sections found: 5
Sections present: ['education', 'experience', 'skills', 'summary', 'objective']

Resume 5 Analysis:
Total sections found: 5
Sections present: ['education', 'experience', 'skills', 'summary', 'publications']


In [8]:
# FUNCTION 6: Count technical skills keywords
def count_skills(text):
    """
    Counts how many technical skill keywords are mentioned in resume
    Looks for popular programming languages, tools, frameworks
    More skills = more qualified candidate
    """
    # Convert to lowercase for matching
    text_lower = text.lower()
    
    # List of common technical skills to look for
    # Add more skills based on your target job domain
    skill_keywords = [
        'python', 'java', 'javascript', 'c++', 'sql',  # Programming languages
        'machine learning', 'ml', 'deep learning', 'ai', 'artificial intelligence',  # ML/AI
        'data science', 'data analysis', 'statistics',  # Data skills
        'tensorflow', 'pytorch', 'keras', 'scikit-learn',  # ML frameworks
        'pandas', 'numpy', 'matplotlib',  # Python libraries
        'excel', 'powerpoint', 'tableau', 'power bi',  # Tools
        'aws', 'azure', 'cloud', 'docker', 'kubernetes',  # Cloud/DevOps
        'git', 'github', 'agile', 'scrum'  # Development practices
    ]
    
    # Count how many skills are found
    skills_found = []
    
    for skill in skill_keywords:
        if skill in text_lower:  # Check if skill keyword exists in text
            skills_found.append(skill)
    
    return {
        'count': len(skills_found),  # Total number of skills found
        'skills': skills_found  # List of which skills were found
    }

# Test on first resume
sample = df['Resume_str'][0]
skills = count_skills(sample)

print("Resume 0 Skills Analysis:")
print(f"Total skills found: {skills['count']}")
print(f"Skills mentioned: {skills['skills']}")

# Test on IT resume (should have more technical skills)
# Test on IT resume (should have more technical skills)

# This line has 3 parts - let me break it down:

# Part 1: df[df['Category'] == 'INFORMATION-TECHNOLOGY']
# This FILTERS the dataframe to get only IT resumes
# df['Category'] == 'INFORMATION-TECHNOLOGY' creates a True/False mask
# True for IT resumes, False for all others
# df[mask] keeps only the True rows (IT resumes only)

# Part 2: ['Resume_str']
# From the filtered IT resumes, select the 'Resume_str' column
# Now we have just the resume text of IT people

# Part 3: .iloc[0]
# iloc[0] means "get the first row" (index 0)
# So we get the FIRST IT resume's text
it_resume = df[df['Category'] == 'INFORMATION-TECHNOLOGY']['Resume_str'].iloc[0]
skills_it = count_skills(it_resume)

print(f"\nIT Resume Skills Analysis:")
print(f"Total skills found: {skills_it['count']}")
print(f"Skills mentioned: {skills_it['skills']}")

Resume 0 Skills Analysis:
Total skills found: 4
Skills mentioned: ['ai', 'data analysis', 'statistics', 'aws']

IT Resume Skills Analysis:
Total skills found: 3
Skills mentioned: ['ml', 'ai', 'excel']


In [9]:
# FINAL STEP: Test all 6 functions on 5 different resumes

# Create a master function that runs ALL 6 feature extraction functions at once
def analyze_resume(resume_text):
    """
    Complete feature extraction - runs all 6 functions
    Returns a dictionary with all extracted features
    """
    # Return dictionary containing results from all 6 functions
    return {
        'word_count': count_words(resume_text),  # Function 1: Count total words in resume
        'email': extract_email(resume_text),  # Function 2: Find email address using regex
        'phone': extract_phone(resume_text),  # Function 3: Find phone number using regex
        'urls': extract_urls(resume_text),  # Function 4: Find LinkedIn and GitHub URLs
        'sections': count_sections(resume_text),  # Function 5: Count resume sections
        'skills': count_skills(resume_text)  # Function 6: Count technical skills mentioned
    }

# Print separator line (60 equal signs for visual separation)
print("="*60)
# Print header for our analysis section
print("ANALYZING 5 SAMPLE RESUMES")
# Print separator line again
print("="*60)

# Create a list of 5 resume indices we want to test
# We'll test resumes at positions 0, 10, 20, 30, and 40 from our dataset
test_indices = [0, 10, 20, 30, 40]

# Loop through each index in our test_indices list
for idx in test_indices:
    # Get the resume text at this index from the 'Resume_str' column
    resume = df['Resume_str'][idx]
    
    # Get the job category for this resume (HR, IT, Finance, etc.)
    category = df['Category'][idx]
    
    # Run our master analyze_resume function on this resume text
    # This calls all 6 functions at once and stores results in 'features' dictionary
    features = analyze_resume(resume)
    
    # Start displaying the results for this resume
    # Print blank line for spacing
    print(f"\n{'='*60}")
    
    # Print resume number and its job category
    print(f"RESUME {idx} - Category: {category}")
    
    # Print separator line
    print(f"{'='*60}")
    
    # Print word count (access the 'word_count' key from features dictionary)
    print(f"üìù Word Count: {features['word_count']}")
    
    # Print email (will be None if not found)
    print(f"üìß Email: {features['email']}")
    
    # Print phone number (will be None if not found)
    print(f"üì± Phone: {features['phone']}")
    
    # Print LinkedIn URL (access 'linkedin' key inside 'urls' dictionary)
    print(f"üîó LinkedIn: {features['urls']['linkedin']}")
    
    # Print GitHub URL (access 'github' key inside 'urls' dictionary)
    print(f"üíª GitHub: {features['urls']['github']}")
    
    # Print sections count and list of sections found
    # features['sections'] is a dict with 'count' and 'sections' keys
    print(f"üìã Sections: {features['sections']['count']} sections - {features['sections']['sections']}")
    
    # Print skills count and list of skills found
    # features['skills'] is a dict with 'count' and 'skills' keys
    print(f"üéØ Skills: {features['skills']['count']} skills - {features['skills']['skills']}")

# After loop ends, print final completion message
print(f"\n{'='*60}")  # Blank line and separator
print("‚úÖ DAY 6 COMPLETE! All 6 functions tested on 5 resumes!")  # Success message
print(f"{'='*60}")  # Final separator line

ANALYZING 5 SAMPLE RESUMES

RESUME 0 - Category: HR
üìù Word Count: 674
üìß Email: None
üì± Phone: None
üîó LinkedIn: None
üíª GitHub: None
üìã Sections: 5 sections - ['education', 'experience', 'skills', 'summary', 'objective']
üéØ Skills: 4 skills - ['ai', 'data analysis', 'statistics', 'aws']

RESUME 10 - Category: HR
üìù Word Count: 679
üìß Email: None
üì± Phone: None
üîó LinkedIn: None
üíª GitHub: None
üìã Sections: 4 sections - ['education', 'experience', 'skills', 'summary']
üéØ Skills: 2 skills - ['ai', 'aws']

RESUME 20 - Category: HR
üìù Word Count: 620
üìß Email: None
üì± Phone: None
üîó LinkedIn: None
üíª GitHub: None
üìã Sections: 4 sections - ['education', 'experience', 'skills', 'summary']
üéØ Skills: 3 skills - ['ai', 'excel', 'powerpoint']

RESUME 30 - Category: HR
üìù Word Count: 794
üìß Email: None
üì± Phone: None
üîó LinkedIn: None
üíª GitHub: None
üìã Sections: 4 sections - ['education', 'experience', 'skills', 'objective']
üéØ Skills: 2