In [5]:
# Import necessary libraries
import pandas as pd  # For loading and working with CSV data (our resume dataset)
import nltk  # Natural Language Toolkit - main NLP library
from nltk.corpus import stopwords  # To access list of common stop words (the, is, a, etc.)
from nltk.tokenize import word_tokenize  # Function to split text into words (tokenization)
import string  # To access punctuation marks (!@#$%^&*().,;:'")
import re  # Regular expressions - for pattern matching and text cleaning

# Download NLTK data packages (run once to download to your computer)
nltk.download('punkt')  # Downloads tokenization rules (how to split sentences/words)
nltk.download('stopwords')  # Downloads list of stop words in multiple languages
nltk.download('wordnet')  # Downloads word meanings database (for lemmatization later)
# Download the missing punkt_tab data
import nltk
nltk.download('punkt_tab')

print("✅ Libraries imported successfully!")  # Confirmation message that all imports worked

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jayaprakash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jayaprakash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jayaprakash/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jayaprakash/nltk_data...


✅ Libraries imported successfully!


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [6]:
# FUNCTION 1: Clean text (lowercase + remove special characters + remove extra spaces)
def clean_text(text):
    """
    Cleans resume text by:
    1. Converting to lowercase
    2. Removing special characters and numbers
    3. Removing extra spaces
    """
    # Convert entire text to lowercase (Python → python)
    text = text.lower()
    
    # Remove special characters and numbers using regex
    # [^a-z\s] means "keep only letters (a-z) and spaces (\s)"
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Remove extra spaces (multiple spaces become single space)
    text = re.sub(r'\s+', ' ', text)
    
    # Remove leading/trailing spaces
    text = text.strip()
    
    return text  # Return the cleaned text

# Test the function with a sample
sample_text = "Python Developer with 5+ years experience! Email: test@gmail.com"
cleaned = clean_text(sample_text)
print("Original:", sample_text)
print("Cleaned:", cleaned)

Original: Python Developer with 5+ years experience! Email: test@gmail.com
Cleaned: python developer with years experience email testgmailcom


In [7]:
# FUNCTION 2: Remove stop words (common words like the, is, a, with, etc.)
def remove_stop_words(text):
    """
    Removes stop words from text:
    1. Tokenizes (splits into words)
    2. Filters out stop words
    3. Joins back into sentence
    """
    # Get English stop words from NLTK (the, is, a, an, with, etc.)
    stop_words = set(stopwords.words('english'))
    
    # Split text into individual words (tokenization)
    words = text.split()
    
    # Keep only words that are NOT in stop words list
    # List comprehension: [word for word in words if condition]
    filtered_words = [word for word in words if word not in stop_words]
    
    # Join words back into a sentence with spaces
    return ' '.join(filtered_words)

# Test the function
sample_text = "python developer with years experience email testgmailcom"
filtered = remove_stop_words(sample_text)
print("Before:", sample_text)
print("After removing stop words:", filtered)

Before: python developer with years experience email testgmailcom
After removing stop words: python developer years experience email testgmailcom


In [8]:
# FUNCTION 3: Tokenize text (split into list of words)
def tokenize_text(text):
    """
    Splits text into individual words (tokens)
    Returns a list of words
    """
    # Use NLTK's word_tokenize to split text into words
    # More sophisticated than just .split() - handles punctuation better
    tokens = word_tokenize(text)
    
    return tokens  # Returns list of words

# Test the function
sample_text = "python developer years experience email testgmailcom"
tokens = tokenize_text(sample_text)
print("Original text:", sample_text)
print("Tokenized (list of words):", tokens)
print(f"Number of tokens: {len(tokens)}")

Original text: python developer years experience email testgmailcom
Tokenized (list of words): ['python', 'developer', 'years', 'experience', 'email', 'testgmailcom']
Number of tokens: 6


In [9]:
# Load the resume dataset that we explored in Day 4
df = pd.read_csv('Resume.csv')  # Read CSV file from current folder into pandas dataframe

# Get one real resume from the dataset to test our functions
sample_resume = df['Resume_str'][0]  # Get first resume (index 0) from Resume_str column

# Show original resume (first 300 characters only so it's not too long)
print("=== ORIGINAL RESUME (first 300 chars) ===")
print(sample_resume[:300])  # [:300] means show characters 0 to 299
print("\n" + "="*50 + "\n")  # Print blank line and separator line

# STEP 1: Apply clean_text function (lowercase + remove special chars)
print("STEP 1: Clean text")
cleaned = clean_text(sample_resume)  # Call our function from earlier
print(cleaned[:200])  # Show first 200 characters of cleaned text

print("\n" + "="*50 + "\n")  # Separator

# STEP 2: Apply remove_stop_words function (remove the, is, a, etc.)
print("STEP 2: Remove stop words")
no_stop_words = remove_stop_words(cleaned)  # Remove common meaningless words
print(no_stop_words[:200])  # Show first 200 characters

print("\n" + "="*50 + "\n")  # Separator

# STEP 3: Apply tokenize_text function (split into list of words)
print("STEP 3: Tokenize")
tokens = tokenize_text(no_stop_words)  # Convert sentence to list of words
print("First 20 tokens:", tokens[:20])  # Show first 20 words from the list
print(f"Total tokens: {len(tokens)}")  # Count how many words total

=== ORIGINAL RESUME (first 300 chars) ===
         HR ADMINISTRATOR/MARKETING ASSOCIATE

HR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commit


STEP 1: Clean text
hr administratormarketing associate hr administrator summary dedicated customer service manager with years of experience in hospitality and customer service management respected builder and leader of 


STEP 2: Remove stop words
hr administratormarketing associate hr administrator summary dedicated customer service manager years experience hospitality customer service management respected builder leader customerfocused teams 


STEP 3: Tokenize
First 20 tokens: ['hr', 'administratormarketing', 'associate', 'hr', 'administrator', 'summary', 'dedicated', 'customer', 'service', 'manager', 'years', 'experience', 'hospitality', 'cust

In [10]:
# COMBINED FUNCTION: Does all preprocessing steps at once
def preprocess_resume(text):
    """
    Complete preprocessing pipeline for resume text:
    1. Clean text (lowercase + remove special chars)
    2. Remove stop words (common words like the, is, a)
    3. Tokenize (split into list of words)
    
    Input: Raw resume text (string)
    Output: List of cleaned words (list)
    """
    # Step 1: Clean the text
    text = clean_text(text)
    
    # Step 2: Remove stop words
    text = remove_stop_words(text)
    
    # Step 3: Tokenize into list of words
    tokens = tokenize_text(text)
    
    return tokens  # Return final list of cleaned words

# Test the combined function on a sample resume
sample = df['Resume_str'][5]  # Get resume at index 5
processed = preprocess_resume(sample)  # Apply all steps at once

print("Original resume length:", len(sample), "characters")
print("Processed tokens:", len(processed), "words")
print("\nFirst 30 tokens:", processed[:30])

Original resume length: 5480 characters
Processed tokens: 488 words

First 30 tokens: ['hr', 'generalist', 'summary', 'dedicated', 'focused', 'administrative', 'assistant', 'excels', 'prioritizing', 'completing', 'multiple', 'tasks', 'simultaneously', 'following', 'achieve', 'project', 'goals', 'seeking', 'role', 'increased', 'responsibility', 'authority', 'highlights', 'microsoft', 'office', 'proficiency', 'excel', 'spreadsheets', 'meticulous', 'attention']


In [11]:
# Save all preprocessing functions to a Python file
# This creates preprocessing.py in your project folder

code = '''
# preprocessing.py
# Text preprocessing functions for Resume Analyzer project

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# FUNCTION 1: Clean text
def clean_text(text):
    """Converts to lowercase and removes special characters"""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\\s]', '', text)  # Keep only letters and spaces
    text = re.sub(r'\\s+', ' ', text)  # Remove extra spaces
    text = text.strip()  # Remove leading/trailing spaces
    return text

# FUNCTION 2: Remove stop words
def remove_stop_words(text):
    """Removes common words like the, is, a, with"""
    stop_words = set(stopwords.words('english'))  # Get English stop words
    words = text.split()  # Split into words
    filtered_words = [word for word in words if word not in stop_words]  # Filter
    return ' '.join(filtered_words)  # Join back

# FUNCTION 3: Tokenize text
def tokenize_text(text):
    """Splits text into list of words"""
    tokens = word_tokenize(text)  # Tokenize using NLTK
    return tokens

# COMBINED FUNCTION: Complete preprocessing
def preprocess_resume(text):
    """
    Complete preprocessing pipeline:
    1. Clean text
    2. Remove stop words
    3. Tokenize
    Returns: List of cleaned words
    """
    text = clean_text(text)  # Step 1
    text = remove_stop_words(text)  # Step 2
    tokens = tokenize_text(text)  # Step 3
    return tokens
'''

# Write to file
with open('preprocessing.py', 'w') as f:
    f.write(code)

print("✅ preprocessing.py file created successfully!")
print("✅ All functions saved!")

✅ preprocessing.py file created successfully!
✅ All functions saved!
