# Project 3: NLP Resume Information Extractor

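**Task:** Extract relevant information (Name, Email, Phone, Skills) from PDF resumes. The code below currently extracts Email, Phone, and Skills; name extraction is not implemented yet.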

**Techniques:** PDF Text Mining, Regular Expressions (Regex), Phrase Matching.

## 1. Setup & Functions

In [None]:
import pandas as pd
import re
import os
import PyPDF2
import matplotlib.pyplot as plt
import seaborn as sns

def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of a PDF, joined by single spaces.

    Pages for which the reader yields no text are skipped. Any error raised
    while opening or parsing the file is printed rather than propagated, and
    whatever text was collected before the failure is returned (possibly an
    empty string). Leading and trailing whitespace is stripped.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        # Best-effort: one unreadable resume must not abort the whole batch.
        print(f"Error reading {pdf_path}: {e}")
    return " ".join(page_texts).strip()

# Skill vocabulary; detected skills are reported in this order.
_SKILLS_DB = ['Python', 'SQL', 'Java', 'Machine Learning', 'Big Data', 'Cloud', 'Excel', 'C++', 'NLP', 'React']

# Domain must contain at least one dot-separated label, and the match cannot
# end on a '.', so sentence punctuation after an address is not captured.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')

# Candidate phone numbers: optional '+', then digits separated by spaces/hyphens.
_PHONE_RE = re.compile(r'\+?\d[\d -]{8,12}\d')

# Heuristic: candidates with fewer digits than this are rejected, which filters
# out year ranges such as "2015 - 2019" (8 digits) that the pattern also matches.
_MIN_PHONE_DIGITS = 9


def _skill_pattern(skill):
    """Compile a case-insensitive whole-phrase pattern for one skill name."""
    # Words of a multi-word skill may be separated by any whitespace, because
    # PDF text extraction frequently inserts line breaks inside a phrase.
    words = r'\s+'.join(re.escape(word) for word in skill.split())
    # Lookarounds instead of \b so that names ending in punctuation ("C++")
    # still match, while "Java" in "JavaScript" or "Excel" in "Excellent" do not.
    return re.compile(rf'(?<!\w){words}(?!\w)', re.IGNORECASE)


_SKILL_PATTERNS = [(skill, _skill_pattern(skill)) for skill in _SKILLS_DB]


def extract_info(text):
    """Extract the first email, first phone number and known skills from text.

    Args:
        text: Raw resume text. ``None`` or an empty string is treated as
            "nothing found".

    Returns:
        A dict with the keys ``'Email'`` and ``'Phone'`` (first match, or
        ``"N/A"``) and ``'Skills'`` (comma-separated skill names in vocabulary
        order, or ``"None detected"``).
    """
    text = text or ""

    # 1. Email Extraction via Regex
    email_match = _EMAIL_RE.search(text)

    # 2. Phone Extraction via Regex: first candidate with enough digits
    phone = next(
        (
            m.group()
            for m in _PHONE_RE.finditer(text)
            if sum(ch.isdigit() for ch in m.group()) >= _MIN_PHONE_DIGITS
        ),
        None,
    )

    # 3. Skills Extraction via whole-phrase, case-insensitive matching
    found_skills = [skill for skill, pattern in _SKILL_PATTERNS if pattern.search(text)]

    return {
        'Email': email_match.group() if email_match else "N/A",
        'Phone': phone if phone else "N/A",
        'Skills': ", ".join(found_skills) if found_skills else "None detected"
    }

# Confirmation that the cell ran; the check-mark emoji was previously stored
# as mis-decoded bytes and printed as garbage.
print("✅ Setup and NLP functions ready!")

## 2. Processing Resumes from Local Drive

In [None]:
# Folder containing the PDF resumes to process.
pdf_dir = r'D:\Facultate\BigData\loan_egibility\data-sets\resumes'
extracted_data = []

# Column order mirrors the key order of the records built in the loop below,
# so the resulting DataFrame (and any CSV written from it) keeps its layout.
result_columns = ['Email', 'Phone', 'Skills', 'FileName']

# Define df up front: later cells read it, and it must exist (empty but with
# the expected columns) even when the folder is missing or has no PDFs.
df = pd.DataFrame(columns=result_columns)

# isdir rather than exists: os.listdir fails if the path points to a file.
if os.path.isdir(pdf_dir):
    # Sorted so the row order is reproducible between runs.
    files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))
    print(f"Processing {len(files)} PDF files...")

    for file in files:
        full_path = os.path.join(pdf_dir, file)
        raw_text = extract_text_from_pdf(full_path)
        info = extract_info(raw_text)
        info['FileName'] = file
        extracted_data.append(info)

    # Explicit columns keep the selection below valid when no PDFs were found.
    df = pd.DataFrame(extracted_data, columns=result_columns)
    print("✅ Extraction complete!")
    display(df[['FileName', 'Email', 'Phone', 'Skills']].head())
else:
    print("❌ Error: Path not found. Check your D: drive path.")

## 3. Visualization of Detected Skills

In [None]:
# Chart how many resumes mention each skill, most frequent skill first.
if not df.empty:
    # Each row holds a comma-joined skill string; split it and flatten so
    # there is one entry per (resume, skill) pair.
    per_resume_skills = df['Skills'].str.split(', ')
    skills_series = per_resume_skills.explode()
    # Discard the placeholder used for resumes where no skill was matched.
    skills_series = skills_series[skills_series.ne("None detected")]

    if skills_series.empty:
        print("No specific skills detected to visualize.")
    else:
        most_common_first = skills_series.value_counts().index
        plt.figure(figsize=(12, 6))
        sns.countplot(
            y=skills_series,
            palette='viridis',
            order=most_common_first,
        )
        plt.title('Common Skills Found in Resumes')
        plt.xlabel('Count')
        plt.show()

## 4. Save Results to CSV

In [None]:
# Destination of the extracted results, relative to the notebook's folder.
output_path = '../data/extracted_resume_results.csv'

# to_csv does not create missing folders, so make sure the target exists.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_path, index=False)
print(f"💾 Results saved to: {output_path}")