# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing

## Setup and Install Dependencies

In [None]:
%pip install pandas tqdm pdfplumber python-docx

In [None]:
%pip install kaggle kagglehub pandas-datasets

In [5]:
import os
import json
import pandas as pd
import pdfplumber
from tqdm import tqdm


## Load Resume and Job Description (JD) Datasets

In [9]:
#import os
import zipfile
#import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

def authenticate_kaggle():
    """
    Create a Kaggle API client and authenticate it.

    Credential lookup is delegated entirely to ``KaggleApi.authenticate()``
    (presumably a ``kaggle.json`` file — confirm against the Kaggle client docs).

    Returns:
        KaggleApi: The authenticated client instance
    """
    client = KaggleApi()
    client.authenticate()
    return client

def download_kaggle_dataset(api, dataset_slug: str, download_dir: str):
    """
    Download and extract a Kaggle dataset unless it is already extracted.

    The dataset is considered "already extracted" when the directory
    ``<download_dir>/<dataset name>`` exists. If extraction fails part-way
    (corrupt archive, interrupted run, ...), the partially written directory
    is removed again so that the next call retries instead of silently
    reusing an incomplete dataset.

    Args:
        api (KaggleApi): Authenticated Kaggle API instance
        dataset_slug (str): The Kaggle dataset slug (e.g., 'snehaanbhawal/resume-dataset')
        download_dir (str): Directory where the dataset will be stored

    Returns:
        str: Path of the directory that holds the extracted dataset

    Raises:
        zipfile.BadZipFile: If the downloaded archive is corrupt
        FileNotFoundError: If the expected zip file is missing after download
    """
    import shutil  # local import: only needed to clean up after a failed extraction

    os.makedirs(download_dir, exist_ok=True)

    # The last slug component names both the zip file and the extraction folder.
    dataset_name = dataset_slug.split("/")[-1]
    zip_path = os.path.join(download_dir, f"{dataset_name}.zip")
    extract_path = os.path.join(download_dir, dataset_name)

    # Only a directory counts as an extracted dataset; a stray file with the
    # same name must not short-circuit the download.
    if os.path.isdir(extract_path):
        print(f"✅ Dataset already extracted at '{extract_path}', skipping download.")
        return extract_path

    print(f"⬇️ Downloading dataset '{dataset_slug}'...")
    api.dataset_download_files(dataset_slug, path=download_dir, quiet=False)

    print(f"📦 Extracting dataset '{zip_path}'...")
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    except BaseException:
        # BaseException so that an interrupted run (KeyboardInterrupt) is
        # cleaned up too; otherwise the half-filled folder would be mistaken
        # for a complete dataset on the next call.
        shutil.rmtree(extract_path, ignore_errors=True)
        raise
    print(f"✅ Extraction completed at '{extract_path}'.")

    return extract_path

def find_csv_file(root_path: str, target_csv: str) -> str:
    """
    Locate a CSV by file name anywhere below a directory.

    The tree is walked with ``os.walk`` and the first directory that contains
    a file named exactly ``target_csv`` wins.

    Args:
        root_path (str): Directory to start search from
        target_csv (str): Name of the CSV file to find

    Returns:
        str: Full path to the CSV file

    Raises:
        FileNotFoundError: If no file with that name exists below ``root_path``
    """
    # Lazy generator: the walk stops as soon as the first match is produced.
    candidates = (
        os.path.join(folder, target_csv)
        for folder, _subdirs, files in os.walk(root_path)
        if target_csv in files
    )
    located = next(candidates, None)

    if located is None:
        raise FileNotFoundError(f"❌ CSV file '{target_csv}' not found inside '{root_path}'.")

    print(f"✅ Found CSV at: {located}")
    return located

def load_csv_to_dataframe(csv_path: str) -> pd.DataFrame:
    """
    Read a CSV file with pandas and report its shape.

    Args:
        csv_path (str): Path to the CSV file

    Returns:
        pd.DataFrame: The parsed CSV contents
    """
    print(f"📖 Loading CSV file '{csv_path}'...")
    frame = pd.read_csv(csv_path)

    # Shape is printed as a quick sanity check of (rows, columns).
    dimensions = frame.shape
    print(f"✅ CSV loaded successfully. Shape: {dimensions}")
    return frame

def download_and_load_kaggle_dataset(dataset_slug: str, 
                                     download_dir: str = "datasets", 
                                     csv_filename: str = None) -> pd.DataFrame:
    """
    Fetch a Kaggle dataset and return one of its CSV files as a DataFrame.

    Chains the individual steps: authenticate -> download/extract ->
    locate the CSV -> load it.

    Args:
        dataset_slug (str): The Kaggle dataset slug (e.g., 'snehaanbhawal/resume-dataset')
        download_dir (str): Local directory to store the datasets
        csv_filename (str): Name of the CSV file inside the dataset to load (required)

    Returns:
        pd.DataFrame: Loaded pandas DataFrame

    Raises:
        ValueError: If ``csv_filename`` is not given
    """
    # Guard first so no network call is made for an unusable request.
    if csv_filename is None:
        raise ValueError("You must specify the CSV filename to load.")

    # Authenticate, then download/extract (the helper skips the download when
    # the dataset is already present locally).
    dataset_root = download_kaggle_dataset(authenticate_kaggle(), dataset_slug, download_dir)

    # Locate the requested CSV inside the extracted tree and load it.
    return load_csv_to_dataframe(find_csv_file(dataset_root, csv_filename))


In [11]:
# Kaggle dataset to fetch, and the CSV inside it that should be loaded
dataset_slug = "snehaanbhawal/resume-dataset"
csv_filename = "Resume.csv"

# Download/extract into ./datasets and read the CSV into a DataFrame
df = download_and_load_kaggle_dataset(
    dataset_slug,
    csv_filename=csv_filename,
    download_dir="datasets",
)

# Preview the first rows
df.head()


✅ Dataset already extracted at 'datasets\resume-dataset', skipping download.
✅ Found CSV at: datasets\resume-dataset\Resume\Resume.csv
📖 Loading CSV file 'datasets\resume-dataset\Resume\Resume.csv'...
✅ CSV loaded successfully. Shape: (2484, 4)


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [None]:
# Input/output locations used by the parsing cells below.
# The two input paths are placeholders and must be replaced before running.
RESUME_DATASET_DIR = "/path/to/resume/folder/"  # Example: where resume PDFs are extracted
JD_DATASET_PATH = "/path/to/job_description.csv"  # Example: LinkedIn Job Posting CSV

# Directory that receives the generated JSON files; created if missing.
# NOTE(review): this is an absolute path at the filesystem root — confirm it is
# not meant to be a relative path such as 'structured/json/'.
OUTPUT_JSON_DIR = "/structured/json/"
os.makedirs(OUTPUT_JSON_DIR, exist_ok=True)


## Minimal Parsing into JSON Structure

### Minimal Resume Parsing (PDF or Text)

In [None]:
def parse_resume_pdf(file_path):
    """
    Extract the text of every page of a PDF, best-effort.

    Args:
        file_path (str): Path to the PDF file

    Returns:
        str: Page texts joined with newlines; pages without extractable text
        contribute an empty string. Returns "" if anything goes wrong — the
        error is printed, not raised, so one bad file does not stop a batch.
    """
    try:
        with pdfplumber.open(file_path) as document:
            page_texts = []
            for page in document.pages:
                extracted = page.extract_text()
                # extract_text() may yield nothing for a page; keep a placeholder
                page_texts.append(extracted if extracted else '')
            combined = "\n".join(page_texts)
        return combined
    except Exception as err:
        print(f"Error parsing {file_path}: {err!s}")
        return ""


### Minimal JD Parsing (CSV to JSON) 

In [None]:
def parse_job_descriptions(jd_csv_path):
    """
    Read a job-description CSV and convert each row into a JSON-ready dict.

    Args:
        jd_csv_path (str): Path to the job-description CSV file

    Returns:
        list[dict]: One dict per CSV row with the keys 'title', 'company',
        'location' and 'description' (taken from the CSV columns 'job_title',
        'company_name', 'location' and 'description'). A missing column or an
        empty cell yields '' for that key.
    """
    # CSV column name -> key used in the output records (order = output order)
    column_to_key = {
        "job_title": "title",
        "company_name": "company",
        "location": "location",
        "description": "description",
    }

    jd_df = pd.read_csv(jd_csv_path)

    # reindex() keeps only the wanted columns and creates any absent one
    # (filled with NaN), so a CSV lacking a column still produces every key.
    records = (
        jd_df.reindex(columns=list(column_to_key))
        .rename(columns=column_to_key)
        .to_dict(orient="records")
    )

    # pandas represents empty cells as NaN, which json.dump would emit as the
    # non-standard token `NaN` (invalid JSON); normalise missing values to ''.
    return [
        {key: "" if pd.isna(value) else value for key, value in record.items()}
        for record in records
    ]


### Build Structured JSON (Resume)

In [None]:
def structure_resume(text):
    """
    Wrap raw resume text in the empty structured-resume skeleton.

    Args:
        text (str): Raw text extracted from a resume

    Returns:
        dict: A fresh record with a 'basics' dict (all fields None), empty
        lists for each section, and the untouched input under 'raw_text'.
    """
    # Contact/summary fields are not extracted yet, so they all start as None.
    basics = dict.fromkeys(("name", "email", "phone", "location", "summary"))

    record = {"basics": basics}
    # A new list per section on every call, so records never share state.
    for section in ("education", "experience", "skills", "certifications", "projects"):
        record[section] = []

    record["raw_text"] = text  # Keep raw fallback for now
    return record


### Execute Resume Parsing

In [None]:
parsed_resumes = []

# Select the PDFs up front:
#  - the extension check is case-insensitive so 'CV.PDF' is not skipped;
#  - sorted() gives a reproducible order (os.listdir order is arbitrary);
#  - the progress bar then counts only files that are actually parsed.
pdf_filenames = sorted(
    name for name in os.listdir(RESUME_DATASET_DIR)
    if name.lower().endswith(".pdf")
)

for filename in tqdm(pdf_filenames):
    file_path = os.path.join(RESUME_DATASET_DIR, filename)
    resume_text = parse_resume_pdf(file_path)
    structured_resume = structure_resume(resume_text)
    # Remember which file each record came from.
    structured_resume['source_file'] = filename
    parsed_resumes.append(structured_resume)

# Save parsed resumes
with open(os.path.join(OUTPUT_JSON_DIR, "parsed_resumes.json"), "w") as f:
    json.dump(parsed_resumes, f, indent=2)


### Execute JD Parsing

In [None]:
# Convert the job-description CSV into a list of JSON-ready dicts
parsed_jds = parse_job_descriptions(JD_DATASET_PATH)

# Write the parsed job descriptions to the output directory
jd_output_path = os.path.join(OUTPUT_JSON_DIR, "parsed_jds.json")
with open(jd_output_path, "w") as f:
    json.dump(parsed_jds, f, indent=2)
