# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing
1. Setup and Install Dependencies
2. Load Resume and JD datasets
3. Minimal Parsing into JSON Structure
4. Save structured JSON for Phase 2

## Setup and Install Dependencies

In [1]:
%pip install kaggle kagglehub pandas


## Util Classes and methods

### Configurations  

In [2]:
# ==============================
# 🛠 CONFIGURATION
# ==============================
import os
import shutil
import zipfile
import pandas as pd
from pathlib import Path
from typing import List

class Config:
    DATASET_DOWNLOAD_DIR = "datasets"
    JSON_OUTPUT_DIR = "json_outputs"
    AUTO_CLEANUP = True

    @staticmethod
    def setup_kaggle_credentials():
        kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
        if not os.path.exists(kaggle_path):
            from google.colab import files
            print("📂 Upload kaggle.json file...")
            uploaded = files.upload()
            os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
            for filename in uploaded.keys():
                shutil.move(filename, kaggle_path)
            os.chmod(kaggle_path, 0o600)
            print(f"✅ Kaggle credentials setup at {kaggle_path}")
        else:
            print(f"✅ Kaggle credentials already exist at {kaggle_path}")



### Downloader 

In [3]:

# ==============================
# DOWNLOADER
# ==============================
class DatasetDownloader:
    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)
        dataset_slug = dataset_path.split("/")[-1]
        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob("*.csv")):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        !kaggle datasets download -d {dataset_path} -p {Config.DATASET_DOWNLOAD_DIR}

        if not os.path.exists(zip_path):
            raise FileNotFoundError(f"❌ Zip file '{zip_filename}' not found after download!")

        os.makedirs(extract_folder_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename



### Data Loader

In [4]:

# ==============================
# LOADER
# ==============================
class DatasetLoader:
    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        for root, _, files in os.walk(dataset_folder):
            for file in files:
                if file.lower() == target_csv_name.lower():
                    csv_path = os.path.join(root, file)
                    df = pd.read_csv(csv_path)
                    print(f"✅ Loaded CSV with shape {df.shape}")
                    return df

        raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")



### Data Processor

In [5]:

# ==============================
# PROCESSOR
# ==============================
class DatasetProcessor:
    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        missing_fields = [field for field in allowed_fields if field not in df.columns]
        if missing_fields:
            raise ValueError(f"❌ Fields {missing_fields} not found in dataset!")

        filtered_df = df[allowed_fields]
        print(f"✅ Filtered columns: {list(filtered_df.columns)}")
        return filtered_df

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        os.makedirs(Config.JSON_OUTPUT_DIR, exist_ok=True)
        output_path = os.path.join(Config.JSON_OUTPUT_DIR, output_json_name)

        if os.path.exists(output_path):
            os.remove(output_path)
            print(f"🗑️ Existing JSON '{output_path}' deleted.")

        df.to_json(output_path, orient='records', lines=True, force_ascii=False)
        print(f"✅ Data saved to JSON at '{output_path}'")



### Cleanup

In [6]:


# ==============================
# CLEANER
# ==============================
class Cleaner:
    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        if os.path.exists(extracted_folder_path):
            shutil.rmtree(extracted_folder_path)
            print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)
        if os.path.exists(zip_path):
            os.remove(zip_path)
            print(f"🗑️ Zip file '{zip_path}' has been deleted successfully.")



### Hybrid Data loader

In [7]:


# ==============================
# HYBRID LOADER
# ==============================
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    kagglehub = None

class HybridDatasetLoader:
    @staticmethod
    def load_dataset(dataset_path: str, file_name: str) -> pd.DataFrame:
        if kagglehub:
            try:
                print(f"📥 Trying KaggleHub for {dataset_path}...")
                df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_path, file_name)
                print(f"✅ Loaded using KaggleHub: shape = {df.shape}")
                return df
            except Exception as e:
                print(f"⚠️ KaggleHub failed: {e}\nFalling back to ZIP-based loader.")

        extracted_folder, _ = DatasetDownloader.download_and_extract(dataset_path)
        return DatasetLoader.load_csv(extracted_folder, file_name)



### Main flow

In [8]:

# ==============================
# MAIN FLOW
# ==============================
def process_dataset(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    df = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    filtered_df = DatasetProcessor.filter_fields(df, allowed_fields)
    DatasetProcessor.save_to_json(filtered_df, output_json_name)

    dataset_slug = dataset_path.split("/")[-1]
    extracted_folder = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
    zip_filename = f"{dataset_slug}.zip"
    if Config.AUTO_CLEANUP:
        Cleaner.cleanup_dataset_artifacts(extracted_folder, zip_filename)

## Login and do the processing of Resume and JD dataset

In [9]:
Config.setup_kaggle_credentials()
# Process Resume Dataset
process_dataset(
    dataset_path="snehaanbhawal/resume-dataset",
    target_csv_name="Resume.csv",
    allowed_fields=["Category", "Resume_str"],
    output_json_name="parsed_resumes.json"
)

# Process Job Postings Dataset
process_dataset(
    dataset_path="arshkon/linkedin-job-postings",
    target_csv_name="postings.csv",
    allowed_fields=["title", "company_name", "location", "description", "skills_desc", "job_id" , "formatted_experience_level", "formatted_work_type"],
    output_json_name="parsed_jds.json"
)

# Phase 2 -	Parse resume/JD into JSON structured scheme

## Colab + GPU Detection Utilities

In [10]:
def is_running_in_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

def get_available_gpu_memory_gb():
    try:
        output = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"],
            encoding="utf-8"
        )
        free_mem_mb = int(output.strip().split("\n")[0])
        return free_mem_mb / 1024
    except Exception:
        return 0.0


##  Install Dependencies  & Login to Hugging Face Hub

In [11]:
if is_running_in_colab():
    # Install the required packages
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub
else:
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    %pip install -U bitsandbytes


Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/nightly/cu128
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
from huggingface_hub import login
import os

# Set your token here securely or prompt for it in Colab
# Recommended: store in Colab secrets or environment variable
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # Prompt for token if not set in environment
        print("🔑 Please enter your Hugging Face token:")
        # For Colab or local prompt input
        HF_TOKEN = input("🔑 Enter your Hugging Face token: ").strip()

login(token=HF_TOKEN)


## Mount Google Drive (Colab)

In [None]:
if is_running_in_colab():
   from google.colab import drive
   drive.mount('/content/drive')

## Import Libraries

In [8]:
import json, os, uuid, subprocess, torch
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
from typing import List
from pydantic import BaseModel, ValidationError
from huggingface_hub import login
import re




##  Load mistral-Instruct with Fallback to Quantized

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

def load_mistral_pipeline_dynamic(model_name="mistralai/Mistral-7B-Instruct-v0.1", hf_token=None):
    has_cuda = torch.cuda.is_available()
    free_vram_gb = get_available_gpu_memory_gb() if has_cuda else 0
    device_map = {"": 0} if has_cuda else "cpu"
    load_quantized = not has_cuda or free_vram_gb < 14

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)

    if load_quantized:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            device_map=device_map,
            load_in_8bit=True,
            trust_remote_code=True
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            device_map=device_map,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )

    print("🎯 Model loaded on:", next(model.parameters()).device)
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=2)

llm_pipeline = load_mistral_pipeline_dynamic(hf_token=HF_TOKEN)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


🎯 Model loaded on: cuda:0


## Define Pydantic Schemas

In [11]:
from typing import Union

class Education(BaseModel):
    degree: str
    field: str
    institution: str
    year: str

class Experience(BaseModel):
    job_title: str
    company: str
    duration: str
    description: str

class ResumeSchema(BaseModel):
    basics: dict
    education: List[Education]
    experience: List[Experience]
    skills: List[str]
    certifications: List[str]
    projects: List[str]

    @classmethod
    def normalize(cls, resume_dict: dict) -> dict:
        resume_dict = dict(resume_dict)

        # Ensure keys are present
        for field in ["education", "experience", "skills", "certifications", "projects"]:
            if not isinstance(resume_dict.get(field), list):
                resume_dict[field] = []

        if not isinstance(resume_dict.get("basics"), dict):
            resume_dict["basics"] = {}

        return resume_dict


class JobDescriptionSchema(BaseModel):
    title: str
    summary: str
    required_experience_years: Union[float, int, str] = 0
    preferred_degrees: List[str] = []
    required_skills: List[str] = []
    certifications: List[str] = []
    soft_skills: List[str] = []

    @classmethod
    def normalize(cls, jd_dict: dict) -> dict:
        """Normalize types to avoid validation crashes."""
        jd_dict = dict(jd_dict)  # copy
        try:
            if isinstance(jd_dict.get("required_experience_years"), str):
                jd_dict["required_experience_years"] = float(
                    jd_dict["required_experience_years"].split()[0]
                )
        except Exception:
            jd_dict["required_experience_years"] = 0.0

        for field in ["preferred_degrees", "required_skills", "certifications", "soft_skills"]:
            if not isinstance(jd_dict.get(field), list):
                jd_dict[field] = []

        return jd_dict


In [12]:
def generate_example_structure(model_class) -> dict:
    """Generate an example JSON structure from a Pydantic model using placeholder values."""
    from typing import get_args, get_origin
    from pydantic import BaseModel

    def default_for_type(field_type):
        origin = get_origin(field_type)
        if origin is list or origin is List:
            return []
        elif field_type == str:
            return ""
        elif field_type in (float, int):
            return 0.0
        elif origin is dict or field_type == dict:
            return {}
        elif issubclass(field_type, BaseModel):
            return generate_example_structure(field_type)
        return ""

    structure = {}
    for field_name, field in model_class.model_fields.items():
        try:
            structure[field_name] = default_for_type(field.annotation)
        except Exception:
            structure[field_name] = ""
    return structure


##  Prompt Templates

In [13]:
RESUME_PROMPT_TEMPLATE = """
You are a resume-to-JSON parser.

Given the raw resume text below, return a single valid JSON object following this structure:

{schema}

Begin your response with {{
Use "" for strings, [] for lists, and 0.0 for missing numbers.

---
Resume Text:
{text}
---
"""


In [14]:
JD_PROMPT_TEMPLATE = """
You are a job description parser.

Given the raw job description below, return a structured JSON object that follows this structure:

{schema}

Begin your response with {{
Use "" for strings, [] for missing lists, and 0.0 for missing numbers.

---
Job Description:
{text}
---
"""


##  Inference + Validation Functions

### Generate Raw LLM Output

In [15]:
def generate_llm_output(prompt: str, max_new_tokens: int = 1024) -> str:
    return llm_pipeline(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]


In [16]:
def generate_llm_output_old(prompt: str, max_new_tokens: int = 1024) -> str:
    """Run LLM and return the generated text."""
    try:
        return llm_pipeline(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
    except Exception as e:
        raise RuntimeError(f"LLM generation failed: {e}")

### Sanitize Output: Strip Prompt, Fix Cutoffs

In [17]:
def sanitize_llm_output(response: str, prompt: str) -> str:
    raw = response.replace(prompt, "").strip()

    # Truncate garbage after the last closing brace
    raw = re.sub(r'}[^}]*$', '}', raw)

    # Remove markdown bullets or --- headers at end
    raw = re.sub(r'(---|•|–|-)\s*$', '', raw, flags=re.MULTILINE)

    return raw


### Regex-based JSON Block Extractor

In [18]:
def extract_json_block(text: str) -> dict:
    """
    Regex-free fallback JSON block extractor using brace balance.
    Finds first balanced {} block.
    """
    stack = []
    start = None
    for i, char in enumerate(text):
        if char == '{':
            if not stack:
                start = i
            stack.append(char)
        elif char == '}':
            if stack:
                stack.pop()
                if not stack:
                    try:
                        return json.loads(text[start:i+1])
                    except json.JSONDecodeError:
                        continue
    raise ValueError("No valid JSON object found in fallback.")

### Final Orchestrator: Fault-Tolerant Extraction

In [19]:
def truncate_text(text: str, max_chars=1500) -> str:
    """Trims long resumes/JDs to prevent LLM overload."""
    return text.strip()[:max_chars]

In [21]:
def extract_structured_json(
    text: str,
    prompt_template: str,
    schema_model: Union[None, type] = None,
    max_new_tokens: int = 1024,
    retries: int = 2,
    validate: bool = True,
) -> dict:
    """
    Runs LLM to extract structured JSON and validates against schema.
    Includes: prompt sanitization, retry, echo detection, brace parser fallback, schema validation.
    """
    example_schema = generate_example_structure(schema_model)
    schema_str = json.dumps(example_schema, indent=2)
    prompt = prompt_template.format(text=truncate_text(text), schema=schema_str)
    raw_output = ""
    attempt = 0

    while attempt <= retries:
        try:
            # Step 1: Get LLM output
            response = generate_llm_output(prompt, max_new_tokens)
            raw_output = sanitize_llm_output(response, prompt)

            # Step 2: Detect schema echo or instruction echo
            if "$schema" in raw_output or "Ensure these rules" in raw_output:
                raise ValueError("LLM echoed schema or instruction block instead of generating JSON.")

            # Step 3: Try JSON load directly
            json_start = raw_output.find("{")
            if json_start == -1:
                raise ValueError("No opening '{' found in LLM output.")

            parsed = json.loads(raw_output[json_start:])

            # Step 4: Optional schema validation
            if validate and schema_model:
                if hasattr(schema_model, "normalize"):
                    parsed = schema_model.normalize(parsed)
                schema_model.model_validate(parsed)

            return parsed

        except Exception as e:
            print(f"⚠️ Attempt {attempt + 1} failed: {e}")
            print("🧪 Raw output was:\n", raw_output[:300])  # Preview first 300 chars
            attempt += 1

    # Step 5: Fallback using brace matching
    try:
        parsed = extract_json_block(raw_output)
        if validate and schema_model:
            if hasattr(schema_model, "normalize"):
                parsed = schema_model.normalize(parsed)
            schema_model.model_validate(parsed)
        return parsed
    except Exception as e:
        return {
            "raw_output": raw_output.strip(),
            "error": f"Regex fallback failed: {e}"
        }

In [22]:

def pydantic_validate(model_class, data):
    """
    Version-safe validator that supports both Pydantic v1 and v2.
    """
    try:
        # Pydantic v2
        return model_class.model_validate(data)
    except AttributeError:
        # Fallback to Pydantic v1
        return model_class.parse_obj(data)


def validate_entry(entry, is_resume):
    try:
        model = ResumeSchema if is_resume else JobDescriptionSchema
        if hasattr(model, "normalize"):
            normalized = model.normalize(entry)
        else:
            normalized = entry
        pydantic_validate(model, normalized)
        return True, None
    except ValidationError as ve:
        return False, str(ve)


##  Normalize in Batches with Validation

In [23]:
def normalize_and_save(
    input_filename,
    output_filename_prefix,
    is_resume=True,
    output_dir=Path("json_outputs"),
    google_drive_sync=True,
    drive_subdir="AI-Resume-Agent",
    limit: int = None,
    resume: bool = True,
    save_every: int = 5,
    STRICT: bool = True,
):

    batch_id = uuid.uuid4().hex[:6]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    valid_file = f"{output_filename_prefix}_{timestamp}_{batch_id}.json"
    invalid_file = f"invalid_{output_filename_prefix}_{timestamp}_{batch_id}.json"
    meta_file = f"run_metadata_{output_filename_prefix}_{timestamp}_{batch_id}.json"
    checkpoint_file = output_dir / f"checkpoint_{output_filename_prefix}.json"

    input_path = output_dir / input_filename
    valid_output_path = output_dir / valid_file
    invalid_output_path = output_dir / invalid_file
    metadata_path = output_dir / meta_file
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load input
    with open(input_path, "r", encoding="utf-8") as f:
        raw_data = [json.loads(line) for line in f.readlines() if line.strip()]

    # Resume support
    start_index = 0
    results, invalids = [], []
    if resume and checkpoint_file.exists():
        with open(checkpoint_file, "r") as ckpt:
            checkpoint = json.load(ckpt)
            start_index = checkpoint.get("last_index", 0)
            print(f"🔁 Resuming from record {start_index}")

    raw_data = raw_data[start_index:]
    if limit is not None:
        raw_data = raw_data[:limit]

    prompt_template = RESUME_PROMPT_TEMPLATE if is_resume else JD_PROMPT_TEMPLATE
    schema_model = ResumeSchema if is_resume else JobDescriptionSchema

    for idx, record in enumerate(tqdm(raw_data), start=start_index):
        text = record.get("Resume_str" if is_resume else "description", "")
        parsed = extract_structured_json(
            text=text,
            prompt_template=prompt_template,
            schema_model=schema_model,
            validate=True
        )
        
        # Skip any malformed or echoed results
        if "raw_output" in parsed or "error" in parsed:
            invalids.append({
                "input": text,
                "output": parsed,
                "error": parsed.get("error", "Malformed or unstructured output")
            })
            continue


        # Normalize + validate
        if STRICT:
            is_valid, error_msg = validate_entry(parsed, is_resume)
        else:
            is_valid, error_msg = True, None

        if is_valid:
            results.append(parsed)
        else:
            invalids.append({"input": text, "output": parsed, "error": error_msg})


        # Periodic save for durability
        if (idx + 1 - start_index) % save_every == 0:
            with open(valid_output_path, "w") as f: 
                json.dump(results, f, indent=2)
            with open(invalid_output_path, "w") as f: 
                json.dump(invalids, f, indent=2)
            with open(checkpoint_file, "w") as f: 
                json.dump({"last_index": idx + 1}, f)

    # Final save
    with open(valid_output_path, "w") as f: 
        json.dump(results, f, indent=2)
    with open(invalid_output_path, "w") as f: 
        json.dump(invalids, f, indent=2)
    with open(checkpoint_file, "w") as f: 
        json.dump({"last_index": start_index + len(raw_data)}, f)

    # Metadata
    meta = {
        "batch_id": batch_id,
        "timestamp": timestamp,
        "input_file": input_filename,
        "output_valid_file": valid_file,
        "output_invalid_file": invalid_file,
        "record_count": len(raw_data),
        "valid_count": len(results),
        "invalid_count": len(invalids),
        "model": "mistralai/Mistral-7B-Instruct-v0.1",
        "quantized": not any(p.dtype == torch.float16 for p in llm_pipeline.model.parameters()),
        "device": str(next(llm_pipeline.model.parameters()).device)
    }

    with open(metadata_path, "w") as f: 
        json.dump(meta, f, indent=2)

    if google_drive_sync and is_running_in_colab():
        from google.colab import drive
        drive.mount('/content/drive')
        drive_base = Path("/content/drive/MyDrive") / drive_subdir
        drive_base.mkdir(parents=True, exist_ok=True)
        for file in [valid_output_path, invalid_output_path, metadata_path, checkpoint_file]:
            if file.exists():
                file.replace(drive_base / file.name)


## Run Phase 2 End-to-End

In [24]:
def run_phase2_structured_normalization():
    normalize_and_save("parsed_resumes.json", "normalized_resumes", is_resume=True, limit=5, resume=True, STRICT=True)
    normalize_and_save("parsed_jds.json", "normalized_jds", is_resume=False, limit=5, resume=True, STRICT=True)

run_phase2_structured_normalization()

🔁 Resuming from record 75


  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 1 failed: Expecting ',' delimiter: line 7 column 4 (char 87)
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  }


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 2 failed: Expecting ',' delimiter: line 7 column 4 (char 87)
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  }


 20%|██        | 1/5 [01:24<05:37, 84.26s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 3 failed: Expecting ',' delimiter: line 7 column 4 (char 87)
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  }


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 1 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  },
  "education": [],
  "experience": [
    {
      "position": "DIRECTV HR RECRUITER/ Administration",
      "company": "Company Name",
      "location": "City, State",
      "start_date": "

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 2 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  },
  "education": [],
  "experience": [
    {
      "position": "DIRECTV HR RECRUITER/ Administration",
      "company": "Company Name",
      "location": "City, State",
      "start_date": "

 40%|████      | 2/5 [01:49<02:28, 49.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 3 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'position': 'DIRECTV HR ... team of 10 employees']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  },
  "education": [],
  "experience": [
    {
      "position": "DIRECTV HR RECRUITER/ Administration",
      "company": "Company Name",
      "location": "City, State",
      "start_date": "

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 1 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "HR BUSINESS PARTNER",
    "summary": "Proactive Human Resources Business Partner guiding performance management, talent planning and benefits. Decisive with proven success providing employee relations support and oversight for efficient operations.

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 2 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "HR BUSINESS PARTNER",
    "summary": "Proactive Human Resources Business Partner guiding performance management, talent planning and benefits. Decisive with proven success providing employee relations support and oversight for efficient operations.

 60%|██████    | 3/5 [02:10<01:13, 36.67s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 3 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'position': 'HR Business..., 'end_date': '01/2021'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "HR BUSINESS PARTNER",
    "summary": "Proactive Human Resources Business Partner guiding performance management, talent planning and benefits. Decisive with proven success providing employee relations support and oversight for efficient operations.

 80%|████████  | 4/5 [02:16<00:24, 24.26s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 1 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  },
  "education": [],
  "experience": [
    {
      "company": "Company Name",
      "location": "City, State",
      "position": "HR Generalist",
      "start_date": "01/2009",
      "end_da

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⚠️ Attempt 2 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  },
  "education": [],
  "experience": [
    {
      "company": "Company Name",
      "location": "City, State",
      "position": "HR Generalist",
      "start_date": "01/2009",
      "end_da

100%|██████████| 5/5 [02:43<00:00, 32.70s/it]

⚠️ Attempt 3 failed: 3 validation errors for ResumeSchema
experience.0.job_title
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.duration
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
experience.0.description
  Field required [type=missing, input_value={'company': 'Company Name...tion, additional paym']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
🧪 Raw output was:
 {
  "basics": {
    "name": "",
    "phone": "",
    "email": "",
    "address": ""
  },
  "education": [],
  "experience": [
    {
      "company": "Company Name",
      "location": "City, State",
      "position": "HR Generalist",
      "start_date": "01/2009",
      "end_da




🔁 Resuming from record 75


  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 20%|██        | 1/5 [00:09<00:39,  9.88s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 40%|████      | 2/5 [00:15<00:22,  7.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 60%|██████    | 3/5 [00:23<00:14,  7.50s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 80%|████████  | 4/5 [00:29<00:06,  6.98s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 5/5 [00:34<00:00,  6.92s/it]
