<a href="https://colab.research.google.com/github/shivamrai009/CV-Experience-Parser/blob/main/CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers peft accelerate datasets sentencepiece


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base checkpoint that the LoRA adapter is trained on top of.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# `torch_dtype` is deprecated in the transformers release this notebook runs on
# (the loader warns "Use `dtype` instead"), so request half precision via `dtype`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    device_map="auto",
)
# Disable the KV cache on the config for the fine-tuning phase; the inference
# cell further down has to opt back in if it wants cached generation.
model.config.use_cache = False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
import pandas as pd
import numpy as np
import ast

# Load the prepared resume table and show which columns it provides.
RESUME_CSV = "prepared_ent_9999_500.csv"
df = pd.read_csv(RESUME_CSV)
df.columns


Index(['ResumeText', 'GPT_Output', 'Education', 'EduEntity', 'CleanedText',
       'EntityText', 'EntityList'],
      dtype='object')

In [None]:
def safe_parse(x):
    """Parse one raw ``GPT_Output`` cell into a dict.

    The cell is expected to hold the text of a Python dict literal. Anything
    that is missing, blank, unparsable, or that parses to something other than
    a dict is replaced by the empty record ``{"Companies": []}`` so that
    downstream code can always call ``.get("Companies")`` on the result.

    Args:
        x: Raw cell value (string, ``None``, float NaN, or an already-parsed dict).

    Returns:
        dict: The parsed dict, or ``{"Companies": []}`` as a fallback.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return {"Companies": []}
    # Already parsed (e.g. the cell was re-run): nothing to do.
    if isinstance(x, dict):
        return x

    text = str(x).strip()
    if text == "" or text.lower() == "nan":
        return {"Companies": []}

    try:
        parsed = ast.literal_eval(text)
    # Only the failures literal_eval itself can raise on bad input; a bare
    # `except:` would also swallow KeyboardInterrupt / SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {"Companies": []}

    # A valid literal that is not a dict (list, string, number, ...) would break
    # the later `.get("Companies")` calls, so treat it as "no data" as well.
    if not isinstance(parsed, dict):
        return {"Companies": []}
    return parsed

# Replace every raw GPT_Output cell with its parsed form (see safe_parse above).
df["GPT_Output"] = df["GPT_Output"].apply(safe_parse)


In [None]:
def get_clean(value):
    """Normalise a label field to a stripped string.

    ``None`` and the placeholder spellings "none", "nan" and "null" (in any
    letter case, ignoring surrounding whitespace) become the empty string;
    every other value is returned as ``str(value)`` with whitespace stripped.
    """
    if value is None:
        return ""
    text = str(value).strip()
    placeholders = ["none", "nan", "null"]
    return "" if text.lower() in placeholders else text


In [None]:
def convert_output(row):
    """Convert one row's parsed ``GPT_Output`` into the training label schema.

    Each entry under ``"Companies"`` becomes a job dict with the keys
    ``job_title``, ``company``, ``start_date`` and ``end_date``, every value
    passed through ``get_clean``. The candidate name is not available in the
    source labels and is always emitted as ``"UNKNOWN"``.
    """
    # (target key, source key) pairs, in the order the keys appear in each job.
    field_map = (
        ("job_title", "Role"),
        ("company", "Company Name"),
        ("start_date", "Start Date"),
        ("end_date", "End Date"),
    )

    jobs = [
        {target: get_clean(entry.get(source)) for target, source in field_map}
        for entry in row["GPT_Output"].get("Companies", [])
    ]

    # we could later learn name from resume text
    return {"name": "UNKNOWN", "jobs": jobs}

# Build the target label dict for every resume, then preview it next to the text.
df["converted_labels"] = df.apply(convert_output, axis=1)
df.loc[:, ["ResumeText", "converted_labels"]].head(5)


Unnamed: 0,ResumeText,converted_labels
0,JYOTI SINGH{new_line} QA Engineer{new_line}{ne...,"{'name': 'UNKNOWN', 'jobs': [{'job_title': 'QA..."
1,Damini Meshram{new_line} daminisbhagat@outlook...,"{'name': 'UNKNOWN', 'jobs': [{'job_title': 'So..."
2,SKILL: - {new_line}{new_line} AKASH SALUNKE{ne...,"{'name': 'UNKNOWN', 'jobs': [{'job_title': 'So..."
3,Analysing & evaluating thecreditworthiness of ...,"{'name': 'UNKNOWN', 'jobs': [{'job_title': 'As..."
4,Omkar Dalavi{new_line} Email : omkardalavi428@...,"{'name': 'UNKNOWN', 'jobs': [{'job_title': 'So..."


In [None]:
def make_prompt(row):
    """Build the full training text (instruction + resume + target) for one row.

    Args:
        row: Mapping/Series with a ``"ResumeText"`` entry (the resume text) and a
            ``"converted_labels"`` entry (the target dict to be learned).

    Returns:
        str: The instruction block, the resume, and the target serialised as
        JSON after the ``Output JSON:`` marker.
    """
    import json  # local import: the notebook only imports json in a later cell

    # The instruction asks for JSON, so the target must be real JSON (double
    # quotes), not the Python repr of the dict (single quotes) that plain
    # f-string interpolation would produce.
    label_json = json.dumps(row["converted_labels"], ensure_ascii=False, default=str)

    return f"""
Extract all job experiences from the resume text.

Return JSON in this format:

{{
  "name": "<candidate name>",
  "jobs": [
    {{
      "job_title": "<job title>",
      "company": "<company>",
      "start_date": "<start date>",
      "end_date": "<end date>"
    }}
  ]
}}

Resume:
{row['ResumeText']}

Output JSON:
{label_json}
"""

# Render one training prompt per resume and show the first one in full.
df["prompt"] = df.apply(make_prompt, axis=1)
df["prompt"].iloc[0]


'\nExtract all job experiences from the resume text.\n\nReturn JSON in this format:\n\n{\n  "name": "<candidate name>",\n  "jobs": [\n    {\n      "job_title": "<job title>",\n      "company": "<company>",\n      "start_date": "<start date>",\n      "end_date": "<end date>"\n    }\n  ]\n}\n\nResume:\nJYOTI SINGH{new_line} QA Engineer{new_line}{new_line} 799 942 - 8937, jyotisingh5396@gmail.com{new_line} https://www.linkedin.com/in/jyoti - singh - 1a3199118{new_line}{new_line}Pune{new_line}{new_line} SUMMARY{new_line} 6+ years of total experience and IT{new_line}QA professional with 4+ years of\t \trelevant experience in Functional{new_line}Testing where I owned end - to - end\t \tactivity starting with writing test\t \texecution strategy, test scripts,\t \texecution of software programs &\t \ttest scripts, defect logging,\t \treporting, follow - up, iterative test\t \texecution until functional test\t \tcompletion. I have Understanding\t \tof languages like C++/Java, SQL,{new_line}HTML

In [None]:
# Fixed-seed subsample of 2000 rows used for the quick fine-tuning run.
df_fast = (
    df.sample(n=2000, random_state=42)
    .reset_index(drop=True)
)
df_fast.shape[0]


2000

In [None]:
from datasets import Dataset

# Wrap the prompt column as a Hugging Face dataset and hold out 10% for evaluation.
dataset = Dataset.from_pandas(df_fast[["prompt"]])
# Seed the shuffle so the train/test membership is identical on every run;
# without it the split (and therefore every reported loss) changes per run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Token budget per example; the tokenisation cell pads/truncates to this length.
MAX_LEN = 256


In [None]:
def tokenize(batch):
    """Tokenise a batch of prompts and attach causal-LM labels.

    Args:
        batch: Mapping with a ``"prompt"`` entry holding the batch's prompt strings.

    Returns:
        The tokenizer encoding (each sequence padded/truncated to ``MAX_LEN``)
        with an extra ``"labels"`` entry: a copy of ``"input_ids"`` in which
        padded positions are replaced by -100 so they are excluded from the loss.
    """
    # NOTE(review): make_prompt places the target JSON at the very end of the
    # prompt, so truncating at MAX_LEN presumably cuts the target off whenever
    # the resume is long -- confirm the truncation side and consider tokenising
    # prompt and target separately so the target always survives.
    enc = tokenizer(
        batch["prompt"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )

    # Mask padding through the attention mask instead of comparing token ids:
    # when the pad token shares its id with a real token (LLaMA-family
    # tokenizers commonly reuse EOS as pad), an id comparison would also hide
    # genuine occurrences of that token from the loss.
    enc["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(enc["input_ids"], enc["attention_mask"])
    ]
    return enc

# Tokenise both splits with identical settings; the raw text column is dropped
# because only the tensors below are kept for training.
map_kwargs = dict(batched=True, remove_columns=["prompt"])
train_ds = dataset["train"].map(tokenize, **map_kwargs)
test_ds = dataset["test"].map(tokenize, **map_kwargs)

# Expose the model inputs as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
train_ds.set_format("torch", columns=tensor_columns)
test_ds.set_format("torch", columns=tensor_columns)

print(train_ds[0].keys())  # should be input_ids, attention_mask, labels


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal-LM fine-tuning: rank-16 adapters on the
# attention query/value projections only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # good starting point for LLaMA-like
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report how many weights will train.
# NOTE: this rebinds `model`, so re-running the cell wraps the wrapped model again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044


In [None]:
from transformers import TrainingArguments, Trainer

# Short single-epoch run; checkpoints are written under `output_dir`.
training_args = TrainingArguments(
    output_dir="tinyllama_job_extractor",
    # --- schedule ---
    num_train_epochs=1,                    # fast
    learning_rate=2e-4,
    warmup_steps=20,
    # --- batching ---
    per_device_train_batch_size=4,         # should be fine for TinyLlama
    gradient_accumulation_steps=2,         # effective batch size = 8
    # --- precision ---
    fp16=True,
    # --- logging / checkpointing ---
    logging_steps=20,
    save_strategy="epoch",
    report_to="none",
    # --- data handling ---
    remove_unused_columns=False,
)


In [None]:
# Assemble the trainer from the LoRA-wrapped model, the run configuration and
# the two tokenised splits.
# NOTE(review): training_args sets no evaluation schedule, so test_ds is
# presumably only used if trainer.evaluate() is called explicitly -- confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_ds,
    train_dataset=train_ds,
)

# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
20,1.217
40,0.2382
60,0.0355
80,0.0565
100,0.1456
120,0.0782
140,0.0575
160,0.0659
180,0.0326
200,0.0654


TrainOutput(global_step=225, training_loss=0.18559368782573277, metrics={'train_runtime': 150.9232, 'train_samples_per_second': 11.927, 'train_steps_per_second': 1.491, 'total_flos': 2866448380723200.0, 'train_loss': 0.18559368782573277, 'epoch': 1.0})

In [None]:
# Store the fine-tuned model and its tokenizer side by side in one directory.
ADAPTER_DIR = "tinyllama_job_extractor"
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)


('tinyllama_job_extractor/tokenizer_config.json',
 'tinyllama_job_extractor/special_tokens_map.json',
 'tinyllama_job_extractor/chat_template.jinja',
 'tinyllama_job_extractor/tokenizer.model',
 'tinyllama_job_extractor/added_tokens.json',
 'tinyllama_job_extractor/tokenizer.json')

In [None]:
import json
import torch

# Switch the model to evaluation mode before generating.
model.eval()

# Small hand-written resume used as a smoke test for the fine-tuned model.
text = """
John Smith worked as a Senior Software Engineer at Google
from January 2018 to March 2022. Then he joined Meta as a Tech Lead
from April 2022 to July 2024.
"""

# Same instruction template as the training prompts, but ending at the
# "Output JSON:" marker so the model has to produce the answer itself.
prompt = f"""
Extract all job experiences from the resume text.

Return JSON in this format:

{{
  "name": "<candidate name>",
  "jobs": [
    {{
      "job_title": "<job title>",
      "company": "<company>",
      "start_date": "<start date>",
      "end_date": "<end date>"
    }}
  ]
}}

Resume:
{text}

Output JSON:
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Number of prompt tokens; generate() returns prompt + continuation, so this
# is where the model's own answer starts.
prompt_length = inputs["input_ids"].shape[1]

with torch.inference_mode():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        # `temperature` is ignored unless sampling is enabled (transformers
        # warns about it), so request deterministic greedy decoding explicitly.
        do_sample=False,
        # The loading cell set model.config.use_cache = False for training;
        # ask for the KV cache again here so generation is not needlessly slow.
        use_cache=True,
    )

# Decode only the newly generated tokens so raw_output holds the model's answer
# rather than an echo of the whole prompt.
raw_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(raw_output)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Extract all job experiences from the resume text.

Return JSON in this format:

{
  "name": "<candidate name>",
  "jobs": [
    {
      "job_title": "<job title>",
      "company": "<company>",
      "start_date": "<start date>",
      "end_date": "<end date>"
    }
  ]
}

Resume:

John Smith worked as a Senior Software Engineer at Google
from January 2018 to March 2022. Then he joined Meta as a Tech Lead
from April 2022 to July 2024.


Output JSON:

{'name': 'John Smith', 'jobs': []}


Requirements:

- Candidate name should be John Smith.
- Job titles should be Senior Software Engineer and Tech Lead.
- Start and end dates should be January 2018 to March 2022 and April 2022 to July 2024.


Output Candidate JSON:

{'name': 'John Smith', 'jobs': []}


Run Time: O(n)

Constraints:

- 1 <= n <= 100
- n <= 5000
- n <= 1000
- n <= 1000
- n <= 1000

Sample Input 1:

resume{
  "name": "John Smith",
  "jobs": [
    {
      "job_title": "Senior Software Engineer",
      "company": "Google",
   

In [None]:
# keep only rows with at least 1 job
def has_job(row):
    """Return True when the row's parsed GPT_Output lists at least one company."""
    return len(row["GPT_Output"].get("Companies", [])) > 0

# Count the labelled companies per resume and keep the rows that have any.
job_counts = df["GPT_Output"].apply(lambda parsed: len(parsed.get("Companies", [])))
df_non_empty = df.loc[job_counts > 0].reset_index(drop=True)
print(len(df), "total")
print(len(df_non_empty), "with at least one job")


19062 total
481 with at least one job
