In [None]:
# !pip install -q openai
# !pip install -q python-dotenv
# !pip install -q pyarrow==14.0.1
# !pip install -q evaluate
# !pip install -q bert_score
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
import pandas as pd
import spacy
import re
import os
import openai
from dotenv import load_dotenv
import statistics
from evaluate import load

bertscore = load("bertscore")
bleu = load("bleu")

nlp = spacy.load("en_core_web_lg")

from google.colab import drive
drive.mount('/content/drive')

folder_resume = '/content/drive/MyDrive/LLM_RESUME_DATA/Test_100/'
folder_jd = '/content/drive/MyDrive/LLM_RESUME_DATA/organized_jd/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
load_dotenv('/content/.env')

True

In [11]:
# UNWANTED_WORDS = {'<list', 'Programming languages', 'Web technologies', 'Databases', 'Tools', 'Special skills'}

def clean_text(text):
    # Replace '•' with ', ', remove newlines, and clean spaces
    cleaned_text = re.sub(r'\s*•\s*', ', ', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

def extract_skills(text):
    # Extract text between <Skills> and </Skills>
    skills = re.findall(r'<Skills>(.*?)</Skills>', text, re.DOTALL)
    if skills:
        skills_text = clean_text(skills[0])
        doc = nlp(skills_text)

        # Extracting noun phrases (common for skill mentions)
        extracted_skills = []
        for chunk in doc.noun_chunks:
            chunk_text = chunk.text.strip()

            # Further split on commas and filter out unwanted words
            for skill in chunk_text.split(','):
                skill = skill.strip()
                # if skill and skill not in UNWANTED_WORDS:  # Ignore unwanted words
                extracted_skills.append(skill)

        return extracted_skills
    return []
def Recall(resume_skills, jd_skills, threshold=0.75):
    Match = 0
    for resume_skill in resume_skills:
      target_doc = nlp(resume_skill)
      for jd_skill in jd_skills:
          word_doc = nlp(jd_skill)
          similarity = target_doc.similarity(word_doc)  # Compute similarity
          if similarity >= threshold:  # Check if similarity exceeds the threshold
            Match+=1
            break
    return Match/len(jd_skills)


In [12]:
from tqdm import tqdm

resumes = os.listdir(folder_resume)
INPUT_RESUME = []
for resume in tqdm(resumes, desc="Loading Resumes"):
  with open(folder_resume + resume,'r') as resume:
    resume = resume.read()
    INPUT_RESUME.append(resume)

folders = os.listdir(folder_jd)
INPUT_JD = []
for folder in tqdm(folders, desc="Loading Job Descriptions"):
  for jd in os.listdir(folder_jd+folder)[:min(len(folder),2)]:
    with open(folder_jd + folder+ '/' + jd , 'r') as jd:
      jd = jd.read()
      INPUT_JD.append(jd)

def zero_shot(INPUT_RESUME, INPUT_JD,model_name = "google/gemma-2-9b-it",folder_proportion_before = None,folder_proportion_after=None, folder_tuned_resume = None, folder_actual_pair = None):
  client = openai.OpenAI(
    api_key=os.getenv('TOGETHER_API'),
    base_url="https://api.together.xyz/v1",
  )
  proportion_before = []
  proportion_after = []

  tuned_resumes = []

  total = 0

  eval_resume = []
  eval_jd = []

  for resume in tqdm(INPUT_RESUME, desc="Processing Resumes"):
    for jd in tqdm(INPUT_JD, desc="Processing JDs", leave=False):
      # print("-----------------------------------------")
      jd_skills = extract_skills(jd)
      resume_skills = extract_skills(resume)

      if (len(jd_skills)==0):
        continue

      try:
        response = client.chat.completions.create(
          model= model_name,
          messages=[
            {"role": "system", "content": '''Fine-tune the Resume according to the providied Job Description. Follow same resume template as input resume like this:
            <Introduction> </Introduction>
            <Work_Experience> <Place> </Place> <Work_Experience>
            <Skills> <list> </list> </Skills>
            <Project> <title> </title> </Description> </Project>
            <Achievement> <list> </list> </Achievement>
            <Hobbies> <list> </list> </Hobbies>
            <Additonal_info> </Additional_info>
            '''},
            {"role": "user", "content": f'''
            RESUME: {resume}\n
            JOB DESCRIPTION: {jd}
            '''},
          ]
        )
      except Exception as e:
        continue

      # print(f"Proportion of Skills matched : {Recall(resume_skills,jd_skills)}")
      pr = Recall(resume_skills,jd_skills)
      proportion_before.append(pr)

      with open(folder_proportion_before,'a') as f:
        f.write(str(pr)+'\n')

      tuned_resume = str(response.choices[0].message.content)
      tuned_skills = extract_skills(tuned_resume)
      # print(f"Proportion of Skills matched : {Recall(tuned_skills,jd_skills)}")
      pr = Recall(tuned_skills,jd_skills)
      proportion_after.append(pr)

      with open(folder_proportion_after,'a') as f:
        f.write(str(pr)+'\n')

      with open(f'{folder_tuned_resume}{total}_resume.txt','a') as f:
        f.write(tuned_resume)


      with open(f'{folder_actual_pair}{total}_jd.txt','a') as f:
        f.write(jd)

      with open(f'{folder_actual_pair}{total}_resume.txt','a') as f:
        f.write(resume)

      tuned_resumes.append(tuned_resume)
      eval_resume.append(resume)
      eval_jd.append(jd)
      # print("-------o be able to use it for predictions and inference----------------------------------")

      total += 1

  answer =  {
      "Average Skill Match Proportion Before" : sum(proportion_before)/total,
      "Average Skill Match Proportion After" : sum(proportion_after)/total,
      "Median Skill Match Proportion Before" :    statistics.median(proportion_before),
      "Median Skill Match Proportion After" : statistics.median(proportion_after),
      "Semantics Before": bertscore.compute(predictions=eval_resume, references=eval_jd, lang='en'),
      "Semantics After":bertscore.compute(predictions=tuned_resumes, references=eval_jd, lang='en'),
      "BLEU Before":bleu.compute(predictions=eval_resume, references=eval_jd),
      "BLEU After": bleu.compute(predictions=tuned_resumes, references=eval_jd)
  }

  return answer

  # print(f"Average Skill Match Proportion Before: {sum(proportion_before)/total}")
  # print(f"Average Skill Match Proportion After:  {sum(proportion_after)/total}")

  # print(f"Median Skill Match Proportion Before: {statistics.median(proportion_before)}")
  # print(f"Median Skill Match Proportion After:  {statistics.median(proportion_after)}")

  # print(f"Semantics Before: {bertscore.compute(predictions=eval_resume, references=eval_jd, lang='en')}")

  # print(f"Semantics After: {bertscore.compute(predictions=tuned_resumes, references=eval_jd, lang='en')}")

  # print(f"BLEU Before: {bleu.compute(predictions=eval_resume, references=eval_jd)}")
  # print(f"BLEU After: {bleu.compute(predictions=tuned_resumes, references=eval_jd)}")


Loading Resumes: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Loading Job Descriptions: 100%|██████████| 7/7 [00:00<00:00, 127.52it/s]


In [13]:
result = zero_shot(INPUT_RESUME, INPUT_JD,model_name ="google/gemma-2-9b-it" ,
          folder_proportion_before =  '/content/drive/MyDrive/LLM_RESUME_DATA/Zero_shot/google-gemma-2-9b-it/proportion_before.txt',
          folder_proportion_after = '/content/drive/MyDrive/LLM_RESUME_DATA/Zero_shot/google-gemma-2-9b-it/proportion_after.txt',
          folder_tuned_resume = '/content/drive/MyDrive/LLM_RESUME_DATA/Zero_shot/google-gemma-2-9b-it/tuned_resume/',
          folder_actual_pair = '/content/drive/MyDrive/LLM_RESUME_DATA/Zero_shot/google-gemma-2-9b-it/actual_pair/'
)

# import os
# import shutil
# import random

# # Paths to your folders
# source_folder = '/content/drive/MyDrive/LLM_RESUME_DATA/structured_resume/'
# train_folder = '/content/drive/MyDrive/LLM_RESUME_DATA/Train/'
# test_folder = '/content/drive/MyDrive/LLM_RESUME_DATA/Test/'

# # # Create train and test directories if they don't exist
# # os.makedirs(train_folder, exist_ok=True)
# # os.makedirs(test_folder, exist_ok=True)

# # Get the list of all files in the source folder
# files = [f for f in os.listdir(source_folder) if os.path.isfile(os.path.join(source_folder, f))]

# # Shuffle the files to randomize the split
# random.shuffle(files)

# # Calculate the split index
# split_index = int(0.8 * len(files))

# # Split the files
# train_files = files[:split_index]
# test_files = files[split_index:]

# # Copy the files to their respective folders
# for f in train_files:
#     shutil.copy(os.path.join(source_folder, f), os.path.join(train_folder, f))

# for f in test_files:
#     shutil.copy(os.path.join(source_folder, f), os.path.join(test_folder, f))

# print(f"Training files: {len(train_files)}, Testing files: {len(test_files)}")

Processing Resumes:   0%|          | 0/50 [00:00<?, ?it/s]
  similarity = target_doc.similarity(word_doc)  # Compute similarity

Processing JDs:  17%|█▋        | 2/12 [00:04<00:24,  2.42s/it][A
Processing JDs:  25%|██▌       | 3/12 [00:15<00:53,  5.91s/it][A
Processing JDs:  33%|███▎      | 4/12 [00:18<00:39,  4.89s/it][A
Processing JDs:  42%|████▏     | 5/12 [00:29<00:49,  7.03s/it][A
Processing JDs:  50%|█████     | 6/12 [00:46<01:00, 10.13s/it][A
Processing JDs:  58%|█████▊    | 7/12 [00:49<00:40,  8.01s/it][A
Processing JDs:  67%|██████▋   | 8/12 [01:04<00:40, 10.00s/it][A
Processing JDs:  75%|███████▌  | 9/12 [01:08<00:25,  8.38s/it][A
Processing JDs:  83%|████████▎ | 10/12 [01:21<00:19,  9.76s/it][A
Processing JDs:  92%|█████████▏| 11/12 [01:26<00:08,  8.25s/it][A
Processing JDs: 100%|██████████| 12/12 [01:29<00:00,  6.72s/it][A
Processing Resumes:   2%|▏         | 1/50 [01:29<1:13:19, 89.79s/it]
Processing JDs:   0%|          | 0/12 [00:00<?, ?it/s][A
Processing JDs:

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
result

{'Average Skill Match Proportion Before': 0.046847385799714,
 'Average Skill Match Proportion After': 0.08306021121238848,
 'Median Skill Match Proportion Before': 0.0,
 'Median Skill Match Proportion After': 0.0,
 'Semantics Before': {'precision': [0.8032605051994324,
   0.7907987236976624,
   0.7939585447311401,
   0.7924975156784058,
   0.7953853607177734,
   0.7985243201255798,
   0.7977908253669739,
   0.7953409552574158,
   0.7962976098060608,
   0.7921328544616699,
   0.7871915102005005,
   0.7974941730499268,
   0.7909535765647888,
   0.7956061959266663,
   0.795170783996582,
   0.7921838760375977,
   0.7975555658340454,
   0.7968890070915222,
   0.800364077091217,
   0.7943969368934631,
   0.7886441946029663,
   0.7901544570922852,
   0.803492546081543,
   0.7941772937774658,
   0.8014926314353943,
   0.8001143932342529,
   0.7943192720413208,
   0.7832092642784119,
   0.8047382831573486,
   0.8046396374702454,
   0.7930102944374084,
   0.7947299480438232,
   0.789910674095153