### Data Exploration

The research paper uses sentence embedding on noun and noun phrases. This analysis is to see of other modern approaches can reach same or better score. Ultimately, we want to see if the new approach at least mataches the high scores annotated. 

1. Compare embedding of entire resume to individual entities of a category. Expect sim,ilarity to be less than the research paper
2. Colbert index and search. 

In [1]:
import pandas as pd
import json

### Run the following code for O*Net Knowledge Excel to CSV

```python
###### Load the O*NET Knowledge Excel file
knowledge_file = "data/annotations_scenario_1/Knowledge.xlsx"  # Update with the actual filename
df_onet = pd.read_excel(knowledge_file)

##### Select relevant columns
df_onet = df_onet[["O*NET-SOC Code", "Title", "Element Name", "Scale ID", "Data Value"]]

##### Filter for only importance (IM) and level (LV)
df_onet = df_onet[df_onet["Scale ID"].isin(["IM", "LV"])]

##### Rename columns for consistency
df_onet.rename(columns={
    "O*NET-SOC Code": "onetsoc_code",
    "Title": "job_title",
    "Element Name": "knowledge_entity",
    "Scale ID": "scale_id",
    "Data Value": "data_value"
}, inplace=True)

#### Display the processed data in a Pandas DataFrame
print(df_onet.head())  # Show the first few rows

#### Save to CSV if you want to inspect it further
df_onet.to_csv("data/annotations_scenario_1/processed_onet_knowledge.csv", index=False)


### Run the following to convert O*Net Occupation Excel to CSV

```python
# Load the O*NET Knowledge Excel file
occupation_file = "data/annotations_scenario_1/Occupation Data.xlsx"  # Update with the actual filename

df_occupation = pd.read_excel(occupation_file)

# Select relevant columns
df_occupation = df_occupation[["O*NET-SOC Code", "Title", "Description"]]

# Rename columns for consistency
df_occupation.rename(columns={
    "O*NET-SOC Code": "onetsoc_code",
    "Title": "job_title",
    "Description": "job_description"
}, inplace=True)

# Display the first few rows
print(df_occupation.head())

# Save to CSV for further inspection (optional)
df_occupation.to_csv("data/annotations_scenario_1/processed_onet_occupation.csv", index=False)


### create annotions database
```python
!sqlite3 ../data/annotations_scenario_1/annotations_scenario_1.db < ../data/annotations_scenario_1/annotations_scenario_1.sql

In [9]:
import sqlite3
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Connect to the SQLite database
conn = sqlite3.connect("../data/annotations_scenario_1/annotations_scenario_1.db")

# Step 1: Query 10 resumes with rating = 5
query_resumes = """
SELECT r.id AS resume_id, r.resume_text, pj.job_title
FROM resumes r
JOIN annotations a ON r.id = a.resume_id
JOIN predicted_jobs pj ON r.id = pj.resume_id
WHERE a.rating = 5
LIMIT 10;
"""
df_resumes = pd.read_sql_query(query_resumes, conn)

# Close the connection
conn.close()

# Step 2: Load the O*NET knowledge dataset (previously processed)
df_onet = pd.read_csv("../data/annotations_scenario_1/processed_onet_knowledge.csv")

# Step 3: Initialize an empty list to store similarity results
similarity_results = []

# Step 4: Compute similarity for each resume and its corresponding job knowledge entities
for _, row in df_resumes.iterrows():
    resume_id = row["resume_id"]
    resume_text = row["resume_text"]
    job_title = row["job_title"]

    # Get knowledge entities for this job title
    df_knowledge = df_onet[df_onet["job_title"] == job_title]

    if df_knowledge.empty:
        print(f"⚠️ No knowledge entities found for job: {job_title} (Resume ID: {resume_id})")
        continue  # Skip if no knowledge data exists for this job

    # Generate embeddings
    resume_embedding = model.encode(resume_text, convert_to_numpy=True)
    knowledge_embeddings = df_knowledge["knowledge_entity"].apply(lambda x: model.encode(x, convert_to_numpy=True))

    # Compute similarity
    similarity_scores = cosine_similarity([resume_embedding], list(knowledge_embeddings))

    # Store results
    for knowledge_entity, score in zip(df_knowledge["knowledge_entity"], similarity_scores[0]):
        similarity_results.append({"resume_id": resume_id, "job_title": job_title, "knowledge_entity": knowledge_entity, "similarity_score": score})

# Convert results to DataFrame
df_similarity = pd.DataFrame(similarity_results)

# Remove duplicates if any remain
df_similarity.drop_duplicates(inplace=True)

# Print a preview of the similarity matrix
print(df_similarity.head())

# Save to CSV for further analysis
df_similarity.to_csv("../data/annotations_scenario_1/resume_knowledge_similarity_matrix.csv", index=False)

print("✅ Similarity matrix saved as 'resume_knowledge_similarity_matrix.csv'.")


   resume_id             job_title               knowledge_entity  \
0          4  Computer Programmers  Administration and Management   
2          4  Computer Programmers                 Administrative   
4          4  Computer Programmers       Economics and Accounting   
6          4  Computer Programmers            Sales and Marketing   
8          4  Computer Programmers  Customer and Personal Service   

   similarity_score  
0          0.309047  
2          0.231593  
4          0.223551  
6          0.269907  
8          0.232240  
✅ Similarity matrix saved as 'resume_knowledge_similarity_matrix.csv'.


In [10]:
# Print the highest similarity score
max_similarity = df_similarity["similarity_score"].max()
highest_match = df_similarity[df_similarity["similarity_score"] == max_similarity]

print("\n🎯 Highest Similarity Score:")
print(highest_match)





🎯 Highest Similarity Score:
     resume_id                                     job_title  \
346          5                   Computer Network Architects   
412          5                          Computer Programmers   
478          5         Computer Systems Engineers/Architects   
544          5  Computer and Information Research Scientists   
610          5                            Robotics Engineers   

              knowledge_entity  similarity_score  
346  Computers and Electronics          0.418219  
412  Computers and Electronics          0.418219  
478  Computers and Electronics          0.418219  
544  Computers and Electronics          0.418219  
610  Computers and Electronics          0.418219  


In [11]:
import torch
from ragatouille import RAGPretrainedModel

# Load ColBERT-based RAG model from Ragatouille
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")


[Mar 28, 19:14:00] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


  self.scaler = torch.cuda.amp.GradScaler()


In [12]:

# Step 1: Group by resume_id to ensure one resume per ID
df_resumes_grouped = df_resumes.groupby("resume_id")["resume_text"].first().reset_index()

# Step 2: Index Each Resume Independently
for resume_id, resume_text in zip(df_resumes_grouped["resume_id"], df_resumes_grouped["resume_text"]):
    index_name = f"resume_{resume_id}"  # Unique index name per resume
    
    print(f"Indexing Resume ID: {resume_id}...")  # Debugging output
    RAG.index(
        collection=[resume_text],  # Store the full resume as a single document
        index_name=index_name,
        max_document_length=180,
        split_documents=True,  # Ragatouille will handle chunking
        use_faiss=True
    )

print("✅ All resumes have been indexed successfully (only once per ID)!")


Indexing Resume ID: 4...


[Mar 28, 19:14:45] #> Creating directory .ragatouille/colbert/indexes/resume_4 


[Mar 28, 19:14:45] [0] 		 #> Encoding 6 passages..


  self.scaler = torch.cuda.amp.GradScaler()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
100%|██████████| 1/1 [00:00<00:00,  3.11it/s]

[Mar 28, 19:14:46] [0] 		 avg_doclen_est = 136.0 	 len(local_sample) = 6
[Mar 28, 19:14:46] [0] 		 Creating 256 partitions.
[Mar 28, 19:14:46] [0] 		 *Estimated* 816 embeddings.
[Mar 28, 19:14:46] [0] 		 #> Saving the indexing plan to .ragatouille/colbert/indexes/resume_4/plan.json ..



  sub_sample = torch.load(sub_sample_path)
  centroids = torch.load(centroids_path, map_location='cpu')
  avg_residual = torch.load(avgresidual_path, map_location='cpu')
  bucket_cutoffs, bucket_weights = torch.load(buckets_path, map_location='cpu')


Clustering 776 points in 128D to 256 clusters, redo 1 times, 20 iterations
  Preprocessing in 0.00 s
[0.026, 0.035, 0.024, 0.038, 0.02, 0.037, 0.038, 0.028, 0.034, 0.028, 0.03, 0.036, 0.03, 0.029, 0.039, 0.036, 0.026, 0.043, 0.035, 0.038, 0.035, 0.037, 0.037, 0.025, 0.028, 0.035, 0.043, 0.03, 0.043, 0.037, 0.032, 0.048, 0.046, 0.037, 0.036, 0.032, 0.026, 0.038, 0.041, 0.032, 0.037, 0.039, 0.039, 0.033, 0.038, 0.037, 0.025, 0.036, 0.038, 0.043, 0.032, 0.041, 0.041, 0.029, 0.031, 0.036, 0.022, 0.031, 0.035, 0.039, 0.031, 0.034, 0.041, 0.032, 0.043, 0.036, 0.042, 0.03, 0.033, 0.033, 0.031, 0.033, 0.029, 0.035, 0.032, 0.03, 0.035, 0.039, 0.03, 0.032, 0.03, 0.036, 0.034, 0.031, 0.032, 0.035, 0.03, 0.034, 0.034, 0.03, 0.031, 0.037, 0.034, 0.034, 0.033, 0.022, 0.032, 0.036, 0.041, 0.042, 0.036, 0.04, 0.038, 0.032, 0.034, 0.025, 0.026, 0.037, 0.034, 0.026, 0.037, 0.032, 0.036, 0.04, 0.037, 0.033, 0.038, 0.039, 0.031, 0.029, 0.021, 0.029, 0.031, 0.034, 0.032, 0.032, 0.032, 0.034]


0it [00:00, ?it/s]

[Mar 28, 19:14:46] [0] 		 #> Encoding 6 passages..



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  5.57it/s][A
1it [00:00,  5.33it/s]
  return torch.load(codes_path, map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 1773.49it/s]

[Mar 28, 19:14:46] #> Optimizing IVF to store map from centroids to list of pids..
[Mar 28, 19:14:46] #> Building the emb2pid mapping..
[Mar 28, 19:14:46] len(emb2pid) = 816



100%|██████████| 256/256 [00:00<00:00, 75765.02it/s]

[Mar 28, 19:14:46] #> Saved optimized IVF to .ragatouille/colbert/indexes/resume_4/ivf.pid.pt





Done indexing!
Indexing Resume ID: 5...
New index_name received! Updating current index_name (resume_4) to resume_5


[Mar 28, 19:14:46] #> Creating directory .ragatouille/colbert/indexes/resume_5 


[Mar 28, 19:14:47] [0] 		 #> Encoding 3 passages..


100%|██████████| 1/1 [00:00<00:00,  3.72it/s]

[Mar 28, 19:14:47] [0] 		 avg_doclen_est = 131.0 	 len(local_sample) = 3
[Mar 28, 19:14:47] [0] 		 Creating 256 partitions.
[Mar 28, 19:14:47] [0] 		 *Estimated* 393 embeddings.
[Mar 28, 19:14:47] [0] 		 #> Saving the indexing plan to .ragatouille/colbert/indexes/resume_5/plan.json ..






Clustering 374 points in 128D to 256 clusters, redo 1 times, 20 iterations
  Preprocessing in 0.00 s
[0.035, 0.033, 0.044, 0.049, 0.043, 0.033, 0.04, 0.038, 0.05, 0.027, 0.036, 0.03, 0.047, 0.028, 0.037, 0.037, 0.044, 0.043, 0.03, 0.045, 0.031, 0.029, 0.04, 0.031, 0.03, 0.036, 0.041, 0.04, 0.041, 0.052, 0.035, 0.032, 0.037, 0.033, 0.045, 0.045, 0.034, 0.021, 0.035, 0.047, 0.043, 0.047, 0.047, 0.04, 0.038, 0.03, 0.037, 0.033, 0.048, 0.042, 0.03, 0.051, 0.046, 0.049, 0.038, 0.036, 0.034, 0.039, 0.028, 0.045, 0.032, 0.061, 0.051, 0.038, 0.036, 0.041, 0.034, 0.034, 0.043, 0.033, 0.035, 0.044, 0.031, 0.036, 0.028, 0.039, 0.046, 0.05, 0.039, 0.046, 0.037, 0.03, 0.063, 0.034, 0.035, 0.035, 0.044, 0.045, 0.054, 0.032, 0.041, 0.046, 0.048, 0.039, 0.041, 0.042, 0.036, 0.043, 0.035, 0.035, 0.031, 0.045, 0.03, 0.047, 0.041, 0.026, 0.026, 0.033, 0.046, 0.028, 0.047, 0.035, 0.031, 0.043, 0.046, 0.041, 0.035, 0.045, 0.046, 0.023, 0.054, 0.026, 0.039, 0.038, 0.036, 0.037, 0.033, 0.031]


0it [00:00, ?it/s]

[Mar 28, 19:14:47] [0] 		 #> Encoding 3 passages..



100%|██████████| 1/1 [00:00<00:00, 12.30it/s]
1it [00:00, 11.45it/s]
100%|██████████| 1/1 [00:00<00:00, 2465.79it/s]

[Mar 28, 19:14:47] #> Optimizing IVF to store map from centroids to list of pids..
[Mar 28, 19:14:47] #> Building the emb2pid mapping..
[Mar 28, 19:14:47] len(emb2pid) = 393



100%|██████████| 256/256 [00:00<00:00, 88315.66it/s]

[Mar 28, 19:14:47] #> Saved optimized IVF to .ragatouille/colbert/indexes/resume_5/ivf.pid.pt
Done indexing!
✅ All resumes have been indexed successfully (only once per ID)!





In [21]:
df_resumes["resume_id"].unique()


array([4, 5])

In [37]:
import pandas as pd
import json

# Step 1: Prepare queries (all unique O*NET knowledge entities)
knowledge_queries = df_onet["knowledge_entity"].drop_duplicates().tolist()

# Step 2: Search for each indexed resume
similarity_results = []
for resume_id in df_resumes["resume_id"].unique():
    index_name = f"resume_{resume_id}"  # Resume index name
    print(f"🔍 Searching Resume ID: {resume_id}...")  # Debugging output

    # Iterate over each knowledge query instead of stacking them
    for query in knowledge_queries:
        retrieved_docs = RAG.search(query=query, index_name=index_name, k=3)  

        # Store results with raw similarity scores
        for doc in retrieved_docs:
            matched_text = doc["content"]
            similarity_score = doc["score"]  # Keep raw score

            similarity_results.append({
                "resume_id": resume_id,
                "query": query,
                "matched_resume_chunk": matched_text,
                "similarity_score": round(similarity_score, 4)  # Keep raw score
            })

# Convert results to DataFrame
df_similarity = pd.DataFrame(similarity_results)

# Step 3: Find the highest similarity score (raw, unnormalized)
if not df_similarity.empty:
    max_similarity = df_similarity["similarity_score"].max()
    highest_match = df_similarity[df_similarity["similarity_score"] == max_similarity]

    print("\n🎯 Highest Raw Similarity Score using Ragatouille:")
    print(highest_match)
else:
    print("\n🚨 No results found.")

# Save to CSV
df_similarity.to_csv("../data/annotations_scenario_1/ragatouille_resume_knowledge_similarity_matrix.csv", index=False)
print("✅ Similarity matrix saved as 'ragatouille_resume_knowledge_similarity_matrix.csv'.")


🔍 Searching Resume ID: 4...
New index_name received! Updating current index_name (resume_5) to resume_4
Loading searcher for index resume_4 for the first time... This may take a few seconds
[Mar 28, 20:42:32] #> Loading codec...
[Mar 28, 20:42:32] #> Loading IVF...
[Mar 28, 20:42:32] #> Loading doclens...


  self.scaler = torch.cuda.amp.GradScaler()
  centroids = torch.load(centroids_path, map_location='cpu')
  avg_residual = torch.load(avgresidual_path, map_location='cpu')
  bucket_cutoffs, bucket_weights = torch.load(buckets_path, map_location='cpu')
  ivf, ivf_lengths = torch.load(os.path.join(self.index_path, "ivf.pid.pt"), map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 5940.94it/s]

[Mar 28, 20:42:32] #> Loading codes and residuals...



  return torch.load(codes_path, map_location='cpu')
  return torch.load(residuals_path, map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 760.25it/s]
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Administration and Management, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([ 101,    1, 3447, 1998, 2968,  102,  103,  103,  103,  103,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

🔍 Searching Resume ID: 5...
New index_name received! Updating current index_name (resume_4) to resume_5
Loading searcher for index resume_5 for the first time... This may take a few seconds
[Mar 28, 20:42:38] #> Loading codec...
[Mar 28, 20:42:38] #> Loading IVF...
[Mar 28, 20:42:38] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 3446.43it/s]

[Mar 28, 20:42:38] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 1232.89it/s]

Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Administration and Management, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([ 101,    1, 3447, 1998, 2968,  102,  103,  103,  103,  103,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])







🎯 Highest Raw Similarity Score using Ragatouille:
     resume_id                       query  \
125          5  Engineering and Technology   

                                  matched_resume_chunk  similarity_score  
125  Reports and Forecasts Education Details PGP in...           16.3687  
✅ Similarity matrix saved as 'ragatouille_resume_knowledge_similarity_matrix.csv'.


In [38]:

# Filter similarity scores >= 10
df_filtered = df_similarity[df_similarity["similarity_score"] >= 10]

# Display the filtered results
print("\n🎯 Filtered Results (Scores >= 10):")
print(df_filtered)


🎯 Filtered Results (Scores >= 10):
     resume_id                          query  \
12           4  Customer and Personal Service   
24           4      Computers and Electronics   
25           4      Computers and Electronics   
66           4         Education and Training   
67           4         Education and Training   
93           4       Communications and Media   
94           4       Communications and Media   
123          5      Computers and Electronics   
125          5     Engineering and Technology   
126          5     Engineering and Technology   
135          5                    Mathematics   
161          5         Education and Training   

                                  matched_resume_chunk  similarity_score  
12   Face recognition is the recognizing a special ...           12.3463  
24   Expertise Data and Quantitative Analysis Decis...           11.4292  
25   Reports and Forecasts Education Details PGP in...           10.3551  
66   Reports and Forecasts

In [39]:
# Load processed O*NET knowledge data (which includes job mapping)
df_onet_knowledge = pd.read_csv("../data/annotations_scenario_1/processed_onet_knowledge.csv")

# Filter similarity scores >= 10
df_filtered = df_similarity[df_similarity["similarity_score"] >= 10]

# Merge similarity results with O*NET knowledge data (to get jobs)
df_merged = df_filtered.merge(df_onet_knowledge, left_on="query", right_on="knowledge_entity", how="left")

# Keep only relevant columns
df_merged = df_merged[["resume_id", "query", "matched_resume_chunk", "similarity_score", "data_value", "onetsoc_code", "job_title"]]

# Display merged dataset
print("\n🔍 Merged Data with Job Titles:")
print(df_merged.head())



🔍 Merged Data with Job Titles:
   resume_id                          query  \
0          4  Customer and Personal Service   
1          4  Customer and Personal Service   
2          4  Customer and Personal Service   
3          4  Customer and Personal Service   
4          4  Customer and Personal Service   

                                matched_resume_chunk  similarity_score  \
0  Face recognition is the recognizing a special ...           12.3463   
1  Face recognition is the recognizing a special ...           12.3463   
2  Face recognition is the recognizing a special ...           12.3463   
3  Face recognition is the recognizing a special ...           12.3463   
4  Face recognition is the recognizing a special ...           12.3463   

   data_value onetsoc_code                        job_title  
0        4.39   11-1011.00                 Chief Executives  
1        5.94   11-1011.00                 Chief Executives  
2        3.41   11-1011.03    Chief Sustainability Off

In [40]:
# Compute the weighted job score: Sum of (similarity_score * data_value) per job
df_job_scores = df_merged.groupby(["resume_id", "job_title"]) \
    .apply(lambda x: (x["similarity_score"] * x["data_value"]).sum()) \
    .reset_index(name="job_score")

# Display computed job scores
print("\n🏆 Predicted Job Scores:")
print(df_job_scores.head())

# Save to CSV
# df_job_scores.to_csv("../data/annotations_scenario_1/ragatouille_predicted_jobs.csv", index=False)
print("✅ Predicted job scores saved as 'ragatouille_predicted_jobs.csv'.")



🏆 Predicted Job Scores:
   resume_id                 job_title   job_score
0          4  Accountants and Auditors  493.753572
1          4                    Actors  495.708854
2          4                 Actuaries  482.022268
3          4            Acupuncturists  548.439976
4          4         Acute Care Nurses  547.672771
✅ Predicted job scores saved as 'ragatouille_predicted_jobs.csv'.


  .apply(lambda x: (x["similarity_score"] * x["data_value"]).sum()) \


In [41]:
# Find the best matching job per resume (highest job score)
df_best_jobs = df_job_scores.loc[df_job_scores.groupby("resume_id")["job_score"].idxmax()]

# Display the best jobs
print("\n🎯 Final Predicted Jobs for Each Resume:")
print(df_best_jobs)

# Save to CSV
df_best_jobs.to_csv("../data/annotations_scenario_1/ragatouille_final_predicted_jobs.csv", index=False)
# print("✅ Final predicted jobs saved as 'ragatouille_final_predicted_jobs.csv'.")



🎯 Final Predicted Jobs for Each Resume:
      resume_id                                 job_title   job_score
162           4  Computer Science Teachers, Postsecondary  781.786266
1148          5       Engineering Teachers, Postsecondary  709.636067


In [48]:
df_resumes.loc[df_resumes["resume_id"] == 4, "resume_text"][0]

'Expertise Data and Quantitative Analysis Decision Analytics Predictive Modeling Data-Driven Personalization KPI Dashboards Big Data Queries and Interpretation Data Mining and Visualization Tools Machine Learning Algorithms Business Intelligence ( BI ) Research, Reports and Forecasts Education Details PGP in Data Science Mumbai, Maharashtra Aegis School of data science & Business B. E. in Electronics & Communication Electronics & Communication Indore, Madhya Pradesh IES IPS Academy Data Scientist Data Scientist with PR Canada Skill Details Algorithms- Exprience - 6 months BI- Exprience - 6 months Business Intelligence- Exprience - 6 months Machine Learning- Exprience - 24 months Visualization- Exprience - 24 months spark- Exprience - 24 months python- Exprience - 36 months tableau- Exprience - 36 months Data Analysis- Exprience - 24 monthsCompany Details company - Aegis school of Data Science & Business description - Mostly working on industry project for providing solution along with 

In [52]:
# Load resumes

# Load O*NET job descriptions
df_onet_jobs = pd.read_csv("../data/annotations_scenario_1/processed_onet_occupation.csv")

# Display job descriptions to verify structure
print(df_onet_jobs.head())


  onetsoc_code                            job_title  \
0   11-1011.00                     Chief Executives   
1   11-1011.03        Chief Sustainability Officers   
2   11-1021.00      General and Operations Managers   
3   11-1031.00                          Legislators   
4   11-2011.00  Advertising and Promotions Managers   

                                     job_description  
0  Determine and formulate policies and provide o...  
1  Communicate and coordinate with management, sh...  
2  Plan, direct, or coordinate the operations of ...  
3  Develop, introduce, or enact laws and statutes...  
4  Plan, direct, or coordinate advertising polici...  


In [56]:
# Define the research-specific job mapping
research_jobs = {
    "Data Science": ["Data Scientists", "15-2051.00"],
    "Human Resources": ["Human Resources Specialists", "13-1071.00"],
    "Advocate": ["Advocate", "21-1093.00"],
    "Web Designer": ["Web Designer", "15-1254.00"],
    "Mechanical Engineer": ["Mechanical Engineer", "17-2141.00"],
    "Sales": ["Sales Managers", "11-2022.00"],
    "Health and fitness": ["Wellness Coach", "11-9179.01"],
    "Civil Engineer": ["Civil Engineer", "17-2051.00"],
    "Java Developer": ["Java Developer", "15-1251.00"],
    "Business Analyst": ["Business Analyst", "13-1111.00"],
    "SAP Developer": ["Software Analyst", "15-1211.00"],
    "Automation Testing": ["Automation Tester", "15-1253.00"],
    "Electrical Engineering": ["Electrical Engineers", "17-2071.00"],
    "Operations Manager": ["Operations Manager", "11-1021.00"],
    "Python Developer": ["Programmer", "15-1251.00"],
    "DevOps Engineer": ["DevOps Engineer", "15-1252.00"],
    "Network Security Engineer": ["Network Security Engineer", "15-1299.04"],
    "PMO": ["Personnel Officer", "13-1071.00"],
    "Database": ["Database Manager", "15-1242.00"],
    "Hadoop": ["Data Storage Specialist", "15-1242.00"],
    "ETL Developer": ["Electronic Data Interchange System Developer (EDI System Developer)", "15-1299.08"],
    "DotNet Developer": [".NET Developer", "15-1252.00"],
    "Blockchain": ["Blockchain Developer", "15-1299.07"],
    "Testing": ["Tester", "15-1299.04"]
}

# Convert to a lookup DataFrame for easier filtering
df_research_jobs = pd.DataFrame([
    {"category": key, "job_title": value[0], "onetsoc_code": value[1]} for key, value in research_jobs.items()
])

# Display the research job list for verification
print(df_research_jobs)


                     category  \
0                Data Science   
1             Human Resources   
2                    Advocate   
3                Web Designer   
4         Mechanical Engineer   
5                       Sales   
6          Health and fitness   
7              Civil Engineer   
8              Java Developer   
9            Business Analyst   
10              SAP Developer   
11         Automation Testing   
12     Electrical Engineering   
13         Operations Manager   
14           Python Developer   
15            DevOps Engineer   
16  Network Security Engineer   
17                        PMO   
18                   Database   
19                     Hadoop   
20              ETL Developer   
21           DotNet Developer   
22                 Blockchain   
23                    Testing   

                                            job_title onetsoc_code  
0                                     Data Scientists   15-2051.00  
1                         Human Reso

In [60]:
# Load predicted jobs from ColBERT matching
df_predicted_jobs = pd.read_csv("./data/annotations_scenario_1/ragatouille_final_predicted_jobs.csv")

# Merge predicted jobs with research jobs to filter only relevant ones
df_filtered_jobs = df_predicted_jobs.merge(df_research_jobs, on="job_title", how="inner")

# Display filtered results
print("\n🎯 Filtered Predicted Jobs (Only Relevant to Research):")
print(df_filtered_jobs.head())

# Save filtered job predictions
df_filtered_jobs.to_csv("../data/annotations_scenario_1/ragatouille_filtered_research_jobs.csv", index=False)
print("✅ Filtered research jobs saved as 'ragatouille_filtered_research_jobs.csv'.")


FileNotFoundError: [Errno 2] No such file or directory: './data/annotations_scenario_1/ragatouille_final_predicted_jobs.csv'