In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
import pickle

# Occupation table with generated descriptions; the embedding, metadata export
# and search cells below all read from this frame.
INPUT_CSV = Path("../data/processed/nco_with_descriptions.csv")
REQUIRED_COLUMNS = ("nco_2015_code", "title", "description")

# dtype=str reads every column as text, so occupation codes are not parsed as floats.
df = pd.read_csv(INPUT_CSV, dtype=str)

# Fail fast here rather than after the embedding step or at query time.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{INPUT_CSV} is missing required columns: {missing_columns}")

print("Loaded:", df.shape)
df.head()


Loaded: (3598, 4)


Unnamed: 0,nco_2015_code,title,nco_2004_code,description
0,1111.01,"Elected Official, Union Government",1111.1,"Occupation Title: Elected Official, Union Gove..."
1,1111.02,"Elected Official, State Government",1112.1,"Occupation Title: Elected Official, State Gove..."
2,1111.03,"Elected Official, Local Bodies",1113.1,"Occupation Title: Elected Official, Local Bodi..."
3,1111.99,"Legislators, Other",1119.9,"Occupation Title: Legislators, Other. NCO 2015..."
4,1112.01,"Administrative Official, Union Government",1121.1,"Occupation Title: Administrative Official, Uni..."


In [3]:
# One embedding model is shared by the corpus encoding and the query encoding
# further down, so both sides live in the same vector space.
MODEL_NAME = "all-MiniLM-L6-v2"

model = SentenceTransformer(MODEL_NAME)
print("Model loaded successfully")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|█| 103/103 [00:00<00:00, 312.27it/s,
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully


In [4]:
# Embed every occupation description; row i of `embeddings` corresponds to row i of `df`.

# With dtype=str, empty CSV cells stay as float NaN, which the encoder cannot
# tokenize. Report that clearly instead of failing deep inside `model.encode`.
missing_descriptions = int(df["description"].isna().sum())
if missing_descriptions:
    raise ValueError(
        f"{missing_descriptions} rows have no description; "
        "fill or drop them before embedding."
    )

texts = df["description"].tolist()

embeddings = model.encode(texts, show_progress_bar=True)
# FAISS works on C-contiguous float32 matrices; this converts only if needed.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")

print("Embeddings shape:", embeddings.shape)


Batches: 100%|██████████| 113/113 [00:30<00:00,  3.76it/s]


Embeddings shape: (3598, 384)


In [5]:
# Exact (brute-force) L2 index over the description embeddings.
dimension = embeddings.shape[1]

index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Search hits are later resolved by position into the records built from `df`,
# so the index and the frame must hold the same number of rows.
if index.ntotal != len(df):
    raise ValueError(
        f"Index holds {index.ntotal} vectors but df has {len(df)} rows; "
        "re-run the embedding cell so both are built from the same data."
    )

print("FAISS index created. Total vectors:", index.ntotal)


FAISS index created. Total vectors: 3598


In [7]:
# Persist the search artefacts side by side: the FAISS index and the
# per-row records that its result positions refer to.
VECTORSTORE_DIR = Path("../data/vectorstore")
VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)

index_path = VECTORSTORE_DIR / "faiss_index.bin"
faiss.write_index(index, str(index_path))  # write_index is given a plain str path
print("Saved FAISS index")

# One dict per DataFrame row, in row order.
metadata = df.to_dict(orient="records")

# NOTE: pickle files can execute code when loaded; only load this file from a
# trusted location.
metadata_path = VECTORSTORE_DIR / "metadata.pkl"
with metadata_path.open("wb") as fh:
    pickle.dump(metadata, fh)

print("Saved metadata")


Saved FAISS index
Saved metadata


In [8]:
def semantic_search(query, k=5):
    """Return the occupations whose description embeddings are nearest to `query`.

    The query is embedded with the notebook-level `model`, searched against the
    notebook-level FAISS `index`, and each hit is resolved by position through
    the notebook-level `metadata` list.

    Args:
        query: Free-text search string.
        k: Maximum number of results. Values larger than the number of indexed
            vectors are clamped; values below 1 yield an empty list.

    Returns:
        A list of dicts ordered as returned by the index (nearest first), each
        with the keys "rank" (1-based), "distance" (the value reported by the
        index; squared L2 for the IndexFlatL2 built in this notebook),
        "nco_2015_code" and "title".
    """
    # Never ask for more neighbours than exist: FAISS pads the result with
    # index -1, and metadata[-1] would silently return the last record.
    k = min(int(k), index.ntotal)
    if k < 1:
        return []

    query_vec = np.asarray(model.encode([query]), dtype="float32")
    distances, indices = index.search(query_vec, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx < 0:  # padded slot, not a real hit
            continue
        record = metadata[int(idx)]
        results.append({
            "rank": len(results) + 1,
            "distance": float(dist),
            "nco_2015_code": record["nco_2015_code"],
            "title": record["title"],
        })
    return results


# Smoke test: show the ten nearest occupations for a one-word query.
semantic_search(query="electrician", k=10)


[{'rank': 1,
  'distance': 0.858837366104126,
  'nco_2015_code': '7411.0100',
  'title': 'Electrician, General'},
 {'rank': 2,
  'distance': 0.8636763691902161,
  'nco_2015_code': '2152.0400',
  'title': 'Electro-Optical Engineer'},
 {'rank': 3,
  'distance': 0.9083679914474487,
  'nco_2015_code': '9313.0501',
  'title': 'Helper Electrician'},
 {'rank': 4,
  'distance': 0.9191936254501343,
  'nco_2015_code': '8122.3500',
  'title': 'Galvanizer/Operator – Electroplating,'},
 {'rank': 5,
  'distance': 0.9567441940307617,
  'nco_2015_code': '7411.0600',
  'title': 'Electrician, Stage and Studio'},
 {'rank': 6,
  'distance': 0.9616852402687073,
  'nco_2015_code': '2111.0600',
  'title': 'Physicist, Electricity and Magnetism'},
 {'rank': 7,
  'distance': 0.9629936218261719,
  'nco_2015_code': '3123.0400',
  'title': 'Electrical Supervisor, Wiring,'},
 {'rank': 8,
  'distance': 0.9657101631164551,
  'nco_2015_code': '3113.0200',
  'title': 'Electrical Technician (High Voltage)'},
 {'rank': 9