# 1. Data Scraping Phase


In [None]:
!pip install requests beautifulsoup4 pandas




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
from concurrent.futures import ThreadPoolExecutor, as_completed


In [None]:
# Root URL of the SHL product catalog; listing pages are selected via query parameters.
BASE_URL = "https://www.shl.com/products/product-catalog/"

# Headers sent with every request; the User-Agent string identifies as desktop Chrome.
HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0 Safari/537.36",
        ]
    )
}


In [None]:
def fetch_assessment_details(assessment):
    """Fill in ``description`` and ``duration`` for one catalog entry.

    Downloads the entry's detail page and scans its
    ``product-catalogue-training-calendar__row typ`` blocks for the
    "Description" and "Assessment length" sections.

    Args:
        assessment: Dict describing one catalog row; ``assessment["url"]`` is
            the detail-page URL. The dict is updated in place.

    Returns:
        The same ``assessment`` dict. It is returned unchanged when the URL is
        missing or empty, when the response status is not 200, or when the
        request/parsing raises (the error is printed, not propagated).
    """
    url = assessment.get("url")

    # scrape_table stores "" for rows without a link; there is nothing to
    # download for those, so skip the request instead of failing inside it.
    if not url:
        return assessment

    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            return assessment

        soup = BeautifulSoup(response.text, "html.parser")

        description = "N/A"
        duration = "N/A"

        rows = soup.find_all(
            "div",
            class_="product-catalogue-training-calendar__row typ"
        )

        for row in rows:
            header = row.find("h4")
            paragraph = row.find("p")
            if not header or not paragraph:
                continue

            title = header.get_text(strip=True).lower()

            # ---- Description ----
            if title == "description":
                description = paragraph.get_text(" ", strip=True)

            # ---- Duration ----
            elif title == "assessment length":
                text = paragraph.get_text(" ", strip=True).lower()
                # Only the number that follows "=" is kept.
                match = re.search(r"=\s*(\d+)", text)
                if match:
                    duration = f"{match.group(1)} minutes"

        assessment["description"] = description
        assessment["duration"] = duration

    except Exception as e:
        # Best effort: one failing page must not abort the whole scrape.
        print(f"❌ Error fetching {url}: {e}")

    return assessment


In [None]:
def scrape_table(table):
    """Convert one catalog listing table into a list of assessment dicts.

    Args:
        table: Parsed ``<table>`` element (BeautifulSoup-style ``find`` /
            ``find_all`` API). The first ``<tr>`` is treated as the header.

    Returns:
        One dict per data row with keys ``name``, ``url``, ``test_type``,
        ``remote_testing``, ``adaptive_irt``, ``duration`` and ``description``.
        ``duration`` and ``description`` are ``"N/A"`` placeholders. Rows with
        fewer than four cells are skipped.
    """
    from urllib.parse import urljoin  # local import keeps this block self-contained

    assessments = []

    for row in table.find_all("tr")[1:]:  # skip header
        cols = row.find_all("td")
        if len(cols) < 4:
            continue

        # ---- Name & URL ----
        name_tag = cols[0].find("a")
        name = name_tag.text.strip() if name_tag else "Unknown"
        # urljoin resolves site-relative hrefs against the host and leaves
        # absolute hrefs untouched; an anchor without href yields "".
        href = name_tag.get("href") if name_tag else None
        url = urljoin("https://www.shl.com", href) if href else ""

        # ---- Remote Testing ----
        remote_testing = "Yes" if cols[1].find("span", class_="catalogue__circle -yes") else "No"

        # ---- Adaptive / IRT ----
        adaptive_irt = "Yes" if cols[2].find("span", class_="catalogue__circle -yes") else "No"

        # ---- Test Type ----
        test_keys = cols[3].find_all("span", class_="product-catalogue__key")
        test_type = ", ".join(k.text.strip() for k in test_keys) if test_keys else "N/A"

        assessments.append({
            "name": name,
            "url": url,
            "test_type": test_type,
            "remote_testing": remote_testing,
            "adaptive_irt": adaptive_irt,
            "duration": "N/A",
            "description": "N/A"
        })

    return assessments


In [None]:
def scrape_pages_for_type(type_param, max_pages, label):
    """Scrape up to ``max_pages`` listing pages of one catalog section.

    Pages are requested with ``start`` offsets 0, 12, 24, ... and the given
    ``type`` query parameter. Scraping stops early when a request fails,
    returns a non-200 status, contains no ``<table>``, or yields no rows.

    Args:
        type_param: Value of the ``type`` query parameter.
        max_pages: Maximum number of listing pages to request.
        label: Prefix used in the progress messages.

    Returns:
        List of assessment dicts collected from all pages fetched so far.
    """
    all_assessments = []

    for start in range(0, max_pages * 12, 12):
        url = f"{BASE_URL}?start={start}&type={type_param}"
        print(f"[{label}] Scraping → {url}")

        try:
            # Same timeout as fetch_assessment_details: without one a stalled
            # connection would hang the notebook indefinitely.
            response = requests.get(url, headers=HEADERS, timeout=15)
        except requests.RequestException as e:
            print(f"[{label}] ❌ Failed to fetch page: {e}")
            break

        if response.status_code != 200:
            print(f"[{label}] ❌ Failed to fetch page")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table")

        if not table:
            print(f"[{label}] 🚫 No table found. Stopping.")
            break

        assessments = scrape_table(table)
        if not assessments:
            break

        all_assessments.extend(assessments)
        time.sleep(1)  # polite scraping

    return all_assessments


In [None]:
def scrape_shl_catalog():
    """Scrape both catalog sections and return them as a single DataFrame.

    Listing pages are scraped first (``type=2`` with up to 12 pages, then
    ``type=1`` with up to 32 pages). Each entry's detail page is then fetched
    on a pool of three worker threads, which fills ``description`` and
    ``duration`` in place. Progress is printed after every tenth completed
    fetch.

    Returns:
        ``pandas.DataFrame`` with one row per scraped assessment.
    """
    print("🔍 Scraping Pre-packaged Job Solutions...")
    prepackaged = scrape_pages_for_type(label="Pre-packaged", max_pages=12, type_param=2)

    print("\n🔍 Scraping Individual Test Solutions...")
    individual = scrape_pages_for_type(label="Individual", max_pages=32, type_param=1)

    catalog = prepackaged + individual
    total = len(catalog)
    print(f"\n📦 Total assessments found: {total}")

    print("\n🔍 Fetching descriptions & durations...")
    with ThreadPoolExecutor(max_workers=3) as pool:
        pending = [pool.submit(fetch_assessment_details, entry) for entry in catalog]

        finished = 0
        for _ in as_completed(pending):
            finished += 1
            if finished % 10 == 0:
                print(f"Progress: {finished}/{total}")

    return pd.DataFrame(catalog)


In [None]:
# Run the full scrape (issues HTTP requests) and preview the first rows.
df = scrape_shl_catalog()
df.head()


🔍 Scraping Pre-packaged Job Solutions...
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=0&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=12&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=24&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=36&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=48&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=60&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=72&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=84&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=96&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=108&type=2
[Pre-packaged] Scraping → https://www.shl.com/products/product-catalog/?start=1

Unnamed: 0,name,url,test_type,remote_testing,adaptive_irt,duration,description
0,Account Manager Solution,https://www.shl.com/products/product-catalog/v...,"C, P, A, B",Yes,Yes,49 minutes,The Account Manager solution is an assessment ...
1,Administrative Professional - Short Form,https://www.shl.com/products/product-catalog/v...,"A, K, P",Yes,Yes,36 minutes,The Administrative Professional solution is fo...
2,Agency Manager Solution,https://www.shl.com/products/product-catalog/v...,"A, B, P, S",Yes,Yes,51 minutes,The Agency Manager solution is for mid-level s...
3,Apprentice + 8.0 Job Focused Assessment,https://www.shl.com/products/product-catalog/v...,"B, P",Yes,No,30 minutes,The Apprentice + 8.0 Job-Focused Assessment is...
4,Apprentice 8.0 Job Focused Assessment,https://www.shl.com/products/product-catalog/v...,"B, P",Yes,No,20 minutes,The Apprentice 8.0 Job-Focused Assessment is a...


In [None]:
# Persist the scraped catalog; index=False keeps the row index out of the CSV.
output_file = "shl_assessments.csv"
df.to_csv(output_file, index=False)
print(f"✅ Saved {len(df)} records to {output_file}")


✅ Saved 518 records to shl_assessments.csv


# Clean the Scraped Data

In [None]:
import pandas as pd
import re
from google.colab import files

# Colab upload widget; the first uploaded file is taken as the scraped CSV.
uploaded = files.upload()
input_path = list(uploaded.keys())[0]

df = pd.read_csv(input_path)

# Remove repeated catalog entries (same name and URL) and renumber the rows.
df = df.drop_duplicates(subset=["name", "url"]).reset_index(drop=True)

def clean_test_type(x):
    """Normalize a comma-separated list of test-type codes.

    Args:
        x: Raw ``test_type`` cell, e.g. ``"C, P, A, B"``.

    Returns:
        The unique codes, sorted and joined with ``","`` (``"A,B,C,P"``), or
        ``None`` when the value is missing, is the scraper's ``"N/A"``
        placeholder, or contains no codes at all.
    """
    if pd.isna(x):
        return None
    # Blank fragments (stray commas/whitespace) and the "N/A" placeholder are
    # not codes; clean_duration treats "N/A" as missing in the same way.
    codes = {part.strip() for part in str(x).split(",")} - {"", "N/A"}
    return ",".join(sorted(codes)) if codes else None

df["test_type"] = df["test_type"].apply(clean_test_type)

# Encode the Yes/No flags as 1/0; Series.map leaves any other value as NaN.
binary_map = {"Yes": 1, "No": 0}
df["remote_testing"] = df["remote_testing"].map(binary_map)
df["adaptive_irt"] = df["adaptive_irt"].map(binary_map)

def clean_duration(x):
    """Extract a duration in minutes from a raw ``duration`` cell.

    Returns:
        The first run of digits in ``str(x)`` as an ``int``, or ``None`` when
        the value is missing, equals ``"N/A"``, or contains no digits.
    """
    if pd.isna(x) or x == "N/A":
        return None
    digit_runs = re.findall(r"\d+", str(x))
    if not digit_runs:
        return None
    return int(digit_runs[0])

# Replace the free-text "duration" column with a numeric "duration_minutes" one.
df["duration_minutes"] = df["duration"].apply(clean_duration)
df = df.drop(columns=["duration"])

def clean_text(x):
    """Collapse each run of whitespace into one space and trim both ends.

    Returns:
        The normalized ``str(x)``, or ``None`` when ``x`` is missing.
    """
    if not pd.isna(x):
        collapsed = re.sub(r"\s+", " ", str(x))
        return collapsed.strip()
    return None

df["description"] = df["description"].apply(clean_text)
# desc_length is a word count (whitespace-separated tokens), 0 for missing text.
df["desc_length"] = df["description"].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# Rows lacking any of the identifying fields are discarded.
df = df.dropna(subset=["name", "url", "test_type"])

output_path = "shl_assessments_clean.csv"
df.to_csv(output_path, index=False)

print("✅ Cleaning completed")
print("Rows:", len(df))
print("Saved as:", output_path)

# Colab helper: triggers a browser download of the cleaned CSV.
files.download(output_path)


Saving shl_assessments (3).csv to shl_assessments (3) (1).csv
✅ Cleaning completed
Rows: 506
Saved as: shl_assessments_clean.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# 2. Processing Phase (Build Embeddings and Metadata)

In [None]:
!pip install -q sentence-transformers
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from google.colab import files

# Colab upload widget; the first uploaded file is taken as the cleaned CSV.
uploaded = files.upload()
input_path = list(uploaded.keys())[0]

df = pd.read_csv(input_path)

# Rows without a description cannot be embedded. Renumbering keeps row i of
# the frame paired with row i of the embedding matrix.
df = df.dropna(subset=["description"]).reset_index(drop=True)

texts = df["description"].tolist()

model = SentenceTransformer("all-MiniLM-L6-v2")

# normalize_embeddings=True requests length-normalized vectors from encode().
embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)

np.save("shl_embeddings.npy", embeddings)

# Keep the description in the metadata file: the API cell builds its response
# with row.get("description", ""), so dropping the column here left that field
# empty for every recommendation.
metadata = df.copy()
metadata.to_csv("shl_metadata.csv", index=False)

print("Embeddings shape:", embeddings.shape)
print("Files generated:")
print(" - shl_embeddings.npy")
print(" - shl_metadata.csv")

files.download("shl_embeddings.npy")
files.download("shl_metadata.csv")




Saving shl_assessments_clean.csv to shl_assessments_clean (1).csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Embeddings shape: (506, 384)
Files generated:
 - shl_embeddings.npy
 - shl_metadata.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np
# Sanity check of the saved matrix: its shape and the L2 norm of the first row.
emb = np.load("shl_embeddings.npy")
print(emb.shape)
print(np.linalg.norm(emb[0]))


(506, 384)
1.0


# PHASE 3A — RETRIEVAL EVALUATION (SLUG-BASED)

In [None]:
!pip install -q faiss-cpu sentence-transformers openpyxl
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from google.colab import files
import re

# The files are read by fixed name from the working directory, not from
# `uploaded`. NOTE(review): the captured output shows Colab saving re-uploads
# under a new name ("... (1).xlsx"), so these reads may pick up an older copy.
uploaded = files.upload()

embeddings = np.load("shl_embeddings.npy")
metadata = pd.read_csv("shl_metadata.csv")
labels = pd.read_excel("Gen_AI Dataset (2).xlsx")

# Align the label sheet's column names with the ones used in this notebook.
labels = labels.rename(columns={
    "Query": "query",
    "Assessment_url": "url"
})

def extract_slug(url):
    """Return the lowercase catalog slug that follows ``/view/`` in a URL.

    Args:
        url: Assessment URL; non-string values (e.g. NaN) count as missing.

    Returns:
        The path segment after ``/view/`` in lowercase, or ``None`` when
        ``url`` is not a string or has no such segment.
    """
    if not isinstance(url, str):
        return None
    url = url.strip().lower()
    # Stop at "/", "?", "#" or whitespace so a query string, fragment or stray
    # trailing space never becomes part of the slug used for matching.
    m = re.search(r"/view/([^/?#\s]+)", url)
    return m.group(1) if m else None

metadata["slug"] = metadata["url"].apply(extract_slug)
labels["slug"] = labels["url"].apply(extract_slug)

# FAISS returns row positions in `embeddings`, which are looked up with
# metadata.iloc. Slug-less rows are therefore removed from BOTH, otherwise
# every later position would point at the wrong metadata row.
assert len(metadata) == len(embeddings), "metadata and embeddings must have one row per assessment"
has_slug = metadata["slug"].notna().to_numpy()
embeddings = np.ascontiguousarray(embeddings[has_slug], dtype="float32")
metadata = metadata[has_slug].reset_index(drop=True)
labels = labels.dropna(subset=["slug"])

# Fixed-seed shuffle, then the first 20 labelled rows form the "train" split.
labels = labels.sample(frac=1, random_state=42).reset_index(drop=True)
train_df = labels.iloc[:20]
test_df = labels.iloc[20:]

model = SentenceTransformer("all-MiniLM-L6-v2")

# Exact inner-product index over the stored embeddings.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

def retrieve_slugs(query, k=5):
    """Return the slugs of the ``k`` index entries closest to ``query``.

    The query is encoded with ``model`` (normalized), searched against
    ``index``, and the hit positions are looked up in ``metadata``.
    """
    query_vec = model.encode([query], normalize_embeddings=True)
    hits = index.search(query_vec, k)[1]
    nearest_rows = metadata.iloc[hits[0]]
    return nearest_rows["slug"].tolist()

def evaluate(df, k=5):
    """Compute Recall@k and MRR over labelled (query, slug) rows.

    Each row is scored on its own: it is a hit when its ground-truth slug is
    among the ``k`` slugs returned by ``retrieve_slugs``. A hit at rank ``r``
    adds ``1/r`` to the MRR sum; a miss adds nothing.

    Args:
        df: DataFrame with ``query`` and ``slug`` columns.
        k: Number of results retrieved per row.

    Returns:
        ``(recall, mrr)``, both averaged over ``len(df)``. An empty ``df``
        yields ``(0.0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    total = len(df)
    if total == 0:
        return 0.0, 0.0

    recall_hits = 0
    mrr_total = 0

    for _, row in df.iterrows():
        retrieved = retrieve_slugs(row["query"], k)
        gt_slug = row["slug"]

        if gt_slug in retrieved:
            recall_hits += 1
            rank = retrieved.index(gt_slug) + 1
            mrr_total += 1 / rank

    return recall_hits / total, mrr_total / total

# Score both splits with the default k=5, which is why the report says "Recall@5".
train_recall, train_mrr = evaluate(train_df)
test_recall, test_mrr = evaluate(test_df)

print("PHASE 3A — RETRIEVAL EVALUATION (SLUG-BASED)")
print("==========================================")
print(f"Train Queries: {len(train_df)}")
print(f"Test Queries : {len(test_df)}\n")
print(f"Train Recall@5: {train_recall:.4f}")
print(f"Train MRR     : {train_mrr:.4f}\n")
print(f"Test Recall@5 : {test_recall:.4f}")
print(f"Test MRR      : {test_mrr:.4f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[?25h



Saving Gen_AI Dataset (2).xlsx to Gen_AI Dataset (2) (1).xlsx
Saving shl_metadata.csv to shl_metadata (1).csv
Saving shl_embeddings.npy to shl_embeddings.npy


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

PHASE 3A — RETRIEVAL EVALUATION (SLUG-BASED)
Train Queries: 20
Test Queries : 45

Train Recall@5: 0.0500
Train MRR     : 0.0250

Test Recall@5 : 0.0667
Test MRR      : 0.0185


In [None]:
import pandas as pd
import numpy as np
from google.colab import files

# Diagnostic cell: print a few labelled URLs next to a few catalog URLs so
# their formats can be compared.
uploaded = files.upload()

labels = pd.read_excel("Gen_AI Dataset (2).xlsx")
metadata = pd.read_csv("shl_metadata.csv")

labels = labels.rename(columns={
    "Query": "query",
    "Assessment_url": "url"
})

print("Sample labeled URLs:")
print(labels["url"].head(5).tolist())

print("\nSample metadata URLs:")
print(metadata["url"].head(5).tolist())


Saving Gen_AI Dataset (2).xlsx to Gen_AI Dataset (2).xlsx
Saving shl_metadata.csv to shl_metadata.csv
Sample labeled URLs:
['https://www.shl.com/solutions/products/product-catalog/view/automata-fix-new/', 'https://www.shl.com/solutions/products/product-catalog/view/core-java-entry-level-new/', 'https://www.shl.com/solutions/products/product-catalog/view/java-8-new/', 'https://www.shl.com/solutions/products/product-catalog/view/core-java-advanced-level-new/', 'https://www.shl.com/products/product-catalog/view/interpersonal-communications/']

Sample metadata URLs:
['https://www.shl.com/products/product-catalog/view/account-manager-solution/', 'https://www.shl.com/products/product-catalog/view/administrative-professional-short-form/', 'https://www.shl.com/products/product-catalog/view/agency-manager-solution/', 'https://www.shl.com/products/product-catalog/view/apprentice-8-0-job-focused-assessment-4261/', 'https://www.shl.com/products/product-catalog/view/apprentice-8-0-job-focused-asses

# PHASE 3B — LLM QUERY UNDERSTANDING + RETRIEVAL

In [None]:
!pip install -q faiss-cpu sentence-transformers transformers openpyxl
import numpy as np
import pandas as pd
import faiss
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from google.colab import files

# The files are read by fixed name from the working directory, not from
# `uploaded`. NOTE(review): the captured output shows Colab saving re-uploads
# under a new name ("... (2).xlsx"), so these reads may pick up an older copy.
uploaded = files.upload()

embeddings = np.load("shl_embeddings.npy")
metadata = pd.read_csv("shl_metadata.csv")
labels = pd.read_excel("Gen_AI Dataset (2).xlsx")

# Align the label sheet's column names with the ones used in this notebook.
labels = labels.rename(columns={
    "Query": "query",
    "Assessment_url": "url"
})

def extract_slug(url):
    """Return the lowercase catalog slug that follows ``/view/`` in a URL.

    Args:
        url: Assessment URL; non-string values (e.g. NaN) count as missing.

    Returns:
        The path segment after ``/view/`` in lowercase, or ``None`` when
        ``url`` is not a string or has no such segment.
    """
    if not isinstance(url, str):
        return None
    # Stop at "/", "?", "#" or whitespace so a query string, fragment or stray
    # trailing space never becomes part of the slug used for matching.
    m = re.search(r"/view/([^/?#\s]+)", url.strip().lower())
    return m.group(1) if m else None

metadata["slug"] = metadata["url"].apply(extract_slug)
labels["slug"] = labels["url"].apply(extract_slug)

# FAISS returns row positions in `embeddings`, which are looked up with
# metadata.iloc. Slug-less rows are therefore removed from BOTH, otherwise
# every later position would point at the wrong metadata row.
assert len(metadata) == len(embeddings), "metadata and embeddings must have one row per assessment"
has_slug = metadata["slug"].notna().to_numpy()
embeddings = np.ascontiguousarray(embeddings[has_slug], dtype="float32")
metadata = metadata[has_slug].reset_index(drop=True)
labels = labels.dropna(subset=["slug"])

# Fixed-seed shuffle, then the first 20 labelled rows form the "train" split.
labels = labels.sample(frac=1, random_state=42).reset_index(drop=True)
train_df = labels.iloc[:20]
test_df = labels.iloc[20:]

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Exact inner-product index over the stored embeddings.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Text-to-text model used to rewrite each query before it is embedded.
rewriter = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=128
)

def rewrite_query(query):
    """Rewrite a user query as catalog-style text using the ``rewriter`` pipeline.

    Returns:
        The ``generated_text`` of the pipeline's first result; sampling is
        disabled (``do_sample=False``).
    """
    instructions = (
        "You are an expert in talent assessment systems.\n"
        "Rewrite the following user query into a detailed assessment description "
        "that would appear in a professional HR assessment catalog.\n\n"
    )
    prompt = instructions + f"Query: {query}\n\nRewritten description:"
    generations = rewriter(prompt, do_sample=False)
    return generations[0]["generated_text"]

def retrieve_slugs(query, k=5):
    """Return the slugs of the ``k`` index entries closest to the rewritten query.

    The query is first passed through ``rewrite_query``. The rewritten text is
    encoded with ``embedder`` (normalized) and searched against ``index``.
    """
    expanded = rewrite_query(query)
    query_vec = embedder.encode([expanded], normalize_embeddings=True)
    hits = index.search(query_vec, k)[1]
    nearest_rows = metadata.iloc[hits[0]]
    return nearest_rows["slug"].tolist()

def evaluate(df, k=5):
    """Compute Recall@k and MRR over labelled (query, slug) rows.

    Each row is scored on its own: it is a hit when its ground-truth slug is
    among the ``k`` slugs returned by ``retrieve_slugs``. A hit at rank ``r``
    adds ``1/r`` to the MRR sum; a miss adds nothing.

    Args:
        df: DataFrame with ``query`` and ``slug`` columns.
        k: Number of results retrieved per row.

    Returns:
        ``(recall, mrr)``, both averaged over ``len(df)``. An empty ``df``
        yields ``(0.0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    total = len(df)
    if total == 0:
        return 0.0, 0.0

    recall_hits = 0
    mrr_total = 0

    for _, row in df.iterrows():
        retrieved = retrieve_slugs(row["query"], k)
        if row["slug"] in retrieved:
            recall_hits += 1
            rank = retrieved.index(row["slug"]) + 1
            mrr_total += 1 / rank

    return recall_hits / total, mrr_total / total

# Score both splits with the default k=5, which is why the report says "Recall@5".
train_recall, train_mrr = evaluate(train_df)
test_recall, test_mrr = evaluate(test_df)

print("PHASE 3B — LLM QUERY UNDERSTANDING + RETRIEVAL")
print("==============================================")
print(f"Train Queries: {len(train_df)}")
print(f"Test Queries : {len(test_df)}\n")
print(f"Train Recall@5: {train_recall:.4f}")
print(f"Train MRR     : {train_mrr:.4f}\n")
print(f"Test Recall@5 : {test_recall:.4f}")
print(f"Test MRR      : {test_mrr:.4f}")


Saving Gen_AI Dataset (2).xlsx to Gen_AI Dataset (2) (2).xlsx
Saving shl_metadata.csv to shl_metadata (2).csv
Saving shl_embeddings.npy to shl_embeddings (1).npy


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (743 > 512). Running this sequence through the model will result in indexing errors


PHASE 3B — LLM QUERY UNDERSTANDING + RETRIEVAL
Train Queries: 20
Test Queries : 45

Train Recall@5: 0.0500
Train MRR     : 0.0250

Test Recall@5 : 0.0667
Test MRR      : 0.0333


# PHASE 3C — LLM RE-RANKING (FINAL)

In [None]:
!pip install -q faiss-cpu sentence-transformers transformers openpyxl
import numpy as np
import pandas as pd
import faiss
import re
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from google.colab import files

# The files are read by fixed name from the working directory, not from
# `uploaded`. NOTE(review): the captured output shows Colab saving re-uploads
# under a new name ("... (4).xlsx"), so these reads may pick up an older copy.
uploaded = files.upload()

embeddings = np.load("shl_embeddings.npy")
metadata = pd.read_csv("shl_metadata.csv")
labels = pd.read_excel("Gen_AI Dataset (2).xlsx")

# Align the label sheet's column names with the ones used in this notebook.
labels = labels.rename(columns={
    "Query": "query",
    "Assessment_url": "url"
})

def extract_slug(url):
    """Return the lowercase catalog slug that follows ``/view/`` in a URL.

    Args:
        url: Assessment URL; non-string values (e.g. NaN) count as missing.

    Returns:
        The path segment after ``/view/`` in lowercase, or ``None`` when
        ``url`` is not a string or has no such segment.
    """
    if not isinstance(url, str):
        return None
    # Stop at "/", "?", "#" or whitespace so a query string, fragment or stray
    # trailing space never becomes part of the slug used for matching.
    m = re.search(r"/view/([^/?#\s]+)", url.strip().lower())
    return m.group(1) if m else None

metadata["slug"] = metadata["url"].apply(extract_slug)
labels["slug"] = labels["url"].apply(extract_slug)

# FAISS returns row positions in `embeddings`, which are looked up with
# metadata.iloc. Slug-less rows are therefore removed from BOTH, otherwise
# every later position would point at the wrong metadata row.
assert len(metadata) == len(embeddings), "metadata and embeddings must have one row per assessment"
has_slug = metadata["slug"].notna().to_numpy()
embeddings = np.ascontiguousarray(embeddings[has_slug], dtype="float32")
metadata = metadata[has_slug].reset_index(drop=True)
labels = labels.dropna(subset=["slug"])

# Fixed-seed shuffle, then the first 20 labelled rows form the "train" split.
labels = labels.sample(frac=1, random_state=42).reset_index(drop=True)
train_df = labels.iloc[:20]
test_df = labels.iloc[20:]

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Exact inner-product index over the stored embeddings.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Text-to-text model used to re-rank the retrieved candidates.
llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=256
)

def retrieve_candidates(query, k=10):
    """Return the metadata rows of the ``k`` index entries closest to ``query``.

    The result is re-indexed 0..k-1 in retrieval order.
    """
    query_vec = embedder.encode([query], normalize_embeddings=True)
    hits = index.search(query_vec, k)[1]
    nearest_rows = metadata.iloc[hits[0]]
    return nearest_rows.reset_index(drop=True)

def rerank(query, candidates):
    """Ask the LLM to order the candidates by relevance to ``query``.

    Builds a 1-based numbered prompt from each candidate's name, test types,
    remote/adaptive flags and duration, sends it to ``llm``, and parses every
    integer found in the generated text.

    Args:
        query: User query text.
        candidates: DataFrame whose index runs 0..n-1 (as returned by
            ``retrieve_candidates``); the index is used for the numbering.

    Returns:
        List of 0-based candidate positions in the order the model produced
        them. Out-of-range numbers are ignored and each position appears at
        most once. The list may be empty or shorter than ``len(candidates)``.
    """
    prompt = "You are an expert in talent assessment selection.\n"
    prompt += f"Query: {query}\n\nCandidates:\n"

    for i, row in candidates.iterrows():
        prompt += (
            f"{i+1}. Name: {row['name']}\n"
            f"   Test Types: {row['test_type']}\n"
            f"   Remote Testing: {row['remote_testing']}\n"
            f"   Adaptive IRT: {row['adaptive_irt']}\n"
            f"   Duration (minutes): {row.get('duration_minutes', 'N/A')}\n\n"
        )

    prompt += "Rank the candidates from most relevant to least relevant using indices only."

    output = llm(prompt, do_sample=False)[0]["generated_text"]

    ranked = []
    for token in re.findall(r"\d+", output):
        idx = int(token) - 1
        # Skip out-of-range numbers and repeats: a repeated index would list
        # the same candidate twice and shift the rank of everything after it.
        if 0 <= idx < len(candidates) and idx not in ranked:
            ranked.append(idx)

    return ranked

def evaluate(df, k=5):
    """Compute Recall@k and MRR of the retrieve-then-rerank pipeline.

    For each labelled row, ten candidates are retrieved and re-ordered by
    ``rerank``. The row is a hit when its ground-truth slug is within the
    first ``k`` re-ranked slugs, and a hit at rank ``r`` adds ``1/r`` to the
    MRR sum. A row for which ``rerank`` returns no usable order counts as a
    miss.

    Args:
        df: DataFrame with ``query`` and ``slug`` columns.
        k: Cut-off applied to the re-ranked list.

    Returns:
        ``(recall, mrr)``, both averaged over ``len(df)``. An empty ``df``
        yields ``(0.0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    total = len(df)
    if total == 0:
        return 0.0, 0.0

    recall_hits = 0
    mrr_total = 0

    for _, row in df.iterrows():
        candidates = retrieve_candidates(row["query"], k=10)
        order = rerank(row["query"], candidates)

        if not order:
            continue

        ranked_slugs = candidates.iloc[order]["slug"].tolist()

        if row["slug"] in ranked_slugs[:k]:
            recall_hits += 1
            rank = ranked_slugs.index(row["slug"]) + 1
            mrr_total += 1 / rank

    return recall_hits / total, mrr_total / total

# Score both splits with the default k=5, which is why the report says "Recall@5".
train_recall, train_mrr = evaluate(train_df)
test_recall, test_mrr = evaluate(test_df)

print("PHASE 3C — LLM RE-RANKING (FINAL)")
print("================================")
print(f"Train Recall@5: {train_recall:.4f}")
print(f"Train MRR     : {train_mrr:.4f}\n")
print(f"Test Recall@5 : {test_recall:.4f}")
print(f"Test MRR      : {test_mrr:.4f}")


Saving Gen_AI Dataset (2).xlsx to Gen_AI Dataset (2) (4).xlsx
Saving shl_metadata.csv to shl_metadata (4).csv
Saving shl_embeddings.npy to shl_embeddings (3).npy


Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1101 > 512). Running this sequence through the model will result in indexing errors


PHASE 3C — LLM RE-RANKING (FINAL)
Train Recall@5: 0.0000
Train MRR     : 0.0000

Test Recall@5 : 0.0667
Test MRR      : 0.0185


In [None]:
import faiss
import numpy as np

# Load the saved embedding matrix as float32 before handing it to FAISS.
embeddings = np.load("shl_embeddings.npy").astype("float32")

# Normalize embeddings BEFORE adding
# (normalize_L2 works in place on the array)
faiss.normalize_L2(embeddings)

# Exact inner-product index with one vector per embedding row.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

print("FAISS index size:", index.ntotal)


FAISS index size: 506


In [None]:
# Persist the index; the API cell loads it from /content/faiss_index.bin.
faiss.write_index(index, "faiss_index.bin")


In [3]:
# app.py
!pip install -q faiss-cpu sentence-transformers openpyxl
import re
import faiss
import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# NOTE(review): this cell runs inside the notebook kernel. A later cell writes
# a health-only stub with `%%writefile app.py`, and `uvicorn app:app` imports
# that file, so the code in this cell is not what the server serves. Confirm
# how app.py is meant to be produced.
app = FastAPI(
    title="SHL Assessment Recommendation API",
    version="1.0"
)

print("Loading models and data...")

# Absolute Colab path; the file is the metadata CSV written by the embedding cell.
metadata = pd.read_csv("/content/shl_metadata.csv")

def extract_slug(url):
    """Return the lowercase catalog slug that follows ``/view/`` in a URL.

    Args:
        url: Assessment URL; non-string values (e.g. NaN) count as missing.

    Returns:
        The path segment after ``/view/`` in lowercase, or ``None`` when
        ``url`` is not a string or has no such segment.
    """
    if not isinstance(url, str):
        return None
    # Stop at "/", "?", "#" or whitespace so a query string, fragment or stray
    # trailing space never becomes part of the slug.
    m = re.search(r"/view/([^/?#\s]+)", url.strip().lower())
    return m.group(1) if m else None

metadata["slug"] = metadata["url"].apply(extract_slug)

faiss_index = faiss.read_index("/content/faiss_index.bin")

# retrieve_candidates maps FAISS hit positions to rows with metadata.iloc, so
# metadata must keep exactly one row per indexed vector, in the same order.
# Rows without a slug are therefore kept (the slug is not used to build the
# response); dropping them would shift every later row onto the wrong vector.
if faiss_index.ntotal != len(metadata):
    raise RuntimeError(
        f"FAISS index has {faiss_index.ntotal} vectors but metadata has "
        f"{len(metadata)} rows; rebuild both from the same cleaned CSV."
    )

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Text-to-text model used to re-rank the retrieved candidates.
llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=256
)

print("Startup complete.")

class QueryRequest(BaseModel):
    """Request body of ``POST /recommend``: a single free-text ``query``."""
    query: str

def retrieve_candidates(query: str, k: int = 10) -> pd.DataFrame:
    """Return the metadata rows of the ``k`` index entries closest to ``query``.

    The query is encoded with ``embedder`` (normalized) and searched against
    ``faiss_index``. The result is re-indexed 0..k-1 in retrieval order.
    """
    query_vec = embedder.encode([query], normalize_embeddings=True)
    hits = faiss_index.search(query_vec, k)[1]
    nearest_rows = metadata.iloc[hits[0]]
    return nearest_rows.reset_index(drop=True)

def rerank(query: str, candidates: pd.DataFrame):
    """Ask the LLM to order ``candidates`` by relevance to ``query``.

    A 1-based numbered prompt is built from each candidate's name, test
    types, remote/adaptive flags and duration. Every integer in the generated
    text is then read as a candidate number.

    Returns:
        List of 0-based candidate positions in the model's order, without
        repeats or out-of-range values. The list may be empty.
    """
    sections = [
        "You are an expert in talent assessment selection.\n",
        f"Query: {query}\n\nCandidates:\n",
    ]
    for position, entry in candidates.iterrows():
        sections.append(
            f"{position+1}. Name: {entry['name']}\n"
            f"   Test Types: {entry['test_type']}\n"
            f"   Remote Testing: {entry['remote_testing']}\n"
            f"   Adaptive IRT: {entry['adaptive_irt']}\n"
            f"   Duration (minutes): {entry.get('duration_minutes', 'N/A')}\n\n"
        )
    sections.append(
        "Rank the candidates from most relevant to least relevant using indices only."
    )
    prompt = "".join(sections)

    generated = llm(prompt, do_sample=False)[0]["generated_text"]

    ranked = []
    for token in re.findall(r"\d+", generated):
        candidate_pos = int(token) - 1
        out_of_range = not (0 <= candidate_pos < len(candidates))
        if out_of_range or candidate_pos in ranked:
            continue
        ranked.append(candidate_pos)

    return ranked

def recommend(query: str, top_k: int = 5):
    """Return up to ``top_k`` recommended assessments for ``query``.

    Ten candidates are retrieved and re-ordered by ``rerank``. Candidates the
    LLM did not mention are appended in retrieval order, so a partial or
    empty ranking still yields ``top_k`` results when enough candidates exist.

    Args:
        query: User query text.
        top_k: Maximum number of recommendations to return.

    Returns:
        List of dicts with keys ``url``, ``name``, ``adaptive_support``,
        ``description``, ``duration``, ``remote_support`` and ``test_type``.
    """
    candidates = retrieve_candidates(query, k=10)
    order = rerank(query, candidates)

    # Keep the LLM's order first, then fall back to retrieval order for the
    # candidates it left out (all of them when `order` is empty).
    remaining = [i for i in range(len(candidates)) if i not in order]
    candidates = candidates.iloc[list(order) + remaining].head(top_k)

    results = []
    for _, row in candidates.iterrows():
        description = row.get("description", "")
        duration = row.get("duration_minutes")
        results.append({
            "url": row["url"],
            "name": row["name"],
            # Compare with 1 explicitly: a missing flag (NaN) is truthy and
            # would otherwise be reported as "Yes".
            "adaptive_support": "Yes" if row["adaptive_irt"] == 1 else "No",
            # A missing description may be NaN rather than a string.
            "description": description if isinstance(description, str) else "",
            "duration": int(duration) if not pd.isna(duration) else None,
            "remote_support": "Yes" if row["remote_testing"] == 1 else "No",
            "test_type": (
                row["test_type"].split(",")
                if isinstance(row["test_type"], str)
                else []
            )
        })

    return results

@app.get("/health")
def health_check():
    """Liveness endpoint; always answers ``{"status": "healthy"}``."""
    return dict(status="healthy")

@app.post("/recommend")
def recommend_endpoint(req: QueryRequest):
    """Handle ``POST /recommend``.

    Raises:
        HTTPException: 400 when the query is empty or only whitespace,
            404 when ``recommend`` returns no results.
    """
    has_text = bool(req.query and req.query.strip())
    if not has_text:
        raise HTTPException(status_code=400, detail="Query field is required")

    recommendations = recommend(req.query)
    if recommendations:
        return {"recommended_assessments": recommendations}

    raise HTTPException(status_code=404, detail="No recommendations found")


Loading models and data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Startup complete.


In [6]:
!pip install fastapi uvicorn
!uvicorn app:app --host 0.0.0.0 --port 8000


[32mINFO[0m:     Started server process [[36m1533[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Shutting down
[32mINFO[0m:     Finished server process [[36m1533[0m]
[31mERROR[0m:    Traceback (most recent call last):
  File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/base_events.py", line 678, in run_until_complete
    self.run_forever()
  File "/usr/lib/python3.12/asyncio/base_events.py", line 645, in run_forever
    self._run_once()
  File "/usr/lib/python3.12/asyncio/base_events.py", line 1999, in _run_once
    handle._run()
  File "/usr/lib/pytho

In [None]:
# List the working directory to see which artifacts and upload copies exist.
!ls


 app.py			       'shl_embeddings (2).npy'
 faiss_index.bin	       'shl_embeddings (3).npy'
'Gen_AI Dataset (2) (1).xlsx'   shl_embeddings.npy
'Gen_AI Dataset (2) (2).xlsx'  'shl_metadata (1).csv'
'Gen_AI Dataset (2) (3).xlsx'  'shl_metadata (2).csv'
'Gen_AI Dataset (2) (4).xlsx'  'shl_metadata (3).csv'
'Gen_AI Dataset (2).xlsx'      'shl_metadata (4).csv'
 sample_data		        shl_metadata.csv
'shl_embeddings (1).npy'


In [5]:
%%writefile app.py
from fastapi import FastAPI

app = FastAPI()


@app.get("/health")
def health():
    """Liveness endpoint; always answers ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload


Writing app.py


In [9]:
!curl http://127.0.0.1:8000


curl: (7) Failed to connect to 127.0.0.1 port 8000 after 0 ms: Connection refused


In [8]:
!curl http://127.0.0.1:8000


!curl http://127.0.0.1:8000
curl: (7) Failed to connect to 127.0.0.1 port 8000 after 0 ms: Connection refused
