In [5]:
!pip install requests beautifulsoup4 tqdm




In [6]:
import os

os.makedirs("scraper", exist_ok=True)
os.makedirs("data", exist_ok=True)


In [7]:
import pandas as pd

df = pd.read_excel("Gen_AI Dataset.xlsx")

assessment_urls = df["Assessment_url"].unique()
print(len(assessment_urls))


54


In [8]:
import pandas as pd
import json
import os
from urllib.parse import urlparse

DATASET_PATH = "Gen_AI Dataset.xlsx"
OUTPUT_PATH = "data/catalog.json"

os.makedirs("data", exist_ok=True)

def infer_name_from_url(url):
    path = urlparse(url).path
    slug = path.split("/")[-1]
    if not slug or len(slug) < 3:
        slug = path.split("/")[-2]
    return slug.replace("-", " ").replace("_", " ").title()

def infer_test_type(name):
    name = name.lower()
    if any(k in name for k in [
        "personality", "behavior", "behaviour",
        "motivation", "work style", "values"
    ]):
        return "Personality & Behavior"
    return "Knowledge & Skills"

def generate_description(name, test_type):
    if test_type == "Knowledge & Skills":
        return f"The {name} assessment evaluates technical and cognitive skills."
    else:
        return f"The {name} assessment evaluates behavioral and personality traits."

df = pd.read_excel(DATASET_PATH)
url_col = [c for c in df.columns if "url" in c.lower()][0]

urls = sorted(df[url_col].dropna().unique())

catalog = []
for url in urls:
    name = infer_name_from_url(url)
    test_type = infer_test_type(name)
    catalog.append({
        "name": name,
        "url": url,
        "description": generate_description(name, test_type),
        "test_type": test_type
    })

with open(OUTPUT_PATH, "w") as f:
    json.dump(catalog, f, indent=2)

print("Catalog rebuilt with", len(catalog), "assessments")
print(catalog[:3])


Catalog rebuilt with 54 assessments
[{'name': 'Business Communication Adaptive', 'url': 'https://www.shl.com/products/product-catalog/view/business-communication-adaptive/', 'description': 'The Business Communication Adaptive assessment evaluates technical and cognitive skills.', 'test_type': 'Knowledge & Skills'}, {'name': 'English Comprehension New', 'url': 'https://www.shl.com/products/product-catalog/view/english-comprehension-new/', 'description': 'The English Comprehension New assessment evaluates technical and cognitive skills.', 'test_type': 'Knowledge & Skills'}, {'name': 'Enterprise Leadership Report 2 0', 'url': 'https://www.shl.com/products/product-catalog/view/enterprise-leadership-report-2-0/', 'description': 'The Enterprise Leadership Report 2 0 assessment evaluates technical and cognitive skills.', 'test_type': 'Knowledge & Skills'}]


In [9]:
import json

with open("data/catalog.json") as f:
    catalog = json.load(f)

print("Total assessments:", len(catalog))
print(catalog[0])


Total assessments: 54
{'name': 'Business Communication Adaptive', 'url': 'https://www.shl.com/products/product-catalog/view/business-communication-adaptive/', 'description': 'The Business Communication Adaptive assessment evaluates technical and cognitive skills.', 'test_type': 'Knowledge & Skills'}


In [10]:
documents = []
metadatas = []

for a in catalog:
    text = f"""
    Assessment Name: {a['name']}
    Description: {a['description']}
    Test Type: {a['test_type']}
    """
    documents.append(text.strip())
    metadatas.append(a)

len(documents)


54

In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = model.encode(
    documents,
    show_progress_bar=True,
    convert_to_numpy=True
)

print(embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

(54, 384)


In [12]:
!pip install -q faiss-cpu
import faiss

dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print("Vectors in index:", index.ntotal)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hVectors in index: 54


In [13]:
def retrieve(query, k=5):
    q_emb = model.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, k)
    return [metadatas[i] for i in I[0]]


In [14]:
query = "Hiring a Python developer with analytical and problem solving skills"
results = retrieve(query, k=5)

for r in results:
    print("-", r["name"], "|", r["test_type"])


- Python New | Knowledge & Skills
- Professional 7 1 Solution | Knowledge & Skills
- Professional 7 1 Solution | Knowledge & Skills
- Search Engine Optimization New | Knowledge & Skills
- Technical Sales Associate Solution | Knowledge & Skills


In [15]:
def build_rag_prompt(query, candidates):
    text = f"Job requirement:\n{query}\n\nAvailable assessments:\n"
    for i, c in enumerate(candidates, 1):
        text += (
            f"{i}. Name: {c['name']}\n"
            f"   Type: {c['test_type']}\n"
            f"   Description: {c['description']}\n\n"
        )

    text += """
Task:
- Select the most relevant 5 to 10 assessments
- Avoid duplicates
- Balance Knowledge & Skills and Personality & Behavior assessments
- Output JSON list with fields: name, url, test_type, justification
"""
    return text


In [16]:
!pip install -q google-generativeai


In [17]:
def rag_recommend_no_llm(query, k=20):
    retrieved = retrieve(query, k)

    seen = set()
    results = []
    has_p = False
    has_k = False

    for r in retrieved:
        if r["name"] in seen:
            continue
        seen.add(r["name"])

        if r["test_type"] == "Personality & Behavior":
            has_p = True
        if r["test_type"] == "Knowledge & Skills":
            has_k = True

        results.append({
            "name": r["name"],
            "url": r["url"],
            "test_type": r["test_type"],
            "justification": "Relevant based on semantic similarity to job requirements"
        })

        if len(results) >= 5 and has_p and has_k:
            break

    return results


In [19]:
query = "Hiring a Python developer who can also collaborate with teams"
print(rag_recommend_no_llm(query))


[{'name': 'Python New', 'url': 'https://www.shl.com/solutions/products/product-catalog/view/python-new/', 'test_type': 'Knowledge & Skills', 'justification': 'Relevant based on semantic similarity to job requirements'}, {'name': 'Opq Team Types And Leadership Styles Report', 'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-team-types-and-leadership-styles-report', 'test_type': 'Knowledge & Skills', 'justification': 'Relevant based on semantic similarity to job requirements'}, {'name': 'Professional 7 1 Solution', 'url': 'https://www.shl.com/products/product-catalog/view/professional-7-1-solution/', 'test_type': 'Knowledge & Skills', 'justification': 'Relevant based on semantic similarity to job requirements'}, {'name': 'Sales Representative Solution', 'url': 'https://www.shl.com/solutions/products/product-catalog/view/sales-representative-solution/', 'test_type': 'Knowledge & Skills', 'justification': 'Relevant based on semantic similarity to job requirements'}, 

In [20]:
import pandas as pd

xls = pd.ExcelFile("Gen_AI Dataset.xlsx")
print(xls.sheet_names)


['Train-Set', 'Test-Set']


In [21]:
import pandas as pd

test_df = pd.read_excel("Gen_AI Dataset.xlsx", sheet_name="Test-Set")
test_df.head()


Unnamed: 0,Query
0,Looking to hire mid-level professionals who ar...
1,Job Description\n\n Join a community that is s...
2,I am hiring for an analyst and wants applicati...
3,I have a JD Job Description\n\n People Science...
4,I am new looking for new graduates in my sales...


In [22]:
test_df.columns = [c.strip().lower() for c in test_df.columns]


In [23]:
print(test_df.columns)


Index(['query'], dtype='object')


In [24]:
def recommend_urls(query, k=10):
    """
    Returns top-k assessment URLs for a query
    using your NO-API RAG recommender
    """
    results = rag_recommend_no_llm(query, k=20)
    return [r["url"] for r in results][:k]


In [25]:
recommend_urls("Hiring a Python developer with teamwork skills")


['https://www.shl.com/solutions/products/product-catalog/view/python-new/',
 'https://www.shl.com/products/product-catalog/view/professional-7-1-solution/',
 'https://www.shl.com/solutions/products/product-catalog/view/professional-7-0-solution-3958/',
 'https://www.shl.com/solutions/products/product-catalog/view/automata-fix-new/',
 'https://www.shl.com/solutions/products/product-catalog/view/sales-representative-solution/',
 'https://www.shl.com/solutions/products/product-catalog/view/global-skills-assessment/',
 'https://www.shl.com/solutions/products/product-catalog/view/technical-sales-associate-solution/',
 'https://www.shl.com/solutions/products/product-catalog/view/opq-team-types-and-leadership-styles-report',
 'https://www.shl.com/solutions/products/product-catalog/view/tableau-new/',
 'https://www.shl.com/solutions/products/product-catalog/view/writex-email-writing-sales-new/']

In [26]:
rows = []

for query in test_df["query"]:
    urls = recommend_urls(query, k=10)
    for url in urls:
        rows.append({
            "Query": query,
            "Assessment_url": url
        })

import pandas as pd
submission_df = pd.DataFrame(rows)
submission_df.to_csv("submission.csv", index=False)

print("submission.csv created")
submission_df.head(10)


submission.csv created


Unnamed: 0,Query,Assessment_url
0,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
1,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
2,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
3,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
4,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
5,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
6,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
7,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
8,Looking to hire mid-level professionals who ar...,https://www.shl.com/products/product-catalog/v...
9,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...


In [28]:
import pandas as pd
import os

os.makedirs("data", exist_ok=True)

train_df = pd.read_excel("Gen_AI Dataset.xlsx", sheet_name="Train-Set")
test_df  = pd.read_excel("Gen_AI Dataset.xlsx", sheet_name="Test-Set")

print(train_df.head())
print(test_df.head())


                                               Query  \
0  I am hiring for Java developers who can also c...   
1  I am hiring for Java developers who can also c...   
2  I am hiring for Java developers who can also c...   
3  I am hiring for Java developers who can also c...   
4  I am hiring for Java developers who can also c...   

                                      Assessment_url  
0  https://www.shl.com/solutions/products/product...  
1  https://www.shl.com/solutions/products/product...  
2  https://www.shl.com/solutions/products/product...  
3  https://www.shl.com/solutions/products/product...  
4  https://www.shl.com/products/product-catalog/v...  
                                               Query
0  Looking to hire mid-level professionals who ar...
1  Job Description\n\n Join a community that is s...
2  I am hiring for an analyst and wants applicati...
3  I have a JD Job Description\n\n People Science...
4  I am new looking for new graduates in my sales...


In [29]:
train_df.columns = [c.strip().lower() for c in train_df.columns]
test_df.columns  = [c.strip().lower() for c in test_df.columns]

print(train_df.columns)
print(test_df.columns)


Index(['query', 'assessment_url'], dtype='object')
Index(['query'], dtype='object')


In [30]:
train_df.to_csv("data/train.csv", index=False)
test_df.to_csv("data/test.csv", index=False)

print("Files created:")
print(os.listdir("data"))


Files created:
['catalog.json', 'test.csv', 'train.csv']


In [31]:
from google.colab import files

files.download("data/train.csv")
files.download("data/test.csv")
files.download("data/submission.csv")  # also download this


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

FileNotFoundError: Cannot find file: data/submission.csv