<a href="https://colab.research.google.com/github/rani227/rag-based-recommendation/blob/main/shl_recommendation_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q google-generativeai pandas sentence-transformers scikit-learn faiss-cpu rapidfuzz python-dotenv beautifulsoup4 requests pydantic

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m124.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import json
import os
from typing import List, Optional, Dict, Union
from urllib.parse import urlparse

import faiss
import google.generativeai as genai
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pydantic import BaseModel, Field, ValidationError, HttpUrl, model_validator
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

In [None]:
class DurationFilter(BaseModel):
    """Optional lower and upper bounds, in minutes, on an assessment's duration."""

    # Either bound may be omitted; an omitted bound is stored as None.
    min: Optional[int] = Field(default=None)
    max: Optional[int] = Field(default=None)

In [None]:
class ParsedQuery(BaseModel):
    """Structured filters extracted from a free-text hiring query."""

    # Skill lists default to fresh empty lists.
    skills: List[str] = Field(default_factory=list)
    soft_skills: List[str] = Field(default_factory=list)
    # None on any field below means the query did not state that requirement.
    duration_minutes: Optional[DurationFilter] = Field(default=None)
    adaptive_required: Optional[bool] = Field(default=None)
    remote_required: Optional[bool] = Field(default=None)

In [None]:
class AssessmentCatalogEntry(BaseModel):
    """One validated row of the assessment catalog. Every field is required."""

    name: str = Field(...)
    url: HttpUrl = Field(...)
    description: str = Field(...)
    test_type: str = Field(...)
    duration_minutes: int = Field(...)
    remote_support: bool = Field(...)
    adaptive_support: bool = Field(...)
    # Generated internally: the text that is embedded for semantic search.
    embedding_input: str = Field(...)

In [None]:
class RecommendedAssessment(BaseModel):
    """Public output record describing one recommended assessment."""

    assessment_name: str = Field(...)
    assessment_url: HttpUrl = Field(...)
    # "Yes" or "No"
    remote_testing_support: str = Field(...)
    # "Yes" or "No"
    adaptive_irt_support: str = Field(...)
    # e.g., "30 minutes"
    duration: str = Field(...)
    test_type: str = Field(...)
    # Explanation of why this assessment was recommended (for traceability).
    relevance_reason: str = Field(...)

In [None]:
class RecommendationInput(BaseModel):
    """
    Input to the recommendation pipeline.

    Attributes:
        query: Free-text description of the hiring need.
        job_description_url: URL of a job description to fetch and use as the query text.
        num_recommendations: Maximum number of results to return (1-10, default 10).

    Raises:
        ValidationError: if neither `query` nor `job_description_url` is provided,
            or if `num_recommendations` is outside 1-10.
    """
    query: Optional[str] = None
    job_description_url: Optional[HttpUrl] = None
    num_recommendations: int = Field(default=10, ge=1, le=10)

    # Ensure at least one of query or job_description_url is provided.
    # This must be a registered model validator: a method merely *named*
    # `__pydantic_validator__` is overwritten by pydantic's own schema validator
    # when the class is built, so it would never be invoked.
    @model_validator(mode='after')
    def _require_query_or_url(self):
        if not self.query and not self.job_description_url:
            raise ValueError("Either 'query' or 'job_description_url' must be provided.")
        return self

In [None]:
# Load the assessment catalog. Every later cell depends on `data_df`, so fail
# fast with an actionable message instead of printing and continuing (which
# would only surface later as an unrelated NameError).
try:
    data_df = pd.read_csv('shl_dataset.csv')
except FileNotFoundError as exc:
    raise FileNotFoundError(
        "Error: 'shl_dataset.csv' not found. "
        "Place the catalog CSV in the working directory and re-run this cell."
    ) from exc

In [None]:
def _yes_no_to_bool(value) -> bool:
    """
    Map a catalog 'Yes'/'No' cell to a bool.

    Values that are already booleans pass through unchanged, so re-running this
    cell is safe. Anything that is not a (case-insensitive, whitespace-tolerant)
    'yes' -- including missing values -- maps to False instead of raising.
    """
    if isinstance(value, (bool, np.bool_)):
        return bool(value)
    return str(value).strip().lower() == 'yes'


# Normalise both support-flag columns to real booleans.
for _flag_column in ('remote_support', 'adaptive_support'):
    data_df[_flag_column] = data_df[_flag_column].apply(_yes_no_to_bool)

In [None]:
def _compose_embedding_text(row) -> str:
    """Render one catalog row as the single sentence-like string that gets embedded."""
    return (
        f"{row['name']}. {row['description']}. Type: {row['test_type']}. "
        f"Duration: {row['duration_minutes']} mins. "
        f"Remote Support: {row['remote_support']}. "
        f"Adaptive Support: {row['adaptive_support']}."
    )


# One embedding text per catalog row, stored alongside the source columns.
data_df["embedding_input"] = data_df.apply(_compose_embedding_text, axis=1)

In [None]:
# Validate every DataFrame row by passing it through the Pydantic catalog model.
catalog_entries = [
    AssessmentCatalogEntry(**record.to_dict())
    for _, record in data_df.iterrows()
]
# Position -> entry lookup. Embeddings are later built from `catalog_entries`
# in this same order, so a FAISS result position can be mapped back to its
# full catalog record.
catalog_map = dict(enumerate(catalog_entries))

In [None]:
# Sentence-embedding model used to encode both the catalog entries and user queries.
embedding_model = SentenceTransformer("all-mpnet-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Encode each catalog entry's `embedding_input` text into a dense vector (one row per entry).
embeddings = embedding_model.encode([entry.embedding_input for entry in catalog_entries], show_progress_bar=True)
# L2-normalise every row so that the inner-product search used below behaves as cosine similarity.
normalized_embeddings = normalize(embeddings, axis=1, norm='l2')

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Build an exact (flat) inner-product index over the normalised catalog embeddings.
print("Creating FAISS index...")
# Vector width is taken from the embedding matrix's second dimension.
embedding_dim = normalized_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(embedding_dim)
# Rows are added in `catalog_entries` order, which is what `catalog_map` relies on.
faiss_index.add(normalized_embeddings)
print("FAISS index created and populated.")

Creating FAISS index...
FAISS index created and populated.


In [None]:
# Resolve the Gemini API key. Colab secrets are preferred when running in Colab;
# otherwise fall back to the GOOGLE_API_KEY environment variable (optionally
# loaded from a local .env file) so the notebook also runs outside Colab.
try:
    from google.colab import userdata
except ImportError:
    userdata = None  # Not running inside Google Colab.

gemini_api_key = None
if userdata is not None:
    try:
        gemini_api_key = userdata.get('GOOGLE_API_KEY')
    except Exception as exc:  # secret missing or notebook access not granted
        print(f"Could not read GOOGLE_API_KEY from Colab secrets: {exc}")

if not gemini_api_key:
    load_dotenv()  # No-op when there is no .env file.
    gemini_api_key = os.getenv('GOOGLE_API_KEY')

if not gemini_api_key:
    raise ValueError(
        "GOOGLE_API_KEY not found. Please set it in your Colab secrets "
        "(or as the GOOGLE_API_KEY environment variable / .env entry)."
    )

genai.configure(api_key=gemini_api_key)

In [None]:
# NOTE(review): redundant cell -- the preceding cell already calls
# genai.configure() with this same key; this cell can be deleted.
genai.configure(api_key=gemini_api_key)

In [None]:
def parse_query_with_llm_gemini(query: str) -> ParsedQuery:
    """
    Uses Gemini LLM to parse a natural language query into structured filters.

    Args:
        query: Free-text hiring query or job-description text.

    Returns:
        A ParsedQuery built from the model's JSON reply. An empty ParsedQuery()
        is returned when the query is blank, when the API call fails, or when
        the reply is not a JSON object that validates against ParsedQuery.
        This function never raises.
    """
    if not query or not query.strip():
        # Nothing to extract; avoid a pointless API round-trip.
        return ParsedQuery()

    few_shot_examples = """
Q: I'm looking for a Java test that assesses both technical and soft skills under 40 minutes.
A:
```json
{
  "skills": ["java"],
  "soft_skills": ["communication", "teamwork"],
  "duration_minutes": { "max": 40 },
  "adaptive_required": false,
  "remote_required": false
}
```

Q: Need an adaptive test for Python developers that works well for remote hiring.
A:
```json
{
  "skills": ["python"],
  "soft_skills": [],
  "duration_minutes": null,
  "adaptive_required": true,
  "remote_required": true
}
```

Q: I need a test for entry-level sales roles, focusing on communication, and it must support remote testing. Duration should be around 25 minutes.
A:
```json
{
  "skills": ["sales"],
  "soft_skills": ["communication"],
  "duration_minutes": { "max": 30, "min": 20 },
  "adaptive_required": false,
  "remote_required": true
}
```

Q: Find me a test for senior managers, adaptive support is a must.
A:
```json
{
  "skills": ["management", "leadership"],
  "soft_skills": [],
  "duration_minutes": null,
  "adaptive_required": true,
  "remote_required": null
}
```

Q: """ + query + """
A:
"""
    model = genai.GenerativeModel('gemini-1.5-flash') # Using gemini-1.5-flash for speed and cost-effectiveness

    prompt_content = f"""
You are an expert at extracting structured filters from hiring assessment queries.
Extract the relevant information as a JSON object.
Ensure the JSON is always valid and complete, even if some fields are null or empty lists.
For duration_minutes, if a specific duration is mentioned, try to infer a reasonable min/max range if not explicitly stated, otherwise use null.
For 'remote_required' and 'adaptive_required', infer 'true' or 'false' if explicitly mentioned, otherwise use 'null'.

{few_shot_examples}
"""
    content = ""  # Pre-bound so the error handlers can always print it.
    try:
        response = model.generate_content(
            prompt_content,
            generation_config=genai.types.GenerationConfig(
                temperature=0,  # Keep temperature low for structured output
                response_mime_type='application/json' # Explicitly request JSON output
            )
        )
        content = response.text.strip()

        # The few-shot examples use Markdown code fences, so the model may
        # mirror them; unwrap a fenced reply before decoding.
        if content.startswith("```"):
            content = content.strip("`").strip()
            if content.lower().startswith("json"):
                content = content[len("json"):].strip()

        parsed_dict = json.loads(content)
        if not isinstance(parsed_dict, dict):
            # Valid JSON but not an object (e.g. a list): cannot map onto ParsedQuery.
            print(
                "LLM response could not be parsed or validated as JSON: "
                f"expected a JSON object, got {type(parsed_dict).__name__}"
            )
            print(f"Raw LLM content: {content}")
            return ParsedQuery()
        return ParsedQuery(**parsed_dict) # Validate with Pydantic
    except (json.JSONDecodeError, ValidationError) as e:
        print(f"LLM response could not be parsed or validated as JSON: {e}")
        print(f"Raw LLM content: {content}")
        return ParsedQuery() # Return empty ParsedQuery on failure
    except Exception as e:
        # Deliberate best-effort: any API/transport failure degrades to "no filters".
        print(f"An error occurred during Gemini API call: {e}")
        return ParsedQuery() # Return empty ParsedQuery on failure

In [None]:
def is_valid_url(url: str) -> bool:
    """
    Checks if a string is a valid http(s) URL.

    Args:
        url: Candidate URL. Non-string values are treated as invalid rather
            than raising.

    Returns:
        True only when the value parses with an 'http' or 'https' scheme and a
        non-empty network location; False otherwise (including malformed URLs).
    """
    if not isinstance(url, str):
        # urlparse() raises AttributeError on arbitrary non-string objects.
        return False
    try:
        result = urlparse(url)
    except ValueError:
        # e.g. an unbalanced IPv6 bracket in the host part
        return False
    return result.scheme in ('http', 'https') and bool(result.netloc)

In [None]:
def get_text_from_url(url: str) -> str:
    """
    Fetches content from a URL and extracts readable text.

    Script/style content is removed first. Text is then taken from the most
    specific content container available (<article>, then <main>, then <p>),
    falling back to the whole page. Each piece of text is collected once:
    nested containers are not re-read, so the 4000-character budget is not
    consumed by repeated copies of the same content.

    Args:
        url: Absolute http(s) URL to download.

    Returns:
        Whitespace-normalised text, truncated to 4000 characters plus "..."
        when longer. Returns "" on any network or parsing error.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, timeout=15, headers=headers)
        response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)

        soup = BeautifulSoup(response.text, 'html.parser')

        # Drop non-readable content up front; get_text() on a container would
        # otherwise include inline JavaScript and CSS.
        for noise_tag in soup.find_all(['script', 'style', 'noscript']):
            noise_tag.decompose()

        # Try content containers from most to least specific and stop at the
        # first kind that yields text.
        text_parts = []
        for tag_name in ('article', 'main', 'p'):
            for tag in soup.find_all(tag_name):
                if tag.find_parent(tag_name) is not None:
                    continue  # Already covered by its same-named ancestor.
                tag_text = tag.get_text(separator=' ', strip=True)
                # Heuristic: only consider tags with a reasonable amount of text
                if len(tag_text) > 50:
                    text_parts.append(tag_text)
            if text_parts:
                break

        if not text_parts: # Fallback if specific tags don't yield much
            text = soup.get_text(separator=' ', strip=True)
        else:
            text = ' '.join(text_parts)

        # Further clean the text: remove excessive whitespace, newlines
        text = ' '.join(text.split()).strip()

        # Simple truncation for extremely long pages to avoid overwhelming LLM
        if len(text) > 4000: # Limit to first 4000 characters for LLM processing
            text = text[:4000] + "..." # Indicate truncation

        return text

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return ""
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return ""

In [None]:
def retrieve_candidates(query_embedding: np.ndarray, parsed_filters: ParsedQuery, top_k: int = 20) -> List[AssessmentCatalogEntry]:
    """
    Retrieves candidates using FAISS and applies hard filters based on parsed query.

    Args:
        query_embedding: Query vector, shape (dim,) or (1, dim).
        parsed_filters: Structured requirements extracted from the query.
        top_k: Number of nearest neighbours to fetch before filtering.

    Returns:
        Catalog entries that pass every hard filter, in FAISS result order
        (best match first). May be empty.

    Filter semantics:
        * duration: entries outside the [min, max] bounds are dropped.
        * adaptive_required / remote_required: only a value of True filters.
          False or None means "not required", which must not exclude
          assessments that happen to offer the capability.
    """
    if top_k <= 0:
        return []

    # Accept a 1-D vector as well as a (1, dim) matrix; FAISS expects float32.
    query_embedding = np.atleast_2d(np.asarray(query_embedding, dtype=np.float32))
    # Normalize so inner product against the (normalized) index is cosine similarity.
    query_embedding = normalize(query_embedding, axis=1, norm='l2')
    _, indices = faiss_index.search(query_embedding, top_k)

    duration_filter = parsed_filters.duration_minutes
    filtered_entries = []
    for idx in indices[0]:
        # FAISS pads with -1 when fewer than top_k vectors exist; .get() skips those.
        entry = catalog_map.get(int(idx))
        if entry is None:
            continue

        if duration_filter is not None:
            if duration_filter.max is not None and entry.duration_minutes > duration_filter.max:
                continue
            if duration_filter.min is not None and entry.duration_minutes < duration_filter.min:
                continue

        if parsed_filters.adaptive_required and not entry.adaptive_support:
            continue

        if parsed_filters.remote_required and not entry.remote_support:
            continue

        filtered_entries.append(entry)

    return filtered_entries

In [None]:
def generate_reasons_with_llm(
    query_text: str,
    parsed_filters: ParsedQuery,
    candidate_assessments: List[AssessmentCatalogEntry]
) -> List[Dict[str, str]]:
    """
    Uses Gemini LLM to generate relevance reasons for a list of candidate assessments.

    Args:
        query_text: The user's query or extracted job-description text.
        parsed_filters: Structured requirements extracted from the query.
        candidate_assessments: Assessments to be explained.

    Returns:
        A list of dicts, each guaranteed to contain non-empty string values for
        "assessment_name" and "relevance_reason". Malformed items in the model's
        reply are dropped. Returns [] when there are no candidates, when the API
        call fails, or when the reply is not a JSON array. Never raises.
    """
    if not candidate_assessments:
        return []

    assessments_info = []
    for i, ass in enumerate(candidate_assessments):
        assessments_info.append(
            f"Assessment {i+1}:\n"
            f"  Name: {ass.name}\n"
            f"  Description: {ass.description}\n"
            f"  Type: {ass.test_type}\n"
            f"  Duration: {ass.duration_minutes} mins\n"
            f"  Remote Support: {'Yes' if ass.remote_support else 'No'}\n"
            f"  Adaptive Support: {'Yes' if ass.adaptive_support else 'No'}\n"
        )
    assessments_str = "\n\n".join(assessments_info)

    # Include extracted skills in the prompt for better reasoning
    skills_str = ""
    if parsed_filters.skills:
        skills_str += f"Technical skills mentioned: {', '.join(parsed_filters.skills)}.\n"
    if parsed_filters.soft_skills:
        skills_str += f"Soft skills mentioned: {', '.join(parsed_filters.soft_skills)}.\n"

    # model_dump() is used (not the deprecated .dict()) to match the rest of the notebook.
    prompt = f"""
Given the user's query/job description:
"{query_text}"

And the extracted key requirements:
{json.dumps(parsed_filters.model_dump(), indent=2)}

{skills_str}

Here are some potentially relevant SHL assessments:
{assessments_str}

Your task is to review these assessments and for each one, provide a concise reason (1-2 sentences) explaining its relevance to the user's query, specifically highlighting how it addresses the mentioned skills or requirements.
If an assessment is not relevant, do not include it in the output.
Output a JSON array where each object has "assessment_name" and "relevance_reason".
Ensure the JSON is always valid.

Example Output Format:
```json
[
  {{
    "assessment_name": "Assessment Name 1",
    "relevance_reason": "Reason for relevance 1."
  }},
  {{
    "assessment_name": "Assessment Name 2",
    "relevance_reason": "Reason for relevance 2."
  }}
]
```
"""
    model = genai.GenerativeModel('gemini-1.5-flash') # Or 'gemini-1.5-pro' for higher quality reasons

    content = ""  # Pre-bound so the error handlers can always print it.
    try:
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                temperature=0.2, # A bit higher temperature for more natural language reasons
                response_mime_type='application/json'
            )
        )
        content = response.text.strip()

        # The prompt's example is fenced, so the model may mirror the fences.
        if content.startswith("```"):
            content = content.strip("`").strip()
            if content.lower().startswith("json"):
                content = content[len("json"):].strip()

        reasons_list = json.loads(content)
        if not isinstance(reasons_list, list):
            print(
                "LLM response for reasons could not be parsed or validated as JSON: "
                f"expected a JSON array, got {type(reasons_list).__name__}"
            )
            print(f"Raw LLM content: {content}")
            return []

        # Keep only well-formed items so callers can index both keys safely.
        valid_reasons = []
        for item in reasons_list:
            if not isinstance(item, dict):
                continue
            name = item.get("assessment_name")
            reason = item.get("relevance_reason")
            if isinstance(name, str) and name and isinstance(reason, str) and reason:
                valid_reasons.append({"assessment_name": name, "relevance_reason": reason})
        return valid_reasons
    except json.JSONDecodeError as e:
        print(f"LLM response for reasons could not be parsed or validated as JSON: {e}")
        print(f"Raw LLM content: {content}")
        return []
    except Exception as e:
        # Deliberate best-effort: reasons are optional enrichment, not required output.
        print(f"An error occurred during Gemini API call for reasons: {e}")
        return []

In [None]:
def _to_recommended_assessment(entry: AssessmentCatalogEntry, reason: str) -> RecommendedAssessment:
    """Convert a catalog entry plus its relevance reason into the public output model."""
    return RecommendedAssessment(
        assessment_name=entry.name,
        assessment_url=entry.url,
        remote_testing_support="Yes" if entry.remote_support else "No",
        adaptive_irt_support="Yes" if entry.adaptive_support else "No",
        duration=f"{entry.duration_minutes} minutes",
        test_type=entry.test_type,
        relevance_reason=reason
    )


def _closest_semantic_entry(query_embedding: np.ndarray) -> Optional[AssessmentCatalogEntry]:
    """Return the single nearest catalog entry for the query, ignoring all hard filters."""
    _, indices = faiss_index.search(normalize(query_embedding, axis=1), 1)
    if indices.size == 0:
        return None
    # An empty index yields -1, which is not a key of catalog_map -> None.
    return catalog_map.get(int(indices[0][0]))


def get_recommendations(input_data: RecommendationInput) -> List[RecommendedAssessment]:
    """
    Main function to get assessment recommendations.

    Pipeline: validate input -> resolve query text (URL text preferred, plain
    query as fallback) -> extract filters with the LLM -> embed -> retrieve and
    hard-filter candidates -> ask the LLM for relevance reasons -> assemble output.

    Args:
        input_data: A RecommendationInput object.

    Returns:
        At most `num_recommendations` RecommendedAssessment objects, best
        semantic match first. When filtering removes every candidate, the single
        closest semantic match is returned instead. An empty list is returned
        only when the input is invalid, no query text can be derived, or the
        catalog is empty.
    """
    # Validate input using Pydantic model
    try:
        validated_input = RecommendationInput(**input_data.model_dump())
    except ValidationError as e:
        print(f"Input validation error: {e.errors()}")
        return [] # Return empty list on validation failure

    query_text = validated_input.query
    if validated_input.job_description_url:
        print(f"Fetching text from URL: {validated_input.job_description_url}")
        extracted_text = get_text_from_url(str(validated_input.job_description_url)) # Convert HttpUrl to str
        if extracted_text:
            query_text = extracted_text
        elif query_text:
            # The URL could not be read, but the caller also supplied a query: use it.
            print("Could not extract sufficient text from the provided URL. Falling back to the provided query text.")
        else:
            print("Could not extract sufficient text from the provided URL.")
            return [] # Return empty list if URL text extraction fails

    if not query_text:
        print("No valid query text could be derived from the input.")
        return []

    print(f"\nProcessing query: {query_text[:100]}...") # Print first 100 chars for brevity

    # Step 1: Parse query with LLM to extract structured filters
    parsed_query = parse_query_with_llm_gemini(query_text)
    print(f"✅ Parsed Filters: {json.dumps(parsed_query.model_dump(), indent=2)}")

    # Step 2: Embed the query text
    query_embedding = embedding_model.encode([query_text])

    # Step 3: Retrieve candidates using FAISS and apply hard filters
    # Retrieve more candidates than needed so filtering still leaves enough results
    limit = validated_input.num_recommendations
    candidates = retrieve_candidates(query_embedding, parsed_query, top_k=limit * 3)

    reasons_map: Dict[str, str] = {}
    final_ranked_candidates: List[AssessmentCatalogEntry] = []

    if not candidates:
        print("⚠️ No matching assessments found after filtering. Attempting fallback to top semantic match.\n")
        fallback_entry = _closest_semantic_entry(query_embedding)
        if fallback_entry:
            reasons_map[fallback_entry.name] = "This is the closest semantic match found in the catalog based on your query, even if it didn't meet all specific filters."
            final_ranked_candidates = [fallback_entry]
    else:
        # Step 4: LLM-based reason generation
        print(f"Generating reasons for {len(candidates)} candidates...")
        reasons_list = generate_reasons_with_llm(query_text, parsed_query, candidates)

        # Tolerate malformed items rather than failing on a missing key.
        for item in reasons_list if isinstance(reasons_list, list) else []:
            if isinstance(item, dict) and item.get('assessment_name') and item.get('relevance_reason'):
                reasons_map[str(item['assessment_name'])] = str(item['relevance_reason'])

        # `candidates` is already ordered by descending semantic score (FAISS
        # result order), so keeping that order while selecting the entries the
        # LLM explained gives the score-sorted ranking without searching again.
        final_ranked_candidates = [cand for cand in candidates if cand.name in reasons_map][:limit]

        # If LLM didn't return enough reasons, fill with top semantic matches that passed hard filters
        if len(final_ranked_candidates) < limit:
            for cand in candidates:
                if len(final_ranked_candidates) >= limit:
                    break
                if cand.name not in reasons_map:
                    # Add a generic reason for these if LLM didn't process them
                    reasons_map[cand.name] = "This assessment is semantically relevant and meets your hard filters."
                    final_ranked_candidates.append(cand)

    output_data: List[RecommendedAssessment] = []
    for ass_entry in final_ranked_candidates:
        reason = reasons_map.get(ass_entry.name, "No specific reason generated, but deemed relevant by the system.")
        try:
            output_data.append(_to_recommended_assessment(ass_entry, reason))
        except ValidationError as e:
            print(f"Validation error for RecommendedAssessment: {e.errors()} for entry: {ass_entry.name}")
            continue # Skip this entry if it fails validation

    # Ensure min 1 recommendation if catalog is not empty
    if not output_data:
        if catalog_entries:
            print("Final fallback: No recommendations generated. Returning a general assessment.")
            # Pick the first entry as a last resort
            output_data.append(_to_recommended_assessment(
                catalog_entries[0],
                "This is a general assessment provided as a fallback."
            ))
        else:
            print("Error: Product catalog is empty. Cannot provide recommendations.")

    return output_data

In [None]:
# Smoke test: run the full pipeline on a free-text query and print the results as JSON.
print("\n--- Testing with Natural Language Query (Java, communication, adaptive, <30min) ---")
input1 = RecommendationInput(
    query="I need an assessment for advanced Java developers that also evaluates their ability to communicate and work in teams. Preferably under 30 minutes and adaptive.",
    num_recommendations=5
)
recommendations1 = get_recommendations(input1)
print("\nRecommended Assessments (Query 1):")
# default=str makes the non-JSON-native URL values serialisable.
print(json.dumps([r.model_dump() for r in recommendations1], indent=2, default=str))


--- Testing with Natural Language Query (Java, communication, adaptive, <30min) ---

Processing query: I need an assessment for advanced Java developers that also evaluates their ability to communicate a...
✅ Parsed Filters: {
  "skills": [
    "java"
  ],
  "soft_skills": [
    "communication",
    "teamwork"
  ],
  "duration_minutes": {
    "min": null,
    "max": 30
  },
  "adaptive_required": true,
  "remote_required": null
}
⚠️ No matching assessments found after filtering. Attempting fallback to top semantic match.


Recommended Assessments (Query 1):
[
  {
    "assessment_name": "Core Java (Entry Level) (New)",
    "assessment_url": "https://www.shl.com/solutions/products/product-catalog/view/core-java-entry-level-new/",
    "remote_testing_support": "Yes",
    "adaptive_irt_support": "No",
    "duration": "15 minutes",
    "test_type": "Knowledge & Skills",
    "relevance_reason": "This is the closest semantic match found in the catalog based on your query, even if it didn't m