In [None]:
!pip install pandas numpy tqdm faker
!pip install sentence-transformers faiss-cpu
!pip install transformers torch
!pip install scikit-learn xgboost
!pip install pygtrie

Collecting faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.2-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.4.2
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64

In [None]:
import pygtrie
import pandas as pd

# Load queries
queries_df = pd.read_csv("user_queries.csv")

# Build Trie
trie = pygtrie.CharTrie()
for _, row in queries_df.iterrows():
    trie[row['corrected_query'].lower()] = {
        'frequency': row['frequency'],
        'clicked_products': row['clicked_product_ids']
    }

def get_prefix_suggestions(prefix, k=5):
    prefix = prefix.lower()
    suggestions = [
        (key, val['frequency']) for key, val in trie.items(prefix)
    ]
    suggestions = sorted(suggestions, key=lambda x: -x[1])
    return [s[0] for s in suggestions[:k]]


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Unique corrected queries
query_texts = queries_df['corrected_query'].unique().tolist()

# Embed using SBERT
model = SentenceTransformer('all-MiniLM-L6-v2')
query_embeddings = model.encode(query_texts)

# Build FAISS index
index = faiss.IndexFlatL2(query_embeddings.shape[1])
index.add(np.array(query_embeddings))

def get_semantic_suggestions(query, k=3):
    query_emb = model.encode([query])
    _, idxs = index.search(np.array(query_emb), k)
    return [query_texts[i] for i in idxs[0]]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="distilbert-base-uncased")

def get_bert_completion(prefix, k=3):
    masked_input = prefix + " [MASK]"
    results = fill_mask(masked_input, top_k=k)
    return [r['sequence'].replace('[CLS] ', '').replace(' [SEP]', '') for r in results]


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Convert 'purchased' column from string to boolean
session_df['purchased'] = session_df['purchased'].map({'Yes': True, 'No': False})

# Now calculate clicks and purchases safely
clicks = session_df.groupby('query')['clicked_product_id'].count().rename('clicks')
purchases = session_df[session_df['purchased']].groupby('query')['clicked_product_id'].count().rename('purchases')

# Compute Click-Through Rate (CTR)
query_ctr = pd.concat([clicks, purchases], axis=1).fillna(0)
query_ctr['CTR'] = query_ctr['purchases'] / query_ctr['clicks']


In [None]:
import pandas as pd

# Load required datasets
session_df = pd.read_csv("session_log.csv")
product_catalog = pd.read_csv("product_catalog.csv")[['product_id', 'brand', 'price']]
realtime_info = pd.read_csv("realtime_product_info.csv")[['product_id', 'offer_strength']]
session_df = session_df.rename(columns={'clicked_product_id': 'product_id'})
# Merge all relevant info
merged = session_df.merge(product_catalog, on='product_id', how='left') \
                   .merge(realtime_info, on='product_id', how='left')

# Map offer_strength to numeric
strength_map = {'Low': 1, 'Medium': 2, 'High': 3}
merged['offer_strength_score'] = merged['offer_strength'].map(strength_map)

# Define persona tagging function
def assign_persona(df):
    persona = []

    # 1. Brand-Loyalist: majority interactions with same brand
    brand_counts = df['brand'].value_counts(normalize=True)
    if not brand_counts.empty and brand_counts.iloc[0] >= 0.7:
        persona.append("brand_loyalist")

    # 2. Budget-Friendly: avg price < 300
    if df['price'].mean() < 300:
        persona.append("budget_friendly")

    # 3. Deal-Seeker: high average offer strength
    if df['offer_strength_score'].mean() >= 2.5:
        persona.append("deal_seeker")

    return ','.join(persona) if persona else 'general'

# Group by session and assign persona
session_persona_df = merged.groupby('session_id').apply(assign_persona).reset_index()
session_persona_df.columns = ['session_id', 'persona']

print(session_persona_df.head())


  session_id                     persona
0      S0001              brand_loyalist
1      S0002  brand_loyalist,deal_seeker
2      S0003  brand_loyalist,deal_seeker
3      S0004              brand_loyalist
4      S0005              brand_loyalist


  session_persona_df = merged.groupby('session_id').apply(assign_persona).reset_index()


In [None]:
# utils/convert_to_spacy_format.py

import pandas as pd
import spacy
from spacy.training import Example

def convert_to_spacy_format(csv_path):
    df = pd.read_csv(csv_path)
    data = []

    for _, row in df.iterrows():
        text = row['query']
        entities = eval(row['entities'])  # Format: [(start, end, "LABEL")]
        data.append((text, {"entities": entities}))

    return data


In [None]:
import pandas as pd
import spacy
from spacy.training import Example

def convert_bio_to_spacy_format(csv_path):
    df = pd.read_csv(csv_path)

    data = []
    grouped = df.groupby("query_id")

    for query_id, group in grouped:
        tokens = group["token"].tolist()
        tags = group["tag"].tolist()

        text = " ".join(tokens)
        entities = []

        current_pos = 0
        for token, tag in zip(tokens, tags):
            start = text.find(token, current_pos)
            end = start + len(token)

            if tag.startswith("B-"):
                entity_label = tag[2:]
                entities.append((start, end, entity_label))

            current_pos = end

        data.append((text, {"entities": entities}))

    return data

def train_spacy_ner(data_path, output_dir="spacy_ner_model"):
    data = convert_bio_to_spacy_format(data_path)
    nlp = spacy.blank("en")
    ner = nlp.add_pipe("ner")

    for _, annotations in data:
        for ent in annotations["entities"]:
            ner.add_label(ent[2])

    optimizer = nlp.begin_training()

    for itn in range(20):
        for text, annotations in data:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], drop=0.2, sgd=optimizer)

    nlp.to_disk(output_dir)
    print(f"✅ spaCy NER model saved to: {output_dir}")


In [None]:
train_spacy_ner("ner_dataset.csv")


✅ spaCy NER model saved to: spacy_ner_model


In [None]:
import spacy
import time

# ✅ Load your trained spaCy model
nlp = spacy.load("spacy_ner_model")

def extract_entities(text):
    start = time.time()
    doc = nlp(text)
    entities = {}
    for ent in doc.ents:
        entities.setdefault(ent.label_, []).append(ent.text)
    end = time.time()
    print(f"Query: '{text}'")
    print(f"Response: {{'query': '{text}', 'entities': {entities}}}")
    print(f"--- Prediction took: {end - start:.4f} seconds ---\n")

# ✅ Sample usage with multiple test queries
sample_queries = [
    "oneplus mobile",
    "i want a gaming laptop with 16gb ram",
    "addidas shoes",
    "red color shoes under 3000",
    "iphone 13 with best camera and storage",
    "can i get nike running shoes in blue"
]

for q in sample_queries:
    extract_entities(q)


Query: 'oneplus mobile'
Response: {'query': 'oneplus mobile', 'entities': {'BRAND': ['oneplus']}}
--- Prediction took: 0.0045 seconds ---

Query: 'i want a gaming laptop with 16gb ram'
Response: {'query': 'i want a gaming laptop with 16gb ram', 'entities': {'FEATURE': ['gaming'], 'PRODUCT': ['laptop']}}
--- Prediction took: 0.0121 seconds ---

Query: 'addidas shoes'
Response: {'query': 'addidas shoes', 'entities': {'PRODUCT': ['shoes']}}
--- Prediction took: 0.0045 seconds ---

Query: 'red color shoes under 3000'
Response: {'query': 'red color shoes under 3000', 'entities': {'PRODUCT': ['shoes'], 'PRICE': ['under']}}
--- Prediction took: 0.0041 seconds ---

Query: 'iphone 13 with best camera and storage'
Response: {'query': 'iphone 13 with best camera and storage', 'entities': {'PRODUCT': ['iphone']}}
--- Prediction took: 0.0044 seconds ---

Query: 'can i get nike running shoes in blue'
Response: {'query': 'can i get nike running shoes in blue', 'entities': {'BRAND': ['nike'], 'PRODUCT

In [None]:
!zip download.zip spacy_ner_model

  adding: spacy_ner_model/ (stored 0%)


In [None]:
from google.colab import files
files.download("download.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!zip -r spacy_ner_model.zip ./spacy_ner_model

  adding: spacy_ner_model/ (stored 0%)
  adding: spacy_ner_model/tokenizer (deflated 81%)
  adding: spacy_ner_model/vocab/ (stored 0%)
  adding: spacy_ner_model/vocab/strings.json (deflated 74%)
  adding: spacy_ner_model/vocab/lookups.bin (stored 0%)
  adding: spacy_ner_model/vocab/key2row (stored 0%)
  adding: spacy_ner_model/vocab/vectors.cfg (stored 0%)
  adding: spacy_ner_model/vocab/vectors (deflated 45%)
  adding: spacy_ner_model/ner/ (stored 0%)
  adding: spacy_ner_model/ner/moves (deflated 66%)
  adding: spacy_ner_model/ner/model (deflated 8%)
  adding: spacy_ner_model/ner/cfg (deflated 33%)
  adding: spacy_ner_model/meta.json (deflated 50%)
  adding: spacy_ner_model/config.cfg (deflated 59%)


Testing for million req. for second


In [None]:
!pip install fastapi uvicorn locust




In [None]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
import time
nlp = spacy.load("spacy_ner_model")

app = FastAPI()
ner = FastNER()  # Load once at startup

class Query(BaseModel):
    text: str

@app.post("/ner")
def extract_entities(q: Query):
    start = time.perf_counter()
    result = ner.get_entities(q.text)
    duration = time.perf_counter() - start
    return {"entities": result, "time": round(duration, 4)}


In [None]:
!gunicorn -w 8 -k uvicorn.workers.UvicornWorker main:app


/bin/bash: line 1: gunicorn: command not found


In [None]:
from locust import HttpUser, task, between
import random

queries = [
    "buy redmi mobiles under 10000",
    "i want a gaming laptop with 16gb ram",
    "blue addidas shoes",
    "iphone 15 pro max black",
    "can i get nike running shoes in blue",
]

class NerUser(HttpUser):
    wait_time = between(0.01, 0.02)  # simulate rapid requests

    @task
    def ner_request(self):
        query = random.choice(queries)
        self.client.post("/ner", json={"text": query})


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environm

RuntimeError: cannot release un-acquired lock

In [None]:
!pip install locust

It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.


AttributeError: module 'select' has no attribute 'epoll'

It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.
It seems that the gevent monkey-patching is being used.
Please set an environment variable with:
GEVENT_SUPPORT=True
to enable gevent support in the debugger.


In [None]:
!pip install requests



In [1]:
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import time
import csv

# Sample user queries
queries = [
    "buy redmi mobiles under 10000",
    "i want a gaming laptop with 16gb ram",
    "blue addidas shoes",
    "iphone 15 pro max black",
    "can i get nike running shoes in blue",
]

# ⚙️ Simulated API logic (replace with real logic if needed)
def mock_api(query):
    import time
    base_latency = 0.001
    penalty = min(len(query) / 1000, 0.01)  # Simulate penalty for long queries
    time.sleep(base_latency + penalty)
    return {
        "query": query,
        "status": 200,
        "intent": "search_product",
        "entities": {"brand": "mock_brand"}
    }


# 🧪 Simulate multiple "requests"
def simulate_requests(num_requests=1000000, concurrent_users=100):
    print(f"Simulating {num_requests:,} requests with {concurrent_users} concurrent threads")

    start_time = time()
    response_times = []
    success_count = 0

    def task():
        query = random.choice(queries)
        t0 = time()
        response = mock_api(query)
        t1 = time()
        elapsed = t1 - t0
        return response["status"], elapsed

    with ThreadPoolExecutor(max_workers=concurrent_users) as executor:
        futures = [executor.submit(task) for _ in range(num_requests)]
        for future in as_completed(futures):
            status, elapsed = future.result()
            if status == 200:
                success_count += 1
                response_times.append(elapsed)

    total_time = time() - start_time
    avg_response_time = sum(response_times) / len(response_times)
    throughput = len(response_times) / total_time

    print("\n Load Test Report:")
    print(f"Total requests: {num_requests:,}")
    print(f"Success: {success_count:,}")
    print(f"Total time: {total_time:.2f} sec")
    print(f"Throughput: {throughput:.2f} req/sec")
    print(f"Average response time: {avg_response_time:.4f} sec")


    with open("load_test_report.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Request#", "ResponseTime(sec)"])
        for i, rt in enumerate(response_times):
            writer.writerow([i+1, rt])

    print("📄 Report saved to 'load_test_report.csv'")


simulate_requests(num_requests=1000000, concurrent_users=100)


Simulating 1,000,000 requests with 100 concurrent threads

 Load Test Report:
Total requests: 1,000,000
Success: 1,000,000
Total time: 146.92 sec
Throughput: 6806.44 req/sec
Average response time: 0.0144 sec
📄 Report saved to 'load_test_report.csv'


Auto-suggest re-ranker


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
import warnings

warnings.filterwarnings('ignore')

# Paths to your CSV files and where to save the model
USER_QUERIES_CSV    = "user_queries.csv"
SESSION_LOG_CSV     = "session_log.csv"
PRODUCT_CATALOG_CSV = "product_catalog.csv"
REALTIME_CSV        = "realtime_product_info.csv"
NER_CSV             = "ner_training_dataset.csv"  # Added NER dataset
OUTPUT_MODEL_PATH   = "ltr_model.txt"

# Updated feature columns used for training, incorporating all your requests
FEATURE_COLS = [
    'query_frequency',          # Global popularity of the suggestion
    'event_count',              # Clicks for this query in the session log
    'semantic_score',           # Core semantic similarity (query vs. suggestion)
    'brand_match',              # Simple string match of brand in suggestion
    'price_gap_to_avg',         # Price difference from user's clicked average
    'offer_preference_match',   # Matches strong offers with "offer_seeker" persona
    'persona_score',            # Numerical score for the derived user persona
    'avg_price_last_k_clicks',  # User's price affinity from session clicks
    'preferred_brand_match',    # 1 if suggestion matches a user's favorite brand
    'session_length',           # Number of events in the current session
    'query_intent_score',       # Semantic score, boosted by NER brand match
    'product_price',            # Average price of products matching suggestion
    'offer_strength',           # Average offer strength for matched products
    'product_rating',           # Average rating for matched products
    'product_click_count',      # Historical clicks for matched products
    'is_f_assured'              # Whether the product is Flipkart Assured
]

# LightGBM parameters for LambdaRank
LGB_PARAMS = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'eval_at': [1, 3, 5, 10],
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'verbose': -1
}

# 1) Load all raw data
print("📥 Loading all CSVs…")
user_q   = pd.read_csv(USER_QUERIES_CSV)
session  = pd.read_csv(SESSION_LOG_CSV)
products = pd.read_csv(PRODUCT_CATALOG_CSV)
realtime = pd.read_csv(REALTIME_CSV)
ner_data = pd.read_csv(NER_CSV)

# --- Data Cleaning and Pre-computation ---

# Normalize query strings for consistent merging
session['query'] = session['query'].str.lower().str.strip()
user_q['raw_query'] = user_q['raw_query'].str.lower().str.strip()

# Clean and convert query_id in NER data to numeric for merging
print("🧑‍💻 Processing NER data…")
ner_data['query_id'] = pd.to_numeric(ner_data['query_id'].str.replace('Q', ''), errors='coerce').astype('Int64')
ner_data.dropna(subset=['query_id'], inplace=True)

# Precompute embeddings for all unique suggestions for efficiency
print("🧠 Computing embeddings for all unique suggestions…")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
uq_df = (
    user_q[['corrected_query', 'frequency']]
    .drop_duplicates(subset='corrected_query')
    .reset_index(drop=True)
)
uq_df['emb'] = embedder.encode(
    uq_df['corrected_query'].tolist(),
    convert_to_tensor=False,
    show_progress_bar=True
).tolist()

# Build lookup dictionaries for fast feature access
emb_dict  = dict(zip(uq_df['corrected_query'], uq_df['emb']))
freq_dict = user_q.groupby('corrected_query')['frequency'].sum().to_dict()
emb_dim   = len(next(iter(emb_dict.values())))

# 2) Feature Engineering - Pre-computation

# Extract NER-based brand tags to create a query-to-brand mapping
print("🏷️ Extracting NER brand tags…")
ner_brands = ner_data[ner_data['tag'] == 'B-BRAND'][['query_id', 'token']]
# The user_queries.csv from the prompt has a different 'query_id' format than NER, so we merge on query text
ner_brands = ner_brands.merge(user_q[['query_id', 'corrected_query']], on='query_id', how='left')
brand_query_dict = dict(zip(ner_brands['corrected_query'], ner_brands['token']))
print(f"    Found {len(brand_query_dict)} brand-specific queries.")

# Derive persona tags based on session behavior
print("🧑 Deriving user personas from session logs…")
session_agg = session.groupby('session_id').agg(
    click_count=('clicked_product_id', 'count'),
    purchase_count=('purchased', lambda x: (x == 'Yes').sum())
).reset_index()

# Define persona logic
session_agg['persona_tag'] = 'budget_friendly' # Default
session_agg.loc[session_agg['purchase_count'] > 0, 'persona_tag'] = 'offer_seeker'
high_click_threshold = session_agg['click_count'].quantile(0.75)
session_agg.loc[session_agg['click_count'] > high_click_threshold, 'persona_tag'] = 'brand_loyalist'
session = session.merge(session_agg[['session_id', 'persona_tag']], on='session_id', how='left')

# Compute preferred brands for each session
print("❤️ Identifying preferred brands per session…")
clicked_brands = session.dropna(subset=['clicked_product_id']).merge(products, left_on='clicked_product_id', right_on='product_id', how='left')
preferred_brands = clicked_brands.groupby('session_id')['brand'].value_counts().groupby(level=0).head(1).reset_index(name='brand_count')

# 3) Build training examples: positives (clicks) + negatives (random non-clicks)
print("➕➖ Building positive/negative examples…")

# --- MODIFIED LOGIC ---
# We now merge on the assumption that the 'query' in the session log
# is the CANONICAL/CORRECTED query that the user clicked on.
pos = (
    session[session['event'] == 'click']
    .merge(user_q, left_on='query', right_on='corrected_query', how='inner') # The key change is here
    [['query', 'session_id']] # Keep session_id for session features
    .rename(columns={'query': 'suggestion'})
    .assign(label=1)
    .drop_duplicates()
)

# This check remains to ensure the new logic works with your data
if pos.empty:
    raise ValueError("Critical Error: No positive examples could be generated even after matching on 'corrected_query'. Please verify that 'query' values from click events in session_log.csv exist as 'corrected_query' values in user_queries.csv.")

neg_rows = []
all_sugs = list(emb_dict.keys())
for _, row in pos.iterrows():
    q = row['suggestion'] # The query and suggestion are the same for positive examples
    sid = row['session_id']
    clicked_sug = row['suggestion']

    candidates = list(set(all_sugs) - {clicked_sug})
    if not candidates: continue

    # Sample one negative for each positive
    sampled_neg = np.random.choice(candidates, size=1, replace=False)[0]
    # For negative examples, the 'query' is the context, and 'suggestion' is the irrelevant item
    neg_rows.append({'query': q, 'suggestion': sampled_neg, 'session_id': sid, 'label': 0})

neg = pd.DataFrame(neg_rows)

# Combine positive & negative examples and assign group IDs
df = pd.concat([pos, neg], ignore_index=True)
# The original 'query' for positive examples is the suggestion itself.
# For negative examples, we need to fill the 'query' column correctly.
df['query'] = df.groupby('session_id')['suggestion'].transform('first')
df['query_id'] = df.groupby(['query', 'session_id']).ngroup()

# 4) Extract all features for each (query, suggestion, session)
print("🔨 Extracting comprehensive features…")
feat_list = []
brands_set = set(products['brand'].dropna().str.lower())

for _, row in df.iterrows():
    q = str(row['query'])
    sug = str(row['suggestion'])
    sid = row['session_id']
    f = {}

    # --- Session/User Features ---
    session_data = session[session['session_id'] == sid]
    f['session_length'] = len(session_data)
    persona = session_data['persona_tag'].iloc[0] if not session_data.empty else 'budget_friendly'
    f['persona_score'] = {'brand_loyalist': 1.0, 'offer_seeker': 0.5, 'budget_friendly': 0.1}.get(persona, 0.1)

    clicked_prods_in_session = clicked_brands[clicked_brands['session_id'] == sid]
    f['avg_price_last_k_clicks'] = clicked_prods_in_session['price'].mean() if not clicked_prods_in_session.empty else 0.0

    pref_brand_series = preferred_brands[preferred_brands['session_id'] == sid]
    pref_brand = pref_brand_series['brand'].iloc[0].lower() if not pref_brand_series.empty else ''
    f['preferred_brand_match'] = 1 if pref_brand and pref_brand in sug.lower() else 0

    f['query_frequency'] = freq_dict.get(sug, 0)
    f['event_count'] = session[(session['query'] == q) & (session['event'] == 'click')].shape[0]

    # --- Product & Interaction Features ---
    q_emb = embedder.encode([q])
    c_emb = np.array(emb_dict.get(sug, [0]*emb_dim))[None, :]

    # Semantic Score (base)
    f['semantic_score'] = float(cosine_similarity(q_emb, c_emb)[0, 0])

    # Query Intent Score (boosted semantic score)
    f['query_intent_score'] = f['semantic_score']
    if sug in brand_query_dict:
        f['query_intent_score'] *= 1.2 # Boost score if suggestion is a known brand query

    f['brand_match'] = 1 if any(b in sug.lower() for b in brands_set) else 0

    # Find products matching the suggestion text
    matched_products = products[products['title'].str.contains(sug, case=False, na=False)]
    if not matched_products.empty:
        matched_ids = matched_products['product_id']
        realtime_info = realtime[realtime['product_id'].isin(matched_ids)]

        f['product_price'] = matched_products['price'].mean()
        f['is_f_assured'] = 1 if matched_products['is_f_assured'].eq('Yes').any() else 0

        f['offer_strength'] = realtime_info['offer_strength'].map({'Low': 1, 'Medium': 2, 'High': 3}).mean()
        f['product_rating'] = realtime_info['rating'].mean()
        f['product_click_count'] = session[session['clicked_product_id'].isin(matched_ids)].shape[0]
    else:
        # Default values if no product matches the suggestion
        f['product_price'] = 0.0
        f['is_f_assured'] = 0
        f['offer_strength'] = 0.0
        f['product_rating'] = 0.0
        f['product_click_count'] = 0

    f['price_gap_to_avg'] = f['product_price'] - f['avg_price_last_k_clicks']
    f['offer_preference_match'] = 1 if (f['offer_strength'] >= 2 and persona == 'offer_seeker') else 0

    feat_list.append(f)

feat_df = pd.DataFrame(feat_list).fillna(0) # Fill any NaNs with 0
train_df = pd.concat([df[['query_id', 'label']].reset_index(drop=True), feat_df], axis=1)

# 5) Split into train/validation with grouping for LambdaRank
print("📊 Splitting train/validation sets…")
# Ensure all query_id groups have at least 2 samples for stratification
query_counts = train_df['query_id'].value_counts()
valid_queries = query_counts[query_counts >= 2].index
train_df_stratify = train_df[train_df['query_id'].isin(valid_queries)]

tr, va = train_test_split(
    train_df_stratify,
    test_size=0.2,
    random_state=42,
    stratify=train_df_stratify['query_id']
)

group_tr = tr.groupby('query_id').size().to_list()
group_va = va.groupby('query_id').size().to_list()

lgb_tr = lgb.Dataset(tr[FEATURE_COLS], label=tr['label'], group=group_tr)
lgb_va = lgb.Dataset(va[FEATURE_COLS], label=va['label'], group=group_va, reference=lgb_tr)

# 6) Train & save model
print("🚀 Training LightGBM LambdaRank model…")
bst = lgb.train(
    LGB_PARAMS,
    lgb_tr,
    valid_sets=[lgb_tr, lgb_va],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=True),
        lgb.log_evaluation(period=100)
    ]
)

print(f"💾 Saving model to {OUTPUT_MODEL_PATH}")
bst.save_model(OUTPUT_MODEL_PATH)
print("✅ Training complete and model saved successfully!")

📥 Loading all CSVs…
🧑‍💻 Processing NER data…
🧠 Computing embeddings for all unique suggestions…


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

🏷️ Extracting NER brand tags…
    Found 212 brand-specific queries.
🧑 Deriving user personas from session logs…
❤️ Identifying preferred brands per session…
➕➖ Building positive/negative examples…


ValueError: Critical Error: No positive examples could be generated even after matching on 'corrected_query'. Please verify that 'query' values from click events in session_log.csv exist as 'corrected_query' values in user_queries.csv.

In [None]:
# full_train_pipeline.py

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb

# Paths to your CSV files and where to save the model
USER_QUERIES_CSV    = "user_queries.csv"
SESSION_LOG_CSV     = "session_log.csv"
PRODUCT_CATALOG_CSV = "product_catalog.csv"
REALTIME_CSV        = "realtime_product_info.csv"
OUTPUT_MODEL_PATH   = "ltr_model.txt"

# Feature columns used for training
FEATURE_COLS = [
    'frequency',
    'event_count',
    'semantic_score',
    'brand_match',
    'price_gap_to_avg',
    'offer_preference_match'
]

# LightGBM parameters for LambdaRank
LGB_PARAMS = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'eval_at': [1, 3, 5, 10],
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'verbose': -1
}

# 1) Load raw data
print("📥 Loading CSVs…")
user_q   = pd.read_csv(USER_QUERIES_CSV)
session  = pd.read_csv(SESSION_LOG_CSV)
products = pd.read_csv(PRODUCT_CATALOG_CSV)
realtime = pd.read_csv(REALTIME_CSV)

# 2) Precompute embeddings for each unique suggestion
print("🧠 Computing embeddings for all unique suggestions…")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Deduplicate suggestions and encode once
uq_df = (
    user_q[['corrected_query', 'frequency']]
    .drop_duplicates(subset='corrected_query')
    .reset_index(drop=True)
)
uq_df['emb'] = embedder.encode(
    uq_df['corrected_query'].tolist(),
    convert_to_tensor=False
).tolist()
# Build lookup dicts
emb_dict  = dict(zip(uq_df['corrected_query'], uq_df['emb']))
freq_dict = user_q.groupby('corrected_query')['frequency'].sum().to_dict()
emb_dim   = len(next(iter(emb_dict.values())))

# 3) Build training examples: positives (clicks) + negatives (random non-clicks)
print("⚙️ Building positive/negative examples…")
pos = (
    session[session['event'] == 'click']
    .merge(user_q, left_on='query', right_on='raw_query', how='inner')
    [['query', 'corrected_query']]
    .rename(columns={'corrected_query': 'suggestion'})
    .assign(label=1)
)

neg_rows = []
all_sugs  = list(emb_dict.keys())
for q, grp in pos.groupby('query'):
    clicked    = set(grp['suggestion'])
    candidates = list(set(all_sugs) - clicked)
    n          = len(clicked)
    sampled    = np.random.choice(candidates, size=n, replace=False)
    for s in sampled:
        neg_rows.append({'query': q, 'suggestion': s, 'label': 0})
neg = pd.DataFrame(neg_rows)

# Combine positive & negative examples and assign group IDs
df = pd.concat([pos, neg], ignore_index=True)
df['query_id'] = df['query'].factorize()[0]

# 4) Extract features for each (query, suggestion)
print("🔨 Extracting features…")
feat_list = []
brands    = products['brand'].dropna().unique()
for _, row in df.iterrows():
    q   = str(row['query'])
    sug = str(row['suggestion'])
    f   = {}

    # 1) frequency
    f['frequency'] = freq_dict.get(sug, 0)

    # 2) event_count (clicks per query)
    f['event_count'] = (
        session[(session['query'] == q) & (session['event'] == 'click')]
        .shape[0]
    )

    # 3) semantic similarity
    q_emb = embedder.encode([q])
    c_emb = np.array(emb_dict.get(sug, [0]*emb_dim))[None, :]
    f['semantic_score'] = float(cosine_similarity(q_emb, c_emb)[0, 0])

    # 4) brand match
    f['brand_match'] = int(any(b.lower() in sug.lower() for b in brands))

    # 5) price_gap_to_avg
    clicked = (
        session[session['query'] == q]
        .merge(products, left_on='clicked_product_id', right_on='product_id')
    )
    avg_price = clicked['price'].mean() if not clicked.empty else 0.0
    matched   = products[
        products['title'].str.contains(sug, case=False, na=False)
    ]
    sug_price = matched['price'].mean() if not matched.empty else 0.0
    f['price_gap_to_avg'] = float(sug_price - avg_price)

    # 6) offer_preference_match (e.g., checking for strength==2)
    prods = matched['product_id'].tolist()
    off   = realtime[realtime['product_id'].isin(prods)]['offer_strength']
    f['offer_preference_match'] = int((off == 2).any())

    feat_list.append(f)

feat_df  = pd.DataFrame(feat_list)
train_df = pd.concat(
    [df[['query_id', 'label']].reset_index(drop=True), feat_df],
    axis=1
)

# 5) Split into train/validation with grouping for LambdaRank
print("📊 Splitting train/validation…")
tr, va = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['query_id']
)
group_tr = tr.groupby('query_id').size().tolist()
group_va = va.groupby('query_id').size().tolist()

lgb_tr = lgb.Dataset(
    tr[FEATURE_COLS],
    label=tr['label'],
    group=group_tr
)
lgb_va = lgb.Dataset(
    va[FEATURE_COLS],
    label=va['label'],
    group=group_va,
    reference=lgb_tr
)

# 6) Train & save model using callbacks for early stopping
print("🚀 Training LightGBM LambdaRank with callbacks…")
bst = lgb.train(
    LGB_PARAMS,
    lgb_tr,
    valid_sets=[lgb_tr, lgb_va],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

print(f"💾 Saving model to {OUTPUT_MODEL_PATH}")
bst.save_model(OUTPUT_MODEL_PATH)
print("✅ Training complete and model saved.")


📥 Loading CSVs…
🧠 Computing embeddings for all unique suggestions…
⚙️ Building positive/negative examples…
🔨 Extracting features…
📊 Splitting train/validation…


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

!pip install fastapi uvicorn pydantic numpy pandas scikit-learn lightgbm faiss-cpu

In [None]:
# train_ltr_model_final.py
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings

warnings.filterwarnings('ignore')

# --- Configuration: Set your file paths here ---
PATH_SESSION_LOG = "/content/session_log.csv"
PATH_PRODUCT_CATALOG = "/content/product_catalog.csv"
PATH_REALTIME_INFO = "/content/realtime_product_info.csv"
PATH_USER_QUERY = "/content/user_queries.csv"
OUTPUT_MODEL_PATH = "./lgbm_model.txt"

def train_ranking_model():
    """
    Loads data, engineers features, and trains a LightGBM LTR model.
    """
    print("Step 1: Loading datasets...")
    try:
        # ------------------- FINAL FIX IS HERE -------------------
        # Changed separator to a comma based on your diagnostic output.
        sessions = pd.read_csv(PATH_SESSION_LOG, sep=',')
        # ---------------------------------------------------------

        products = pd.read_csv(PATH_PRODUCT_CATALOG)
        realtime = pd.read_csv(PATH_REALTIME_INFO)
        queries_meta = pd.read_csv(PATH_USER_QUERY)
    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found. Please check your paths. Details: {e}")
        return

    print("Step 2: Preparing data and creating relevance labels...")
    products = pd.merge(products, realtime[['product_id', 'current_price', 'rating', 'review_count']], on='product_id', how='left')
    products['current_price'].fillna(products['price'], inplace=True)

    # This will now work correctly.
    sessions['relevance'] = sessions['purchased'].apply(lambda x: 2 if x == 'Yes' else 1)

    sessions = sessions.rename(columns={'clicked_product_id': 'product_id'})
    training_data = pd.merge(sessions, products, on='product_id')

    print("Step 3: Performing negative sampling...")
    negative_samples = []
    unique_queries = training_data[['session_id', 'query']].drop_duplicates()
    all_product_ids = products['product_id'].unique()
    for _, row in unique_queries.iterrows():
        sid, qry = row['session_id'], row['query']
        clicked_products = set(training_data[(training_data['session_id'] == sid) & (training_data['query'] == qry)]['product_id'])
        available_for_sampling = list(set(all_product_ids) - clicked_products)
        num_samples = min(len(available_for_sampling), 5)
        if num_samples > 0:
            sampled_ids = np.random.choice(available_for_sampling, num_samples, replace=False)
            for pid in sampled_ids:
                negative_samples.append({'session_id': sid, 'query': qry, 'product_id': pid, 'relevance': 0})
    if negative_samples:
        neg_df = pd.DataFrame(negative_samples)
        neg_df = pd.merge(neg_df, products, on='product_id')
        training_data = pd.concat([training_data, neg_df], ignore_index=True)

    print("Step 4: Engineering features...")
    training_data = pd.merge(training_data, queries_meta[['raw_query', 'frequency']], left_on='query', right_on='raw_query', how='left')
    training_data['frequency'].fillna(0, inplace=True)
    avg_price_map = training_data[training_data['relevance'] > 0].groupby('session_id')['current_price'].mean()
    training_data['avg_price_session_clicks'] = training_data['session_id'].map(avg_price_map).fillna(0)
    training_data['brand_match_in_query'] = training_data.apply(lambda row: 1 if str(row.get('brand')).lower() in str(row.get('query')).lower() else 0, axis=1)
    training_data['price_gap_to_avg'] = training_data['current_price'] - training_data['avg_price_session_clicks']
    training_data['review_count'] = np.log1p(training_data['review_count'].fillna(0))
    training_data['frequency'] = np.log1p(training_data['frequency'])

    print("Step 5: Preparing data for LightGBM LTR format...")
    training_data = training_data.sort_values(by='session_id').reset_index(drop=True)
    group_counts = training_data.groupby('session_id').size().to_list()
    feature_columns = [
        'is_f_assured', 'rating', 'review_count', 'current_price', 'frequency',
        'avg_price_session_clicks', 'brand_match_in_query', 'price_gap_to_avg'
    ]
    training_data['is_f_assured'] = (training_data['is_f_assured'] == 'Yes').astype(int)
    X = training_data[feature_columns]
    y = training_data['relevance']

    if X.empty:
        print("❌ ERROR: No data available for training after processing steps.")
        return

    print("Step 6: Training the LTR model...")
    ranker = lgb.LGBMRanker(
        objective="lambdarank", metric="ndcg", n_estimators=500,
        learning_rate=0.05, num_leaves=31, random_state=42, n_jobs=-1,
    )
    ranker.fit(
        X, y, group=group_counts, eval_set=[(X, y)], eval_group=[group_counts],
        eval_at=[5], callbacks=[lgb.early_stopping(10, verbose=True)]
    )

    print(f"Step 7: Saving the trained model to '{OUTPUT_MODEL_PATH}'...")
    ranker.booster_.save_model(OUTPUT_MODEL_PATH)
    print("✅ Model training complete and saved successfully!")

    print("\n--- Learned Feature Importances ---")
    importance_df = pd.DataFrame({
        'feature': feature_columns,
        'importance': ranker.feature_importances_
    }).sort_values('importance', ascending=False)
    print(importance_df)

if __name__ == "__main__":
    train_ranking_model()

Step 1: Loading datasets...
Step 2: Preparing data and creating relevance labels...
Step 3: Performing negative sampling...
Step 4: Engineering features...
Step 5: Preparing data for LightGBM LTR format...
Step 6: Training the LTR model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 879
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 7
Step 7: Saving the trained model to './lgbm_model.txt'...
✅ Model training complete and saved successfully!

--- Learned Feature Importances ---
                    feature  importance
3             current_price         586
2              review_count         436
7          price_gap_to_avg         384
1                    rating         311
5  avg_price_session_clicks         285
0              is_f_assur

In [None]:
# train_ltr_model_advanced_features.py
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
import random

warnings.filterwarnings('ignore')

# --- Configuration: Set your file paths here ---
PATH_SESSION_LOG = "/content/session_log.csv"
PATH_PRODUCT_CATALOG = "/content/product_catalog.csv"
PATH_REALTIME_INFO = "/content/realtime_product_info.csv"
PATH_USER_QUERY = "/content/user_queries.csv"
OUTPUT_MODEL_PATH = "./lgbm_model.txt"

def train_ranking_model():
    """
    Loads data, engineers a comprehensive feature set, and trains a LightGBM LTR model.
    """
    print("Step 1: Loading datasets...")
    try:
        sessions = pd.read_csv(PATH_SESSION_LOG, sep=',')
        products = pd.read_csv(PATH_PRODUCT_CATALOG)
        realtime = pd.read_csv(PATH_REALTIME_INFO)
        queries_meta = pd.read_csv(PATH_USER_QUERY)
    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found. Please check your paths. Details: {e}")
        return

    print("Step 2: Preparing data and creating relevance labels...")

    # --- UPDATED: Merge more real-time info ---
    products = pd.merge(
        products,
        realtime[['product_id', 'current_price', 'rating', 'review_count', 'offer_strength']],
        on='product_id',
        how='left'
    )
    products['current_price'].fillna(products['price'], inplace=True)

    sessions['relevance'] = sessions['purchased'].apply(lambda x: 2 if x == 'Yes' else 1)
    sessions = sessions.rename(columns={'clicked_product_id': 'product_id'})
    training_data = pd.merge(sessions, products, on='product_id')

    # ... (Step 3: Negative Sampling remains the same) ...
    print("Step 3: Performing negative sampling...")
    negative_samples = []
    unique_queries = training_data[['session_id', 'query']].drop_duplicates()
    all_product_ids = products['product_id'].unique()
    for _, row in unique_queries.iterrows():
        sid, qry = row['session_id'], row['query']
        clicked_products = set(training_data[(training_data['session_id'] == sid) & (training_data['query'] == qry)]['product_id'])
        available_for_sampling = list(set(all_product_ids) - clicked_products)
        num_samples = min(len(available_for_sampling), 5)
        if num_samples > 0:
            sampled_ids = np.random.choice(available_for_sampling, num_samples, replace=False)
            for pid in sampled_ids:
                negative_samples.append({'session_id': sid, 'query': qry, 'product_id': pid, 'relevance': 0})
    if negative_samples:
        neg_df = pd.DataFrame(negative_samples)
        neg_df = pd.merge(neg_df, products, on='product_id')
        training_data = pd.concat([training_data, neg_df], ignore_index=True)


    print("Step 4: Engineering advanced features...")
    training_data = pd.merge(training_data, queries_meta[['raw_query', 'frequency']], left_on='query', right_on='raw_query', how='left')
    training_data['frequency'] = np.log1p(training_data['frequency'].fillna(0))

    # --- NEW: Session/User Features ---
    training_data['session_length'] = training_data.groupby('session_id')['session_id'].transform('count')
    avg_price_map = training_data[training_data['relevance'] > 0].groupby('session_id')['current_price'].mean()
    training_data['avg_price_session_clicks'] = training_data['session_id'].map(avg_price_map).fillna(0)

    # --- NEW: Product Features ---
    offer_map = {'Low': 1, 'Medium': 2, 'High': 3}
    training_data['offer_strength_numeric'] = training_data['offer_strength'].map(offer_map).fillna(0)

    # --- NEW: Interaction Features ---
    # Simulate persona_tag for demonstration purposes
    personas = ["brand_loyalist", "budget_friendly", "offer_seeker"]
    session_persona_map = {sid: random.choice(personas) for sid in training_data['session_id'].unique()}
    training_data['persona_tag'] = training_data['session_id'].map(session_persona_map)

    def calculate_offer_match(row):
        if row['persona_tag'] == 'offer_seeker' and row['offer_strength_numeric'] == 3: # High offer
            return 1
        return 0
    training_data['offer_preference_match'] = training_data.apply(calculate_offer_match, axis=1)

    training_data['brand_match_in_query'] = training_data.apply(lambda row: 1 if str(row.get('brand')).lower() in str(row.get('query')).lower() else 0, axis=1)
    training_data['price_gap_to_avg'] = training_data['current_price'] - training_data['avg_price_session_clicks']

    # Feature for category match (from previous step)
    def calculate_category_match(row):
        query_words = set(str(row.get('query')).lower().split())
        category_words = set(str(row.get('category', '')).lower().split())
        if query_words.intersection(category_words): return 1.0
        return 0.0
    training_data['category_match_score'] = training_data.apply(calculate_category_match, axis=1)

    print("Step 5: Preparing data for LightGBM LTR format...")
    training_data = training_data.sort_values(by='session_id').reset_index(drop=True)
    group_counts = training_data.groupby('session_id').size().to_list()

    # --- UPDATED: Final, comprehensive feature list ---
    feature_columns = [
        'is_f_assured', 'rating', 'review_count', 'current_price', 'frequency',
        'avg_price_session_clicks', 'brand_match_in_query', 'price_gap_to_avg',
        'category_match_score', 'session_length', 'offer_strength_numeric',
        'offer_preference_match'
    ]

    training_data['is_f_assured'] = (training_data['is_f_assured'] == 'Yes').astype(int)
    training_data.fillna(0, inplace=True) # Fill any remaining NaNs

    X = training_data[feature_columns]
    y = training_data['relevance']

    if X.empty:
        print("❌ ERROR: No data available for training after processing steps.")
        return

    print("Step 6: Training the LTR model...")
    ranker = lgb.LGBMRanker(
        objective="lambdarank", metric="ndcg", n_estimators=500,
        learning_rate=0.05, num_leaves=31, random_state=42, n_jobs=-1,
    )
    ranker.fit(
        X, y, group=group_counts, eval_set=[(X, y)], eval_group=[group_counts],
        eval_at=[5], callbacks=[lgb.early_stopping(10, verbose=True)]
    )

    print(f"Step 7: Saving the trained model to '{OUTPUT_MODEL_PATH}'...")
    ranker.booster_.save_model(OUTPUT_MODEL_PATH)
    print("✅ Model training complete and saved successfully!")

    print("\n--- Learned Feature Importances ---")
    importance_df = pd.DataFrame({
        'feature': ranker.feature_name_,
        'importance': ranker.feature_importances_
    }).sort_values('importance', ascending=False)
    print(importance_df)

if __name__ == "__main__":
    train_ranking_model()

Step 1: Loading datasets...
Step 2: Preparing data and creating relevance labels...
Step 3: Performing negative sampling...
Step 4: Engineering advanced features...
Step 5: Preparing data for LightGBM LTR format...
❌ ERROR: No data available for training after processing steps.


In [None]:
# train_on_master_dataset.py
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
import random

warnings.filterwarnings('ignore')

# --- Configuration ---
PATH_MASTER_DATA = "./master_training_data.csv"
OUTPUT_MODEL_PATH = "./lgbm_model.txt"

def train_ranking_model():
    """
    Loads the unified master dataset, engineers features, and trains the LTR model.
    """
    # --- Step 1: Load the Master Training Dataset ---
    print("Step 1: Loading the master training dataset...")
    try:
        # Load the single, pre-processed file created by the generation script
        training_data = pd.read_csv(PATH_MASTER_DATA)
    except FileNotFoundError:
        print(f"❌ ERROR: '{PATH_MASTER_DATA}' not found. Please run the generation script first.")
        return

    print("✅ Master dataset loaded successfully.")

    # --- Step 2: Engineer Advanced Features ---
    # The data is already clean, so we can go straight to feature engineering.
    print("Step 2: Engineering advanced features...")

    # a. Session/User Features (grouping by 'query' as a proxy for session)
    training_data['session_length'] = training_data.groupby('query')['query'].transform('count')
    avg_price_map = training_data[training_data['relevance'] > 0].groupby('query')['current_price'].mean()
    training_data['avg_price_session_clicks'] = training_data['query'].map(avg_price_map).fillna(0)

    # b. Product Features
    offer_map = {'Low': 1, 'Medium': 2, 'High': 3}
    training_data['offer_strength_numeric'] = training_data['offer_strength'].map(offer_map).fillna(0)

    # c. Interaction Features
    # Simulate persona_tag for demonstration purposes
    personas = ["brand_loyalist", "budget_friendly", "offer_seeker"]
    query_persona_map = {qry: random.choice(personas) for qry in training_data['query'].unique()}
    training_data['persona_tag'] = training_data['query'].map(query_persona_map)

    def calculate_offer_match(row):
        if row['persona_tag'] == 'offer_seeker' and row['offer_strength_numeric'] == 3: # High offer
            return 1
        return 0
    training_data['offer_preference_match'] = training_data.apply(calculate_offer_match, axis=1)

    training_data['brand_match_in_query'] = training_data.apply(lambda row: 1 if str(row.get('brand')).lower() in str(row.get('query')).lower() else 0, axis=1)
    training_data['price_gap_to_avg'] = training_data['current_price'] - training_data['avg_price_session_clicks']

    def calculate_category_match(row):
        query_words = set(str(row.get('query')).lower().split())
        category_words = set(str(row.get('category', '')).lower().split())
        if query_words.intersection(category_words): return 1.0
        return 0.0
    training_data['category_match_score'] = training_data.apply(calculate_category_match, axis=1)

    # Final data cleaning on engineered features
    training_data.fillna(0, inplace=True)


    # --- Step 3: Prepare Data for LightGBM LTR Format ---
    print("Step 3: Preparing data for LightGBM LTR format...")
    # Sort data by query, which is our grouping key
    training_data = training_data.sort_values(by='query').reset_index(drop=True)

    # Create the 'group' array for LTR
    group_counts = training_data.groupby('query').size().to_list()

    feature_columns = [
        'is_f_assured', 'rating', 'review_count', 'current_price', 'frequency',
        'avg_price_session_clicks', 'brand_match_in_query', 'price_gap_to_avg',
        'category_match_score', 'session_length', 'offer_strength_numeric',
        'offer_preference_match'
    ]

    training_data['is_f_assured'] = (training_data['is_f_assured'] == True).astype(int) # Handle boolean if needed

    X = training_data[feature_columns]
    y = training_data['relevance']

    if X.empty:
        print("❌ ERROR: No data available for training after processing steps.")
        return

    # --- Step 4: Train the LTR Model ---
    print("Step 4: Training the LTR model...")
    ranker = lgb.LGBMRanker(
        objective="lambdarank", metric="ndcg", n_estimators=500,
        learning_rate=0.05, num_leaves=31, random_state=42, n_jobs=-1,
    )
    ranker.fit(
        X, y, group=group_counts, eval_set=[(X, y)], eval_group=[group_counts],
        eval_at=[5], callbacks=[lgb.early_stopping(10, verbose=True)]
    )

    # --- Step 5: Save the Model and Show Importances ---
    print(f"Step 5: Saving the trained model to '{OUTPUT_MODEL_PATH}'...")
    ranker.booster_.save_model(OUTPUT_MODEL_PATH)
    print("✅ Model training complete and saved successfully!")

    print("\n--- Learned Feature Importances ---")
    importance_df = pd.DataFrame({
        'feature': ranker.feature_name_,
        'importance': ranker.feature_importances_
    }).sort_values('importance', ascending=False)
    print(importance_df)

if __name__ == "__main__":
    train_ranking_model()

Step 1: Loading the master training dataset...
❌ ERROR: './master_training_data.csv' not found. Please run the generation script first.


In [None]:
# train_on_final_features.py
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
import random

warnings.filterwarnings('ignore')

# --- Configuration ---
PATH_MASTER_DATA = "/content/master_data.csv"
OUTPUT_MODEL_PATH = "lgbm_model.txt"

def train_ranking_model():
    """
    Loads the unified master dataset, engineers features, and trains the LTR model
    based on the specified feature set.
    """
    # --- Step 1: Load the Master Training Dataset ---
    print("Step 1: Loading the master training dataset...")
    try:
        training_data = pd.read_csv(PATH_MASTER_DATA)
    except FileNotFoundError:
        print(f"❌ ERROR: '{PATH_MASTER_DATA}' not found. Please run the data generation script first.")
        return

    print(f"✅ Master dataset with {len(training_data)} rows loaded successfully.")

    # --- Step 2: Engineer Advanced Features ---
    print("Step 2: Engineering features from the master file...")

    # a. Group-based features
    training_data['session_length'] = training_data.groupby('query')['query'].transform('count')
    avg_price_map = training_data[training_data['relevance'] > 0].groupby('query')['current_price'].mean()
    training_data['avg_price_session_clicks'] = training_data['query'].map(avg_price_map).fillna(0)

    # b. Convert categorical features to numeric
    offer_map = {'Low': 1, 'Medium': 2, 'High': 3}
    training_data['offer_strength_numeric'] = training_data['offer_strength'].map(offer_map).fillna(0)

    # c. Interaction Features
    personas = ["brand_loyalist", "budget_friendly", "offer_seeker"]
    query_persona_map = {qry: random.choice(personas) for qry in training_data['query'].unique()}
    training_data['persona_tag'] = training_data['query'].map(query_persona_map)

    training_data['offer_preference_match'] = training_data.apply(
        lambda row: 1 if row['persona_tag'] == 'offer_seeker' and row['offer_strength_numeric'] == 3 else 0,
        axis=1
    )

    training_data['price_gap_to_avg'] = training_data['current_price'] - training_data['avg_price_session_clicks']
    training_data.fillna(0, inplace=True)

    # --- Step 3: Prepare Data for LightGBM LTR Format ---
    print("Step 3: Preparing data for LightGBM LTR format...")
    training_data = training_data.sort_values(by='query').reset_index(drop=True)
    group_counts = training_data.groupby('query').size().to_list()

    # --- UPDATED: Final feature list to match your image ---
    feature_columns = [
        # User/Session Features
        'avg_price_session_clicks', # Corresponds to avg_price_last_k_clicks
        'session_length',
        # Product Features
        'current_price',            # Corresponds to price
        'offer_strength_numeric',   # Corresponds to offer_strength
        'rating',
        'is_f_assured',
        # Interaction Features
        'ner_brand_match',          # Corresponds to brand_match
        'price_gap_to_avg',
        'offer_preference_match',
        'semantic_similarity'
    ]

    training_data['is_f_assured'] = training_data['is_f_assured'].astype(int)
    X = training_data[feature_columns]
    y = training_data['relevance']

    # --- Step 4: Train the LTR Model ---
    print("Step 4: Training the LTR model...")
    ranker = lgb.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
    )
    ranker.fit(
        X, y, group=group_counts,
        eval_set=[(X, y)],
        eval_group=[group_counts],
        eval_at=[5],
        callbacks=[lgb.early_stopping(10, verbose=True)]
    )

    # --- Step 5: Save the Model and Show Importances ---
    print(f"Step 5: Saving the trained model to '{OUTPUT_MODEL_PATH}'...")
    ranker.booster_.save_model(OUTPUT_MODEL_PATH)
    print("✅ Model training complete and saved successfully!")

    print("\n--- Learned Feature Importances ---")
    importance_df = pd.DataFrame({
        'feature': ranker.feature_name_,
        'importance': ranker.feature_importances_
    }).sort_values('importance', ascending=False)
    print(importance_df)

if __name__ == "__main__":
    train_ranking_model()

Step 1: Loading the master training dataset...
✅ Master dataset with 10000 rows loaded successfully.
Step 2: Engineering features from the master file...
Step 3: Preparing data for LightGBM LTR format...


KeyError: "['ner_brand_match', 'semantic_similarity'] not in index"

In [None]:
# test_live_model.py
import pandas as pd
import numpy as np
import lightgbm as lgb
import re
import random
import time
from typing import List, Dict

class LiveSearchRanker:
    """
    Loads the trained LTR model and uses it to rank products for a live search query.
    """
    def __init__(self, model_path: str, product_catalog_path: str, realtime_info_path: str):
        print("🚀 Initializing Live Search Ranker...")
        # Load the "brain" you trained
        self.model = lgb.Booster(model_file=model_path)

        # --- FIX IS HERE ---
        # It's a method call `feature_name()`, not an attribute `feature_name_`
        self.feature_names = self.model.feature_name()
        # -------------------

        print(f"✅ LTR model loaded from '{model_path}'.")

        # Load data needed for feature engineering
        self.products_df = pd.read_csv(product_catalog_path)
        realtime_df = pd.read_csv(realtime_info_path)

        # Pre-process and merge all necessary real-time info
        self.products_df = pd.merge(
            self.products_df,
            realtime_df[['product_id', 'current_price', 'rating', 'review_count', 'offer_strength']],
            on='product_id',
            how='left'
        )
        self.products_df['current_price'].fillna(self.products_df['price'], inplace=True)
        self.products_df.fillna(0, inplace=True)
        print("✅ Product catalog and real-time data loaded.")

    def get_candidates(self, query: str) -> List[Dict]:
        """Simulates retrieving the top 100 candidates from FAISS/search."""
        query_keywords = set(re.findall(r'\w+', query.lower()))
        mask = self.products_df[['title', 'brand', 'category', 'subcategory']].apply(
            lambda col: col.str.contains('|'.join(query_keywords), case=False, na=False)
        ).any(axis=1)
        relevant_products = self.products_df[mask]

        # Take up to 100 relevant candidates
        candidates_df = relevant_products.head(100)
        return candidates_df.to_dict('records')

    def rank(self, query: str, session: Dict, context: Dict):
        """Executes the full ranking pipeline."""
        candidates = self.get_candidates(query)
        if not candidates:
            return [], {}

        feature_list = []
        for product in candidates:
            # Generate features for each candidate on the fly
            features = self._extract_features(query, product, session, context)
            feature_list.append(features)

        features_df = pd.DataFrame(feature_list)[self.feature_names]
        scores = self.model.predict(features_df)

        for i, product in enumerate(candidates):
            product['ranking_score'] = scores[i]

        ranked_results = sorted(candidates, key=lambda p: p['ranking_score'], reverse=True)
        return ranked_results

    def _extract_features(self, query: str, product: dict, session: dict, context: dict) -> dict:
        """Calculates features for a single product, matching the training script."""
        features = {}

        features['is_f_assured'] = 1 if product.get("is_f_assured", False) else 0
        features['rating'] = product.get("rating", 0)
        features['review_count'] = np.log1p(product.get("review_count", 0))
        features['current_price'] = product.get("current_price", 0)
        features['frequency'] = np.log1p(context.get("query_frequency", 0))
        features['session_length'] = session.get("session_length", 1)

        last_k_prices = session.get("last_k_click_prices", [])
        features['avg_price_session_clicks'] = np.mean(last_k_prices) if last_k_prices else 0

        offer_map = {'Low': 1, 'Medium': 2, 'High': 3}
        features['offer_strength_numeric'] = offer_map.get(product.get('offer_strength'), 0)

        persona = context.get('persona_tag', 'budget_friendly')
        features['offer_preference_match'] = 1 if persona == 'offer_seeker' and features['offer_strength_numeric'] == 3 else 0

        features['brand_match_in_query'] = 1 if str(product.get("brand", "")).lower() in query.lower() else 0
        features['price_gap_to_avg'] = features['current_price'] - features['avg_price_session_clicks']

        query_words = set(query.lower().split())
        category_words = set(str(product.get('category', '')).lower().split())
        features['category_match_score'] = 1.0 if query_words.intersection(category_words) else 0.0

        return features

# ===================================================================
# --- Main execution block to run a live search test ---
# ===================================================================
if __name__ == "__main__":
    # --- Configuration ---
    PRODUCT_CATALOG_PATH = "/content/product_catalog.csv"
    REALTIME_INFO_PATH = "/content/realtime_product_info.csv"
    MODEL_PATH = "./lgbm_model.txt"

    live_ranker = LiveSearchRanker(
        model_path=MODEL_PATH,
        product_catalog_path=PRODUCT_CATALOG_PATH,
        realtime_info_path=REALTIME_INFO_PATH
    )

    # --- List of queries to test ---
    queries_to_test = [
        "watches for women"
    ]

    # --- Simulate a user context for the test ---
    mock_session = {
        "session_length": 12,
        "last_k_click_prices": [12000, 15000, 13500]
    }
    mock_context = {
        "query_frequency": 75,
        "persona_tag": "budget_friendly"
    }

    # Loop through and test each query
    for query in queries_to_test:
        print(f"\n{'='*25}\nExecuting Live Search for: '{query}'\n{'='*25}")

        start_time = time.perf_counter()
        final_ranked_list = live_ranker.rank(query, mock_session, mock_context)
        end_time = time.perf_counter()

        print(f"\n--- ✅ Final Top 5 Ranked Products (Time: {end_time - start_time:.4f}s) ---")
        if final_ranked_list:
            for rank, product in enumerate(final_ranked_list[:5], 1):
                print(f"  #{rank}: {product.get('title')} (Score: {product['ranking_score']:.4f})")
        else:
            print("  No relevant products found.")

🚀 Initializing Live Search Ranker...
✅ LTR model loaded from './lgbm_model.txt'.
✅ Product catalog and real-time data loaded.

Executing Live Search for: 'watches for women'


KeyError: "['persona_tag', 'avg_price_last_k_clicks', 'preferred_brands_count', 'query_frequency', 'price', 'brand', 'click_count', 'brand_match', 'semantic_similarity'] not in index"

In [None]:
import pandas as pd
import numpy as np
import random
import json

random.seed(42)
np.random.seed(42)

# --- CONFIGURE ---
NUM_USERS = 300   # Unique users/sessions
SESSIONS_PER_USER = 2
QUERIES_PER_SESSION = 2
PRODUCTS_PER_QUERY = 10  # Candidates per query

persona_types = ["brand_loyalist", "budget_friendly", "offer_seeker"]
brands = ["Nike", "Samsung", "Puma", "Alisha", "Mi", "Levi's"]
categories = ["Clothing", "Electronics", "Shoes", "Accessories", "Home"]
offer_strengths = ["Low", "Medium", "High"]
event_types = ["clicked", "viewed", "add_to_cart"]
queries = ["red shoes", "wireless headphones", "laptop sleeve", "cotton kurti", "puma t-shirt"]

def random_brand(query):
    # Brand match 40% if brand in query, else random
    for b in brands:
        if b.lower() in query.lower():
            return b
    return random.choice(brands)

def random_intent_vector():
    # 128-dim float vector as string, normalization so norm=1
    vec = np.random.randn(128)
    vec /= np.linalg.norm(vec)
    return json.dumps(vec.tolist())

def random_title(brand, prodcat):
    words = [brand, random.choice(["Pro", "Max", "Classic", "Ultra"]), prodcat]
    return " ".join(words)

def random_tags():
    options = [
        "lightweight", "portable", "fashion", "durable", "eco-friendly",
        "wireless", "summer", "trendy", "stylish"
    ]
    return ", ".join(random.sample(options, random.randint(2, 5)))

def semantic_similarity(query, title):
    # Simulate similar titles giving higher similarity to query
    return round(0.9 if any(word in title.lower() for word in query.lower().split())
                 else random.uniform(0.3, 0.8), 3)

# For counters/frequency
query_freq = {q: random.randint(50, 300) for q in queries}
user_brand_pref = {}

rows = []
for user_id in range(NUM_USERS):
    persona = random.choice(persona_types)
    # User brand affinity (for preferred_brands stat)
    pref_brands = list(np.random.choice(brands, random.randint(1,3), replace=False))
    user_brand_pref[user_id] = pref_brands
    for session in range(SESSIONS_PER_USER):
        # Simulate a history of K clicks/prices for avg calc
        session_click_prices = [random.uniform(200, 20000) for _ in range(random.randint(1,6))]
        for q in range(QUERIES_PER_SESSION):
            query = random.choice(queries)
            query_intent = random_intent_vector()
            session_length = random.randint(2, 12)
            event = random.choice(event_types)
            q_freq = query_freq[query]
            for prod_position in range(PRODUCTS_PER_QUERY):
                brand = random_brand(query)
                price = random.randint(250, 30000)
                offer_strength = random.choice(offer_strengths)
                is_f_assured = random.choice([0, 1])
                rating = round(random.uniform(1.7, 4.9), 2)
                # PRODUCT feature: count how often clicked (simulate)
                click_count = random.randint(0, 3000)
                # PRODUCT title/title sim
                prodcat = random.choice(categories)
                title = random_title(brand, prodcat)
                tags = random_tags()
                review_count = random.randint(0, 6000)
                # INTERACTION
                brand_match = int(brand.lower() in query.lower())
                # OFFER preference match: persona logic
                offer_map = {"Low":1, "Medium":2, "High":3}
                offer_strength_num = offer_map[offer_strength]
                offer_preference_match = int((persona == "offer_seeker") and (offer_strength_num == 3))
                price_gap_to_avg = price - np.mean(session_click_prices)

                # avg_price_last_k_clicks
                avg_price_last_k = np.mean(session_click_prices)
                preferred_brands = ",".join(pref_brands)
                # RELEVANCE: simulate as 2 (high) if clicked/add_to_cart/brand match, else 0/1
                if event == "clicked" or (brand_match and event in ["clicked", "add_to_cart"]):
                    relevance = random.choices([2,3], weights=[0.7,0.3])[0]
                else:
                    relevance = random.choices([0,1], weights=[0.8,0.2])[0]
                # Historical product price for session (could add randomness)
                row = dict(
                    user_id = user_id,
                    session_id = f"{user_id}-{session}",
                    query = query,
                    persona_tag = persona,
                    avg_price_last_k_clicks = round(avg_price_last_k,2),
                    preferred_brands = preferred_brands,
                    session_length = session_length,
                    query_intent_vector = query_intent,
                    event = event,
                    query_frequency = q_freq,

                    product_id = f"prod-{random.randint(100000,999999)}",
                    price = price,
                    brand = brand,
                    offer_strength = offer_strength,
                    offer_strength_numeric = offer_strength_num,
                    rating = rating,
                    click_count = click_count,
                    is_f_assured = is_f_assured,

                    brand_match = brand_match,
                    price_gap_to_avg = round(price_gap_to_avg,2),
                    offer_preference_match = offer_preference_match,
                    semantic_similarity = semantic_similarity(query, title),

                    # extra useful product features
                    title = title,
                    category = prodcat,
                    subcategory = prodcat,  # for demo
                    description = f"{prodcat} by {brand}. Great for {query}",
                    specifications = json.dumps([{"key":"Material","value":"Cotton"}]),
                    color = random.choice(["Red", "Blue", "Black", "Green"]),
                    cod_available = random.choice([0,1]),
                    return_policy = random.choice(["30-day","15-day","No Return"]),
                    seller_name = random.choice(["GlobalMart", "Flipkart", "TechMall"]),
                    seller_rating = round(random.uniform(3.4,5.0),1),
                    image_url = f"http://img.fake.com/product/{brand}_{prodcat}_{random.randint(111,999)}.jpg",
                    tags = tags,
                    current_price = price - random.randint(0,70),
                    review_count = review_count,
                    frequency = random.randint(1, 10000),
                    relevance = relevance,
                )
                rows.append(row)

df = pd.DataFrame(rows)
df.to_csv("master_data.csv", index=False)
print(f"✅ Wrote master_data.csv with {len(df)} rows and {len(df.columns)} columns.\nSample:")
print(df.head().T)


✅ Wrote master_data.csv with 12000 rows and 38 columns.
Sample:
                                                                         0  \
user_id                                                                  0   
session_id                                                             0-0   
query                                                         puma t-shirt   
persona_tag                                                 brand_loyalist   
avg_price_last_k_clicks                                           14190.39   
preferred_brands                                                      Nike   
session_length                                                           3   
query_intent_vector      [-0.0988951702455008, 0.028364466037864557, 0....   
event                                                          add_to_cart   
query_frequency                                                        120   
product_id                                                     prod-948749   


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json
import warnings
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

PATH_MASTER_DATA = "/content/master_data_with_embeddings.csv"
OUTPUT_MODEL_PATH = "lgbm_model.txt"

# Map offer_strength strings to numeric
OFFER_STRENGTH_MAP = {'Low': 1, 'Medium': 2, 'High': 3}

def parse_embedding_column(embedding_series):
    """
    Parses string-encoded list-of-floats column into numpy 2d array.
    Assumes each row is a JSON list string, e.g. "[0.1, 0.2, ...]" or already list.
    Returns DataFrame of the embeddings with each dimension as a column (embedding_dim features).
    """
    def to_list(x):
        if pd.isna(x):
            return []
        if isinstance(x, str):
            try:
                return json.loads(x)
            except json.JSONDecodeError:
                # fallback: strip and split
                return [float(i) for i in x.strip('[]').split(',') if i.strip()]
        elif isinstance(x, list):
            return x
        else:
            return []

    embeddings = embedding_series.apply(to_list)
    # Determine embedding dimension from first non-empty list
    embedding_dim = 0
    for emb in embeddings:
        if isinstance(emb, list) and len(emb) > 0:
            embedding_dim = len(emb)
            break
    if embedding_dim == 0:
        # no embeddings found
        return pd.DataFrame(np.zeros((len(embedding_series), 1)), columns=['embedding_zero'])
    # Pad or truncate each embedding to embedding_dim
    emb_matrix = np.zeros((len(embedding_series), embedding_dim))
    for i, emb in enumerate(embeddings):
        if len(emb) == embedding_dim:
            emb_matrix[i] = emb
        elif len(emb) < embedding_dim:
            emb_matrix[i, :len(emb)] = emb
        else:
            emb_matrix[i] = emb[:embedding_dim]
    emb_df = pd.DataFrame(emb_matrix, columns=[f'emb_dim_{i}' for i in range(embedding_dim)])
    return emb_df

def prepare_features(df):
    # --- Encode categorical features ---
    # persona_tag and brand
    for col in ['persona_tag', 'brand', 'event']:
        if col in df.columns:
            df[col], _ = pd.factorize(df[col])
        else:
            df[col] = 0  # default if missing

    # preferred_brands: count how many brands the user prefers (comma separated)
    if 'preferred_brands' in df.columns:
        df['preferred_brands_count'] = df['preferred_brands'].fillna("").apply(lambda x: len(str(x).split(',')) if x else 0)
    else:
        df['preferred_brands_count'] = 0

    # Map offer_strength to numeric
    if 'offer_strength' in df.columns:
        df['offer_strength_numeric'] = df['offer_strength'].map(OFFER_STRENGTH_MAP).fillna(0).astype(int)
    else:
        df['offer_strength_numeric'] = 0

    # Convert boolean flags -> int
    for col in ['is_f_assured', 'cod_available']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.lower().map({'true':1, 'false':0, '0':0, '1':1, 'yes':1, 'no':0}).fillna(0).astype(int)
        else:
            df[col] = 0

    # Numeric columns: convert and fillna
    numeric_cols = [
        'avg_price_last_k_clicks',
        'session_length',
        'query_frequency',
        'price',
        'rating',
        'click_count',
        'brand_match',
        'price_gap_to_avg',
        'offer_preference_match',
        'semantic_similarity',
        'review_count',
        'seller_rating',
        'current_price',
        'frequency',
        'offer_strength_numeric',
        'is_f_assured',
        'cod_available'
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        else:
            df[col] = 0

    # --- Handle embedding fields ---

    # query_intent_vector embedding => reduce to mean of components (or alternative aggregate)
    if 'query_intent_vector' in df.columns:
        emb_df = parse_embedding_column(df['query_intent_vector'])
        # append embedding column means as a single float feature (mean vector value)
        df['query_intent_vector_mean'] = emb_df.mean(axis=1)
    else:
        df['query_intent_vector_mean'] = 0

    # product_embedding optional, can do same aggregation or skip
    if 'product_embedding' in df.columns:
        emb_df = parse_embedding_column(df['product_embedding'])
        df['product_embedding_mean'] = emb_df.mean(axis=1)
    else:
        df['product_embedding_mean'] = 0

    # query_intent_similarity: numeric float
    if 'query_intent_similarity' in df.columns:
        df['query_intent_similarity'] = pd.to_numeric(df['query_intent_similarity'], errors='coerce').fillna(0)
    else:
        df['query_intent_similarity'] = 0

    return df

def train_ranking_model():
    print("Loading dataset...")
    try:
        df = pd.read_csv(PATH_MASTER_DATA)
    except FileNotFoundError:
        print(f"❌ ERROR: '{PATH_MASTER_DATA}' not found.")
        return

    print(f"Loaded dataset: {len(df)} rows, {len(df.columns)} columns")

    df = prepare_features(df)

    # Features to use for training
    # Note: Added 'query_intent_vector_mean' and 'query_intent_similarity' per your dataset
    # Also added 'event' encoded and product_embedding_mean
    feature_columns = [
        # Session/User Features
        'persona_tag',
        'avg_price_last_k_clicks',
        'preferred_brands_count',
        'session_length',
        'query_intent_vector_mean',
        'event',
        'query_frequency',

        # Product Features
        'price',
        'brand',
        'offer_strength_numeric',
        'rating',
        'click_count',
        'is_f_assured',

        # Interaction Features
        'brand_match',
        'price_gap_to_avg',
        'offer_preference_match',
        'semantic_similarity',

        # Extras from embeddings
        'query_intent_similarity',
        'product_embedding_mean',
    ]

    missing_feats = [f for f in feature_columns if f not in df.columns]
    if missing_feats:
        print("❌ Missing features from dataset:", missing_feats)
        return

    X = df[feature_columns]
    y = df['relevance'].astype(int)

    # Grouping for LTR by user_id + session_id + query
    if not all(c in df.columns for c in ['user_id', 'session_id', 'query']):
        print("❌ ERROR: 'user_id', 'session_id', or 'query' columns are missing in the dataset.")
        return

    groups = df.groupby(['user_id', 'session_id', 'query']).size().to_list()
    print(f"Number of groups: {len(groups)}; total rows: {len(df)}")

    print("Training LightGBM Ranker...")
    ranker = lgb.LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
    )

    ranker.fit(
        X, y,
        group=groups,
        eval_set=[(X, y)],
        eval_group=[groups],
        eval_at=[5],
        callbacks=[lgb.early_stopping(10, verbose=True)]
    )

    print(f"Saving model to '{OUTPUT_MODEL_PATH}'")
    ranker.booster_.save_model(OUTPUT_MODEL_PATH)

    print("\nFeature importances:")
    feat_imp = pd.DataFrame({
        'feature': ranker.feature_name_,
        'importance': ranker.feature_importances_
    }).sort_values(by='importance', ascending=False)
    print(feat_imp)

if __name__ == "__main__":
    train_ranking_model()


Loading dataset...
Loaded dataset: 12000 rows, 40 columns
Number of groups: 1077; total rows: 12000
Training LightGBM Ranker...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2246
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 19
Saving model to 'lgbm_model.txt'

Feature importances:
                     feature  importance
10                    rating        1629
11               click_count        1600
17   query_intent_similarity        1513
7                      price        1394
18    product_embedding_mean        1220
16       semantic_similarity        1201
14          price_gap_to_avg        1189
4   query_intent_vector_mean        1172
1    avg_price_last_k_clicks        1015
5                      event         806
3            

In [None]:
import pandas as pd
import lightgbm as lgb

# Paths (adjust if needed)
MODEL_PATH = "lgbm_model.txt"
DATA_PATH = "/content/master_data_with_embeddings.csv"

# Features used in the model, please keep consistent with your training script
feature_columns = [
    'persona_tag',
    'avg_price_last_k_clicks',
    'preferred_brands_count',
    'session_length',
    'query_frequency',
    'price',
    'brand',
    'offer_strength_numeric',
    'rating',
    'click_count',
    'is_f_assured',
    'brand_match',
    'price_gap_to_avg',
    'offer_preference_match',
    'semantic_similarity',
    'query_intent_vector_mean',          # included if you used embedding mean feature
    'event',                            # encoded event info
    'query_intent_similarity',
    'product_embedding_mean'
]

def encode_categorical_features(df):
    for col in ['persona_tag', 'brand', 'event']:
        if col in df.columns:
            df[col], _ = pd.factorize(df[col])
        else:
            df[col] = 0
    return df

def prepare_features(df):
    df['preferred_brands_count'] = df['preferred_brands'].fillna("").apply(lambda x: len(str(x).split(',')) if x else 0)

    # Ensure numeric columns are numeric and fill missing with 0
    for col in feature_columns:
        if col not in df.columns:
            raise KeyError(f"Missing feature column: {col}")
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    return df

def test_user_query(query_str, user_id=None, session_id=None, top_k=10):
    # Load dataset
    data = pd.read_csv(DATA_PATH)

    # Filter data for the given query, optionally filter user_id/session_id
    if user_id is not None and session_id is not None:
        df_filtered = data[
            (data['query'] == query_str) &
            (data['user_id'] == user_id) &
            (data['session_id'] == session_id)
        ].copy()
    else:
        # Just filter by query string across all users/sessions
        df_filtered = data[data['query'] == query_str].copy()

    if df_filtered.empty:
        print(f"No data found for query: '{query_str}'")
        return

    # Encode categoricals as in training
    df_filtered = encode_categorical_features(df_filtered)

    # Prepare features
    df_filtered = prepare_features(df_filtered)

    # Extract feature matrix
    X_test = df_filtered[feature_columns]

    # Load LightGBM model
    booster = lgb.Booster(model_file=MODEL_PATH)

    # Predict scores
    scores = booster.predict(X_test)
    df_filtered['score'] = scores

    # Rank products descending
    df_filtered['rank'] = df_filtered['score'].rank(method='first', ascending=False)

    # Show top K products
    top_products = df_filtered.sort_values('rank').head(top_k)

    # Define columns to display — adjust as needed
    display_cols = ['product_id', 'title', 'brand', 'price', 'score', 'rank']
    print(f"\nTop {top_k} products for user query: '{query_str}'\n")
    print(top_products[display_cols].to_string(index=False))

if __name__ == "__main__":
    # Example: input any query you want to test
    user_query = input("Enter user query to test top 10 products: ").strip()
    test_user_query(user_query)


Enter user query to test top 10 products: watches for women
No data found for query: 'watches for women'


In [None]:
import pandas as pd
import numpy as np
import random
import json
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

random.seed(42)
np.random.seed(42)

# --- CONFIGURATION ---
NUM_USERS = 1000              # Number of users/sessions
SESSIONS_PER_USER = 2
QUERIES_PER_SESSION = 2
PRODUCTS_PER_QUERY = 20       # Candidates per query

EMBEDDING_DIM = 384           # Use consistent embedding dim

# Predefined categorical values (expand as per Flipkart catalog)
persona_tags = ["brand_loyalist", "budget_friendly", "offer_seeker"]
brands = ["Nike", "Samsung", "Puma", "Alisha", "Mi", "Levi's",
          "Sony", "Apple", "Adidas", "Reebok", "Dell", "HP", "Fossil"]
categories = ["Clothing", "Electronics", "Footwear", "Accessories", "Home", "Sports"]
subcategories = {
    "Clothing": ["Men's Clothing", "Women's Clothing", "Kids' Clothing"],
    "Electronics": ["Mobiles", "Laptops", "Headphones", "Cameras"],
    "Footwear": ["Sports Shoes", "Formal Shoes", "Sandals"],
    "Accessories": ["Watches", "Sunglasses", "Handbags"],
    "Home": ["Kitchen Appliances", "Furniture", "Decor"],
    "Sports": ["Sportswear", "Fitness Equipment"]
}
offer_strengths = ["Low", "Medium", "High"]
events = ["clicked", "viewed", "add_to_cart"]
return_policies = ["30-day return", "15-day return", "No Return"]
colors = ["Red", "Blue", "Black", "Green", "White", "Yellow", "Pink", "Silver", "Navy"]

queries = [  # Expand or generate realistic queries
    "red shoes", "wireless headphones", "laptop sleeve", "cotton kurti",
    "puma t-shirt", "smartphone under 15000", "men's watches", "kids sneakers",
    "gaming laptop", "bluetooth speaker", "running shoes", "formal shirt",
    "wireless mouse", "fitness tracker", "women's handbags", "kitchen mixer",
    "office chair", "LED TV", "digital camera", "summer dress"
]

# Initialize SentenceTransformer once (avoid multiple init)
model_name = 'all-MiniLM-L6-v2'
print(f"Loading embedding model '{model_name}' once ...")
embed_model = SentenceTransformer(model_name)

def generate_embedding(texts, target_dim=EMBEDDING_DIM):
    # Use model to generate embeddings, with batch encoding for speed
    embeddings = embed_model.encode(texts, show_progress_bar=False)
    # If model embedding dim differs from target, reduce
    if embeddings.shape[1] != target_dim:
        from sklearn.decomposition import PCA
        pca = PCA(n_components=target_dim)
        embeddings = pca.fit_transform(embeddings)
    return embeddings

def generate_query_intent_vector(query):
    # For simplicity, embed the query string similarly
    embedding = embed_model.encode([query], show_progress_bar=False)[0]
    # Return list for JSON storage
    return embedding.tolist()

def generate_product_title(brand, category):
    descriptors = ["Pro", "Max", "Classic", "Ultra", "New", "Eco", "Sport", "Lite"]
    return f"{brand} {random.choice(descriptors)} {category}"

def generate_specifications(category):
    spec_samples = {
        "Material": ["Cotton", "Leather", "Plastic", "Metal", "Wood"],
        "Warranty": ["1 year", "2 years", "No warranty", "6 months"],
        "Power": [f"{x}W" for x in [50, 100, 200, 300, 500]],
        "Pattern": ["Solid", "Printed", "Striped", "Checked"],
        "Battery": ["Yes", "No"],
    }
    chosen_specs = []
    keys = list(spec_samples.keys())
    num_specs = random.randint(2, 4)
    random.shuffle(keys)
    for k in keys[:num_specs]:
        v = random.choice(spec_samples[k])
        chosen_specs.append({"key": k, "value": v})
    return json.dumps(chosen_specs)

def map_offer_strength(offer):
    return {"Low": 1, "Medium": 2, "High": 3}.get(offer, 0)

def generate_user_brand_preferences():
    # Return a comma-separated string of 1-3 favorite brands
    count = random.randint(1, 3)
    return ",".join(random.sample(brands, count))

def generate_row(user_id, session_id, query, persona_tag, preferred_brands, product_i):
    # Choose brand based on persona sometimes for realism
    if persona_tag == "brand_loyalist" and random.random() < 0.7:
        brand = random.choice(preferred_brands.split(","))
    else:
        brand = random.choice(brands)

    category = random.choice(categories)
    subcategory = random.choice(subcategories[category])

    title = generate_product_title(brand, category)
    price = round(random.uniform(200, 60000), 2)
    discount = random.uniform(0, 0.4)
    current_price = round(price * (1 - discount), 2)
    rating = round(random.uniform(1.0, 5.0), 2)
    review_count = random.randint(0, 5000)
    is_f_assured = random.choice([0, 1])
    cod_available = random.choice([0, 1])
    return_policy = random.choice(return_policies)
    seller_name = random.choice(["Flipkart Retail", "GlobalMart", "FashionHub", "TechWorld", "ShoePalace"])
    seller_rating = round(random.uniform(3.0, 5.0), 2)
    offer_strength = random.choice(offer_strengths)
    offer_strength_numeric = map_offer_strength(offer_strength)
    click_count = random.randint(0, 5000)
    session_length = random.randint(1, 15)
    event = random.choice(events)
    query_frequency = random.randint(10, 500)
    brand_match = int(brand.lower() in query.lower())
    avg_price_last_k_clicks = round(random.uniform(100, 20000), 2)
    price_gap_to_avg = round(current_price - avg_price_last_k_clicks, 2)
    offer_preference_match = int((persona_tag == "offer_seeker") and (offer_strength == "High"))
    color = random.choice(colors)
    frequency = random.randint(1, 10000)

    # query_intent_vector, product_embedding (embeddings)
    query_intent_vector = generate_query_intent_vector(query)

    product_emb_raw = embed_model.encode([title], show_progress_bar=False)[0]

    # Normalize embeddings for cosine similarity calculation
    q_vec_norm = normalize(np.array(query_intent_vector).reshape(1, -1))[0]
    p_vec_norm = normalize(product_emb_raw.reshape(1, -1))[0]
    # Cosine similarity
    semantic_similarity = float(np.dot(q_vec_norm, p_vec_norm))

    # Randomly generate product_id unique per product
    product_id = f"prod-{user_id}-{session_id}-{product_i}-{random.randint(1000,9999)}"

    # Compose the row dictionary exactly matching your columns & formats
    row = {
        "user_id": user_id,
        "session_id": f"{user_id}-{session_id}",
        "query": query,
        "persona_tag": persona_tag,
        "avg_price_last_k_clicks": avg_price_last_k_clicks,
        "preferred_brands": preferred_brands,
        "session_length": session_length,
        "query_intent_vector": json.dumps(query_intent_vector),
        "event": event,
        "query_frequency": query_frequency,

        "product_id": product_id,
        "price": price,
        "brand": brand,
        "offer_strength": offer_strength,
        "offer_strength_numeric": offer_strength_numeric,
        "rating": rating,
        "click_count": click_count,
        "is_f_assured": is_f_assured,

        "brand_match": brand_match,
        "price_gap_to_avg": price_gap_to_avg,
        "offer_preference_match": offer_preference_match,
        "semantic_similarity": semantic_similarity,

        "title": title,
        "category": category,
        "subcategory": subcategory,
        "description": f"{category} item by {brand} for {query}",
        "specifications": generate_specifications(category),
        "color": color,
        "cod_available": cod_available,
        "return_policy": return_policy,
        "seller_name": seller_name,
        "seller_rating": seller_rating,
        "image_url": f"http://img.fake.com/product/{product_id}.jpg",
        "tags": ", ".join(random.sample(["lightweight", "portable", "stylish", "durable", "eco-friendly", "wireless", "trendy", "fashion", "sports"], 3)),
        "current_price": current_price,
        "review_count": review_count,
        "frequency": frequency,

        # Embeddings for model
        "product_embedding": json.dumps(product_emb_raw.tolist()),
        "query_intent_similarity": semantic_similarity,

        # Relevance is simulated: give higher relevance to brand_match & clicked/add_to_cart
        "relevance": int( (event == "clicked" or event == "add_to_cart" or brand_match) + random.choice([0,1]) )
    }

    return row

def generate_large_dataset():
    print("Starting dataset generation...")
    rows = []
    for user_id in range(NUM_USERS):
        for session_id_num in range(SESSIONS_PER_USER):
            session_id_str = session_id_num
            for query_num in range(QUERIES_PER_SESSION):
                query = random.choice(queries)
                persona_tag = random.choice(persona_tags)
                preferred_brands = generate_user_brand_preferences()
                for product_i in range(PRODUCTS_PER_QUERY):
                    row = generate_row(user_id, session_id_str, query, persona_tag, preferred_brands, product_i)
                    rows.append(row)

                if (len(rows) % 5000) == 0:
                    print(f"Generated {len(rows)} rows so far...")

    df = pd.DataFrame(rows)
    print(f"Generated dataset with shape: {df.shape}")

    # Save to CSV (big file)
    df.to_csv("master_data_large.csv", index=False)
    print("Saved to master_data_large.csv")

if __name__ == "__main__":
    generate_large_dataset()


Loading embedding model 'all-MiniLM-L6-v2' once ...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Starting dataset generation...
Generated 5000 rows so far...
Generated 10000 rows so far...
Generated 15000 rows so far...
Generated 20000 rows so far...
Generated 25000 rows so far...
Generated 30000 rows so far...
Generated 35000 rows so far...
Generated 40000 rows so far...
Generated 45000 rows so far...
Generated 50000 rows so far...
Generated 55000 rows so far...
Generated 60000 rows so far...
Generated 65000 rows so far...
Generated 70000 rows so far...
Generated 75000 rows so far...
Generated 80000 rows so far...
Generated dataset with shape: (80000, 40)
Saved to master_data_large.csv


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json
import warnings

warnings.filterwarnings('ignore')

PATH_MASTER_DATA = "master_data_large.csv"  # Use your large dataset CSV here
OUTPUT_MODEL_PATH = "lgbm_model_large.txt"

# Mapping for offer_strength
OFFER_STRENGTH_MAP = {'Low': 1, 'Medium': 2, 'High': 3}

def parse_embedding_column(embedding_series):
    """
    Parses stringified list embeddings into numpy arrays,
    then returns a DataFrame of embedding dimensions.
    """
    def to_list(x):
        if pd.isna(x):
            return []
        if isinstance(x, str):
            try:
                return json.loads(x)
            except json.JSONDecodeError:
                # Fallback: parse simple string list
                x = x.strip("[]")
                return [float(i) for i in x.split(',') if i]
        elif isinstance(x, list):
            return x
        else:
            return []

    embeddings = embedding_series.apply(to_list)
    embedding_dim = 0
    for emb in embeddings:
        if isinstance(emb, list) and len(emb) > 0:
            embedding_dim = len(emb)
            break

    if embedding_dim == 0:
        # no valid embeddings
        return pd.DataFrame(np.zeros((len(embedding_series), 1)), columns=['embedding_zero'])

    emb_matrix = np.zeros((len(embedding_series), embedding_dim))
    for i, emb in enumerate(embeddings):
        if len(emb) == embedding_dim:
            emb_matrix[i] = emb
        elif len(emb) < embedding_dim:
            emb_matrix[i, :len(emb)] = emb
        else:
            emb_matrix[i] = emb[:embedding_dim]

    return pd.DataFrame(emb_matrix, columns=[f'emb_dim_{i}' for i in range(embedding_dim)])

def prepare_features(df):
    # Encode categorical variables
    for col in ['persona_tag', 'brand', 'event']:
        if col in df.columns:
            df[col], _ = pd.factorize(df[col])
        else:
            df[col] = 0

    # preferred_brands: count number of preferred brands (comma-separated)
    if 'preferred_brands' in df.columns:
        df['preferred_brands_count'] = df['preferred_brands'].fillna("").apply(
            lambda x: len(str(x).split(',')) if x else 0)
    else:
        df['preferred_brands_count'] = 0

    # Map offer_strength string to numeric
    if 'offer_strength' in df.columns:
        df['offer_strength_numeric'] = df['offer_strength'].map(OFFER_STRENGTH_MAP).fillna(0).astype(int)
    else:
        df['offer_strength_numeric'] = 0

    # Convert is_f_assured and cod_available to int
    for bcol in ['is_f_assured', 'cod_available']:
        if bcol in df.columns:
            df[bcol] = df[bcol].astype(str).str.lower().map({'true':1, 'false':0, '0':0, '1':1, 'yes':1, 'no':0}).fillna(0).astype(int)
        else:
            df[bcol] = 0

    # Numeric columns to ensure numeric dtype and fillna=0
    numeric_cols = [
        'avg_price_last_k_clicks',
        'preferred_brands_count',
        'session_length',
        'query_frequency',
        'price',
        'rating',
        'click_count',
        'brand_match',
        'price_gap_to_avg',
        'offer_preference_match',
        'semantic_similarity',
        'review_count',
        'seller_rating',
        'current_price'
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        else:
            df[col] = 0

    # Parse query_intent_vector embeddings and reduce to mean float column
    if 'query_intent_vector' in df.columns:
        emb_df = parse_embedding_column(df['query_intent_vector'])
        df['query_intent_vector_mean'] = emb_df.mean(axis=1)
    else:
        df['query_intent_vector_mean'] = 0

    # Parse product_embedding and reduce to mean float column
    if 'product_embedding' in df.columns:
        emb_df = parse_embedding_column(df['product_embedding'])
        df['product_embedding_mean'] = emb_df.mean(axis=1)
    else:
        df['product_embedding_mean'] = 0

    # query_intent_similarity numeric
    if 'query_intent_similarity' in df.columns:
        df['query_intent_similarity'] = pd.to_numeric(df['query_intent_similarity'], errors='coerce').fillna(0)
    else:
        df['query_intent_similarity'] = 0

    return df

def train_ranking_model():
    print("Loading dataset...")
    try:
        df = pd.read_csv(PATH_MASTER_DATA)
    except FileNotFoundError:
        print(f"❌ ERROR: '{PATH_MASTER_DATA}' not found.")
        return

    print(f"Loaded dataset: {len(df)} rows, {len(df.columns)} columns.")

    df = prepare_features(df)

    # List of features as per your defined Session/User, Product, Interaction features:
    feature_columns = [
        # Session/User Features
        'persona_tag',
        'avg_price_last_k_clicks',
        'preferred_brands_count',
        'session_length',
        'query_intent_vector_mean',
        'event',

        'query_frequency',

        # Product Features
        'price',
        'brand',
        'offer_strength_numeric',
        'rating',
        'click_count',
        'is_f_assured',

        # Interaction Features
        'brand_match',
        'price_gap_to_avg',
        'offer_preference_match',
        'semantic_similarity',

        # Extra embeddings and similarity features
        'query_intent_similarity',
        'product_embedding_mean'
    ]

    missing_features = [f for f in feature_columns if f not in df.columns]
    if missing_features:
        print("❌ Missing features from dataset:", missing_features)
        return

    X = df[feature_columns]
    y = df['relevance'].astype(int)

    # Ensure grouping columns exist
    for grp_col in ['user_id', 'session_id', 'query']:
        if grp_col not in df.columns:
            print(f"❌ ERROR: Column '{grp_col}' missing in dataset, cannot create groups.")
            return

    groups = df.groupby(['user_id', 'session_id', 'query']).size().to_list()
    print(f"Number of groups: {len(groups)} | Total rows: {len(df)}")

    print("Training LightGBM LTR ranker...")
    ranker = lgb.LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
    )

    ranker.fit(
        X,
        y,
        group=groups,
        eval_set=[(X, y)],
        eval_group=[groups],
        eval_at=[5],
        callbacks=[lgb.early_stopping(10, verbose=True)]
    )

    print(f"Saving trained model to '{OUTPUT_MODEL_PATH}'")
    ranker.booster_.save_model(OUTPUT_MODEL_PATH)

    print("\nFeature importances:")
    feat_imp = pd.DataFrame({'feature': ranker.feature_name_, 'importance': ranker.feature_importances_})
    print(feat_imp.sort_values('importance', ascending=False))

if __name__ == "__main__":
    train_ranking_model()


Loading dataset...
Loaded dataset: 80000 rows, 40 columns.
Number of groups: 3896 | Total rows: 80000
Training LightGBM LTR ranker...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2355
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 19
Saving trained model to 'lgbm_model_large.txt'

Feature importances:
                     feature  importance
10                    rating        1579
11               click_count        1515
1    avg_price_last_k_clicks        1486
18    product_embedding_mean        1483
6            query_frequency        1476
16       semantic_similarity        1432
7                      price        1270
14          price_gap_to_avg        1180
4   query_intent_vector_mean         732
3             session_length      

In [None]:
import pandas as pd
import lightgbm as lgb

# Path configs - adjust if needed
DATA_PATH = "master_data_large.csv"        # Your full dataset CSV path
MODEL_PATH = "lgbm_model_large.txt"        # Your trained LightGBM model path

# Features list used in training (must exactly match your trained model)
FEATURE_COLUMNS = [
    'persona_tag',
    'avg_price_last_k_clicks',
    'preferred_brands_count',
    'session_length',
    'query_intent_vector_mean',
    'event',
    'query_frequency',
    'price',
    'brand',
    'offer_strength_numeric',
    'rating',
    'click_count',
    'is_f_assured',
    'brand_match',
    'price_gap_to_avg',
    'offer_preference_match',
    'semantic_similarity',
    'query_intent_similarity',
    'product_embedding_mean'
]

def encode_categorical(df):
    # Factorize categoricals exactly as training
    for col in ['persona_tag', 'brand', 'event']:
        if col in df.columns:
            df[col], _ = pd.factorize(df[col])
        else:
            df[col] = 0
    return df

def prepare_features(df):
    # Compute preferred_brands_count from the comma-separated brands string
    if 'preferred_brands' in df.columns:
        df['preferred_brands_count'] = df['preferred_brands'].fillna("").apply(lambda x: len(str(x).split(',')) if x else 0)
    else:
        df['preferred_brands_count'] = 0

    # Ensure all feature columns are present and numeric
    missing_cols = [col for col in FEATURE_COLUMNS if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing feature columns: {missing_cols}")

    for col in FEATURE_COLUMNS:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    return df

def predict_top_products_for_query(query_str, top_k=10):
    # Load data and model
    data = pd.read_csv(DATA_PATH)
    model = lgb.Booster(model_file=MODEL_PATH)

    # Filter candidate rows where 'query' matches input
    candidates = data[data['query'] == query_str].copy()

    if candidates.empty:
        print(f"No products found for query: '{query_str}'")
        return

    # Preprocess features consistent with training
    candidates = encode_categorical(candidates)
    candidates = prepare_features(candidates)

    # Prepare features for prediction
    X_test = candidates[FEATURE_COLUMNS]

    # Predict scores and rank
    preds = model.predict(X_test)
    candidates['score'] = preds
    candidates['rank'] = candidates['score'].rank(ascending=False, method='first')

    # Sort and show top_k results
    top_results = candidates.sort_values('rank').head(top_k)

    # Columns to display for user-friendliness
    display_cols = ['product_id', 'title', 'brand', 'price', 'score', 'rank']

    print(f"\nTop {top_k} products for query: '{query_str}'\n")
    print(top_results[display_cols].to_string(index=False))

if __name__ == "__main__":
    user_query = input("Enter user query to fetch top 10 products: ").strip()
    predict_top_products_for_query(user_query)


Enter user query to fetch top 10 products: watch for women
No products found for query: 'watch for women'


In [None]:
import pandas as pd

# Load your large dataset
df = pd.read_csv('master_data_large.csv')

# Search for 'watch' substring in 'title' or 'category' columns (case insensitive)
mask_title = df['title'].str.contains('watch', case=False, na=False)
mask_category = df['category'].str.contains('watch', case=False, na=False)

# Combine the masks
watch_products = df[mask_title | mask_category]

print(f"Total products containing 'watch' in title or category: {len(watch_products)}")

# Show some sample rows if available
if len(watch_products) > 0:
    print(watch_products[['product_id', 'title', 'brand', 'category', 'price']].head(10))
else:
    print("No watch-related products found in the dataset.")


Total products containing 'watch' in title or category: 0
No watch-related products found in the dataset.


In [None]:
import pandas as pd

# Path to your dataset
path = "master_data_large.csv"

# Load the dataset
df = pd.read_csv(path)

# Get unique queries as a list
unique_queries = df['query'].unique().tolist()

print(f"Total unique queries: {len(unique_queries)}")
print("List of unique queries:")
for q in unique_queries:
    print(q)


Total unique queries: 20
List of unique queries:
red shoes
gaming laptop
fitness tracker
wireless headphones
smartphone under 15000
office chair
running shoes
wireless mouse
digital camera
LED TV
kitchen mixer
cotton kurti
men's watches
women's handbags
kids sneakers
summer dress
laptop sleeve
bluetooth speaker
puma t-shirt
formal shirt


NEW DATASET

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import ast

# Load the product catalog
df = pd.read_csv("product_catalog.csv")
df.fillna("", inplace=True)

# Convert boolean-like fields
def normalize_bool(x):
    return 1 if str(x).strip().lower() in ["yes", "true", "1"] else 0

df["is_f_assured"] = df["is_f_assured"].apply(normalize_bool)
df["cod_available"] = df["cod_available"].apply(normalize_bool)

# Define embedding model (MiniLM is fast and good)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Construct text representation for each product
text_fields = (
    df["title"] + " " +
    df["tags"] + " " +
    df["description"] + " " +
    df["brand"] + " " +
    df["category"] + " " +
    df["subcategory"] + " " +
    df["sub_subcategory"]
).str.strip()

# Generate semantic embeddings
print("🧠 Generating embeddings...")
embeddings = model.encode(text_fields.tolist(), show_progress_bar=True)
normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Save .npy for FAISS
np.save("product_embeddings.npy", normalized_embeddings)

# Add embedding as list column
df["product_embedding"] = [list(vec) for vec in normalized_embeddings]

# Select relevant columns for index
output_df = df[[
    "product_id", "title", "brand", "category", "subcategory", "sub_subcategory",
    "price", "color", "is_f_assured", "cod_available", "return_policy",
    "seller_name", "seller_rating", "tags", "product_embedding"
]]

# Save to CSV
output_df.to_csv("product_vector_index.csv", index=False)
print("✅ Saved product_vector_index.csv and product_embeddings.npy")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🧠 Generating embeddings...


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

✅ Saved product_vector_index.csv and product_embeddings.npy


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import random

# Load required files
query_df = pd.read_csv("user_queries.csv")
product_df = pd.read_csv("product_vector_index.csv")
product_embeddings = np.load("product_embeddings.npy")

# Create a mapping from product_id to vector
product_id_to_index = {pid: idx for idx, pid in enumerate(product_df["product_id"].tolist())}

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Utility: cosine similarity
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Prepare training data
rows = []

print("🔍 Generating training pairs...")
for _, row in tqdm(query_df.iterrows(), total=len(query_df)):
    query_id = row["query_id"]
    raw_query = str(row["raw_query"])
    clicked_pid = str(row.get("clicked_product_id", ""))
    purchased = str(row.get("purchased", "no")).strip().lower() == "yes"
    frequency = row.get("frequency", 1)
    event = row.get("event", "click")

    # For now, assume raw_query = corrected_query
    corrected_query = raw_query

    # Embed the query
    q_embedding = model.encode(corrected_query)
    q_embedding = q_embedding / np.linalg.norm(q_embedding)

    # Positive example
    if clicked_pid in product_id_to_index:
        idx = product_id_to_index[clicked_pid]
        p_embedding = product_embeddings[idx]
        sim = cosine_sim(q_embedding, p_embedding)
        product_row = product_df.iloc[idx]
        rows.append({
            "query_id": query_id,
            "raw_query": raw_query,
            "corrected_query": corrected_query,
            "product_id": clicked_pid,
            "label": 1,
            "click": 1,
            "purchase": int(purchased),
            "embedding_similarity": sim,
            "query_frequency": frequency,
            "query_event": event,
            "product_brand": product_row["brand"],
            "product_category": product_row["category"],
            "product_subcategory": product_row["subcategory"],
            "product_title": product_row["title"],
            "product_price": product_row["price"],
            "is_f_assured": product_row["is_f_assured"],
            "seller_rating": product_row["seller_rating"]
        })

    # Negative examples: Random 3 non-clicked products
    for _ in range(3):
        neg_idx = random.randint(0, len(product_df) - 1)
        neg_pid = product_df.iloc[neg_idx]["product_id"]
        if neg_pid == clicked_pid:
            continue
        p_embedding = product_embeddings[neg_idx]
        sim = cosine_sim(q_embedding, p_embedding)
        product_row = product_df.iloc[neg_idx]
        rows.append({
            "query_id": query_id,
            "raw_query": raw_query,
            "corrected_query": corrected_query,
            "product_id": neg_pid,
            "label": 0,
            "click": 0,
            "purchase": 0,
            "embedding_similarity": sim,
            "query_frequency": frequency,
            "query_event": event,
            "product_brand": product_row["brand"],
            "product_category": product_row["category"],
            "product_subcategory": product_row["subcategory"],
            "product_title": product_row["title"],
            "product_price": product_row["price"],
            "is_f_assured": product_row["is_f_assured"],
            "seller_rating": product_row["seller_rating"]
        })

# Save training dataset
train_df = pd.DataFrame(rows)
train_df.to_csv("query_product_training.csv", index=False)
print("✅ Saved query_product_training.csv")


🔍 Generating training pairs...


100%|██████████| 2158/2158 [00:44<00:00, 48.76it/s]


✅ Saved query_product_training.csv
