In [None]:
!pip install qdrant_client
!pip install langchain_groq
!pip install langchain-community
!pip install sentence_transformers
!pip install langchain-huggingface
!pip install -q transformers accelerate bitsandbytes peft

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.1


In [None]:
# DEPENDENCIES

import json
import random
import pandas as pd

from flask import Flask
from flask import request
from flask import jsonify
from flask import render_template

from langchain_groq import ChatGroq
from langchain.llms import HuggingFacePipeline
from langchain_huggingface import HuggingFacePipeline

from qdrant_client.http import models
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

from huggingface_hub import login
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM

ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/google/colab/_debugpy_repr.py", line 54, in get_shape
    shape = getattr(obj, 'shape', None)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/werkzeug/local.py", line 318, in __get__
    obj = instance._get_current_object()
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/werkzeug/local.py", line 519, in _get_current_object
    raise RuntimeError(unbound_message) from None
RuntimeError: Working outside of request context.

This typically means that you attempted to use functionality that needed
an active HTTP request. Consult the documentation on testing for
information about how to avoid this problem.
ERROR:root:Unexpected exception finding object shape
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/google/colab/_debugpy_repr.py", line 5

In [None]:
# CREDENTIALS
# The values below are placeholders; replace them with real values at runtime
# and avoid committing real secrets together with the notebook.

GROQ_API_KEY              = "YOUR_GROQ_API_KEY"
LLM_MODEL_NAME            = "YOUR_LLM_MODEL_NAME"
HF_LLM_MODEL_NAME         = "YOUR_HF_LLM_MODEL_NAME"
QDRANT_API_KEY            = "YOUR_QDRANT_API_KEY"
QDRANT_CLUSTER_URL        = "YOUR_QDRANT_CLUSTER_URL"
QDRANT_COLLECTION_NAME    = "YOUR_QDRANT_COLLECTION_NAME"
HUGGINGFACE_LOGIN_TOKEN   = "YOUR_HUGGINGFACE_LOGIN_TOKEN"

# LOGIN TOKEN FOR HUGGING FACE
# Pass the token VALUE held by the variable; the quoted name
# "HUGGINGFACE_LOGIN_TOKEN" would be sent to the Hub as if it were the token.
login(HUGGINGFACE_LOGIN_TOKEN)


In [None]:
# Open a connection to the Qdrant cluster with the configured URL and API key
client = QdrantClient(
    url=QDRANT_CLUSTER_URL,
    api_key=QDRANT_API_KEY,
)

In [None]:
# Read the dataset CSV into a pandas DataFrame and preview its first five rows

df = pd.read_csv(filepath_or_buffer='/content/V-1.01_Updated_Fashion_Dataset.csv')
print(df.head(n=5))

In [None]:
# MODEL EMBEDDING
# The sentence encoder is bound to its own name because the global `model` is
# re-bound to the causal language model in the Hugging Face LLM cell further
# down; `get_embedding` must keep using the encoder after that happens.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
model = embedding_model  # backward-compatible alias for code that reads `model` here

def get_embedding(text):
    """
    Encode `text` with the sentence-transformer encoder and return the result of `encode`.
    """
    return embedding_model.encode(text)

# One embedding per row, built from the row's descriptive columns joined by spaces
df['embedding'] = df.apply(lambda row: get_embedding(
    f"{row['name']} {row['size']} {row['Category']} {row['Individual_category']} {row['category_by_Gender']} {row['brand']}"), axis=1)


In [None]:
# VECTOR EMBEDDINGS CONFIGURATIONS
# Positional access (`iloc`) so the first row is used whatever the index labels are
vector_size = len(df['embedding'].iloc[0])

# Create a new collection, dropping any existing one with the same name first.
# This is the replacement for the deprecated `recreate_collection` call.
if client.collection_exists(collection_name = QDRANT_COLLECTION_NAME):
    client.delete_collection(collection_name = QDRANT_COLLECTION_NAME)

client.create_collection(collection_name   = QDRANT_COLLECTION_NAME,
                         vectors_config    = models.VectorParams(size      = vector_size,
                                                                 distance  = models.Distance.COSINE
                                                                 ),
                         )


In [None]:
# Build one metadata payload (plain dict) per DataFrame row

def _build_payload(row):
    """Copy the metadata columns of a single row into a payload dictionary."""
    payload_fields = ("name",
                      "price",
                      "colour",
                      "brand",
                      "img",
                      "ratingCount",
                      "avg_rating",
                      "description",
                      "Category",
                      "Individual_category",
                      "category_by_Gender",
                      "size",
                      )
    return {field: row[field] for field in payload_fields}

payloads = df.apply(_build_payload, axis = 1).tolist()

# Push the embeddings together with their payloads into the collection
client.upload_collection(
    collection_name = QDRANT_COLLECTION_NAME,
    vectors         = df['embedding'].tolist(),
    payload         = payloads,
    ids             = None,  # no explicit ids are supplied; Qdrant generates them
)


In [None]:
# Sanity check: fetch up to five stored points and print the raw scroll response
result = client.scroll(
    collection_name=QDRANT_COLLECTION_NAME,
    limit=5,
)
print(result)

In [None]:
# Fetch up to five points; the second element of the scroll response
# (the next-page offset) is not needed here
scroll_result, _ = client.scroll(
    collection_name=QDRANT_COLLECTION_NAME,
    limit=5,
)

# Print each point's payload followed by a separator line
for point in scroll_result:
    print("Payload:", point.payload, end="\n-----\n")


In [None]:
# SEARCH FUNCTION USING QDRANT CLIENT

def search_collection(colour               : str = "NA",
                      individual_category  : str = "NA",
                      category             : str = "NA",
                      category_by_gender   : str = "NA"
                      ) -> list:
    """
    Return payloads from the Qdrant collection that match the given attributes.

    Every point of the collection is fetched page by page through `client.scroll`
    and the attribute filters are then applied in Python with exact equality.
    An argument left at "NA" does not take part in the filtering.

    Arguments:

        - `colour`                {str, optional}      : Value compared with the payload key `colour`.

        - `individual_category`   {str, optional}      : Value compared with the payload key `Individual_category`.

        - `category`              {str, optional}      : Value compared with the payload key `Category`.

        - `category_by_gender`    {str, optional}      : Value compared with the payload key `category_by_Gender`.

    Returns

        - `results`                   {list}           : Payload dictionaries of all matching points. When no filter
                                                         is given, or when nothing matches, the payloads of up to 10
                                                         randomly sampled points are returned instead.
    """

    # MAP PAYLOAD KEYS TO THE REQUESTED VALUES AND KEEP ONLY THE ACTIVE ONES
    requested  = {"colour"               : colour,
                  "Individual_category"  : individual_category,
                  "Category"             : category,
                  "category_by_Gender"   : category_by_gender
                  }
    wanted     = {key: value for key, value in requested.items() if value != "NA"}

    # DOWNLOAD THE WHOLE COLLECTION, ONE PAGE AT A TIME
    points     = []
    cursor     = None

    while True:
        page, cursor = client.scroll(collection_name  = QDRANT_COLLECTION_NAME,
                                     limit            = 1000,
                                     offset           = cursor
                                     )
        points.extend(page)

        # A FALSY OFFSET MEANS THERE IS NO FURTHER PAGE
        if not cursor:
            break

    def _random_payloads() -> list:
        # PAYLOADS OF UP TO 10 RANDOMLY PICKED POINTS
        picked = random.sample(points, min(10, len(points)))
        return [item.payload for item in picked]

    # NO ACTIVE FILTER: RANDOM SELECTION
    if not wanted:
        return _random_payloads()

    # KEEP THE POINTS WHOSE PAYLOAD AGREES WITH EVERY ACTIVE FILTER
    matches = [item for item in points
               if all(item.payload.get(key) == value for key, value in wanted.items())
               ]

    if matches:
        return [item.payload for item in matches]

    # NOTHING MATCHED: FALL BACK TO A RANDOM SELECTION
    return _random_payloads()

In [None]:
# Create the Groq-hosted chat model using the configured model name and API key
llm = ChatGroq(
    model_name=LLM_MODEL_NAME,
    groq_api_key=GROQ_API_KEY,
    temperature=0.5,
)

In [None]:
# Local Hugging Face LLM: quantised model -> generation pipeline -> LangChain wrapper

# 4-bit NF4 quantisation with double quantisation and bfloat16 compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
)

# Tokenizer that belongs to the configured checkpoint
tokenizer = AutoTokenizer.from_pretrained(HF_LLM_MODEL_NAME)

# Causal LM loaded with the quantisation settings above; device placement is automatic
model = AutoModelForCausalLM.from_pretrained(
    HF_LLM_MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# Sampling text-generation pipeline limited to 512 new tokens
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.5,
    max_new_tokens=512,
)

# Expose the pipeline through the LangChain LLM interface (this re-binds `llm`)
llm = HuggingFacePipeline(pipeline=pipe)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# EXTRACTOR FUNCTION USING SYSTEM PROMPT TECHNIQUE

def extractor(llm : object, conversation_history : str) -> str:
    """
    Ask an LLM to extract structured product attributes from a conversation.

    The conversation text is embedded into a fixed instruction prompt and sent to
    `llm.invoke`. The prompt asks for Category, Individual_category,
    category_by_Gender, colour, a MOVE_ON flag and a FOLLOW_UP_MESSAGE, one per line.

    Arguments:

        - `llm`                       {object}     : The language model instance (must support `.invoke(prompt)`).
                                                     It may return a plain string or a message-like object
                                                     that carries the text in a `content` attribute.

        - `conversation_history`        {str}      : The conversation text containing customer queries and context.

    Returns:

        - str
            The text produced by the LLM, stripped of surrounding whitespace when it is
            available as a string (directly or via `.content`); otherwise `str(response)`.
    """

    prompt = f'''
    ## CONTEXT ##
    Analyze the following Fashion e-commerce conversation history:
    {conversation_history}

    ## TASK ##
    Extract and infer relevant information about the customer's primary product request, focusing only on the parameters specified below. If multiple products are mentioned, focus on the first or main product. Make reasonable assumptions based on context, but do not introduce information outside the given categories.

    ## GUIDELINES ##
    1. Category: Choose ONE from Indian Wear, Plus Size, Western, Sports Wear, Inner Wear & Sleep Wear, Lingerie & Sleep Wear. If none fit, use "Other". If multiple categories apply, choose the most relevant for the main product.
    2. Individual Category: Choose ONE from kurta-sets, kurtas, tops, thermal-tops, jeans, skirts, shorts, trousers, palazzos, jumpsuit, co-ords, clothing-set, kurtis, tunics. If none fit, use "Other". This should correspond to the main product if multiple are mentioned.
    3. Category by Gender: Choose Women or Men. If unclear, use your best judgment based on the conversation.
    4. Colour: Choose from Black, Orange, Navy Blue, Red, Beige, Yellow, Green, Mustard, Teal, Peach, Blue, Sea Green, Pink, Burgundy, Maroon, Lavender, Purple, White, Grey, Lime Green, Brown, Cream, Rust, Off White, Turquoise Blue, Multi, Mauve, Assorted, Magenta, Fuchsia, Coral, Olive, Rose, Gold, Fluorescent Green, Silver, Nude, Violet, Charcoal, Grey Melange, Khaki, Coffee Brown, Taupe, Copper. If the color isn't listed or multiple colors are mentioned, use "Other" or the color of the main product.
    5. Move On: Determine if enough key information (at least Category, Individual Category, and one of either Colour or Category by Gender) has been gathered for the main product to proceed to product searching. Use "true" only if these are available, otherwise "false".
    6. Follow-up Message:
       - If Move On is "true", provide a confirmation message to proceed with searching for the main product.
       - If Move On is "false", ask a question to gather missing key information (Category, Individual Category, Colour, or Category by Gender) for the main product.
       - If multiple products were mentioned, acknowledge this in your follow-up message and confirm focus on the main product.
       - Phrase questions to elicit specific, relevant information.
       - If any other product other than fashion is mentioned then give an appropriate error message. As we can only show fashion products.

    ## IMPORTANT NOTES ##
    - If any other product other than fashion is mentioned then give an appropriate error message. As we can only show fashion products.
    - If multiple products are mentioned, focus on extracting information for the first or main product mentioned.
    - Stick strictly to the categories provided. Do not invent or introduce new parameters.
    - If information for a category is not available and can't be reasonably inferred, use "NA".

    ## OUTPUT FORMAT ##
    Respond with the information in the following format:

    Category: "Extracted or inferred category for main product"
    Individual_category: "Extracted or inferred individual category for main product"
    category_by_Gender: "Extracted or inferred gender category"
    colour: "Extracted or inferred colour for main product"
    MOVE_ON: "true" or "false"
    FOLLOW_UP_MESSAGE: "Your context-aware follow-up message"

    Your Input: {conversation_history}
    Your output:
    '''
    response = llm.invoke(prompt)

    # TEXT-COMPLETION LLMS RETURN THE GENERATED TEXT DIRECTLY
    if isinstance(response, str):
        return response.strip()

    # CHAT MODELS (E.G. CHATGROQ) RETURN A MESSAGE OBJECT; THE TEXT LIVES IN `.content`.
    # `str(message)` WOULD GIVE THE OBJECT'S REPRESENTATION INSTEAD OF THE ANSWER.
    content = getattr(response, "content", None)

    if isinstance(content, str):
        return content.strip()

    # ANYTHING ELSE: BEST-EFFORT STRING CONVERSION
    return str(response)

In [None]:
def parser(response : str) -> dict:
    """
    Parse the line-oriented LLM response into a dictionary with fixed keys.

    Each line of the form `key: value` whose key is one of the expected keys sets
    that key (a later occurrence overwrites an earlier one). A non-empty line
    without a colon is treated as a continuation of the most recently set key and
    is appended to it after a single space. Blank lines are ignored, so they can
    neither pad a value with spaces nor corrupt the `MOVE_ON` flag. Lines that
    contain a colon but whose key is not expected are ignored.

    Arguments:

        - `response`         {str}      : The raw string response from the LLM, containing key-value pairs.

    Returns:

        - dict
            A dictionary with the following keys (missing text fields default to "NA",
            a missing `MOVE_ON` defaults to False):
            - "Category"               : str
            - "Individual_category"    : str
            - "category_by_Gender"     : str
            - "colour"                 : str
            - "MOVE_ON"                : bool
            - "FOLLOW_UP_MESSAGE"      : str
    """

    parsed_data = {"Category"             : "NA",
                   "Individual_category"  : "NA",
                   "category_by_Gender"   : "NA",
                   "colour"               : "NA",
                   "MOVE_ON"              : "false",
                   "FOLLOW_UP_MESSAGE"    : "NA"
                   }


    current_key                  = None

    for line in response.split('\n'):
        line                     = line.strip()

        # BLANK LINES CARRY NO CONTENT; APPENDING THEM WOULD ADD STRAY SPACES
        # (E.G. TURNING "true" INTO "true " AND FLIPPING MOVE_ON TO FALSE)
        if not line:
            continue

        if ':' in line:
            key, value           = line.split(':', 1)
            key                  = key.strip()
            value                = value.strip().strip('"')

            if key in parsed_data:
                parsed_data[key] = value
                current_key      = key

        elif current_key:
            # CONTINUATION OF A MULTI-LINE VALUE
            parsed_data[current_key] += ' ' + line.strip('"')

    # CONVERT THE TEXTUAL FLAG INTO A BOOLEAN, TOLERATING SURROUNDING WHITESPACE
    parsed_data["MOVE_ON"] = parsed_data["MOVE_ON"].strip().lower() == "true"

    return parsed_data

In [None]:
# Run the extractor on a sample request, then parse its raw text into a dictionary

conversation = "I need black women jeans"
extracted_attributes = extractor(llm, conversation)

response = parser(extracted_attributes)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'Category': 'Western', 'Individual_category': 'jeans', 'category_by_Gender': 'Women', 'colour': 'Black', 'MOVE_ON': True, 'FOLLOW_UP_MESSAGE': 'I can show you black women jeans. Is that what you were looking for?'}


In [None]:
# Search with every attribute that was extracted, including the gender category
# (leaving it out silently drops the gender filter from the search)
search_collection(colour               = response["colour"],
                  individual_category  = response["Individual_category"],
                  category             = response["Category"],
                  category_by_gender   = response["category_by_Gender"],
                  )

[{'name': 'Roadster Women Black Boyfriend Fit Light Fade Stretchable Jeans',
  'price': 2399,
  'colour': 'Black',
  'brand': 'Roadster',
  'img': 'http://assets.myntassets.com/assets/images/14954702/2022/2/24/30acbad6-7e07-4ee0-b31b-b3bb32120c0d1645686658752-Roadster-Women-Black-Boyfriend-Fit-Light-Fade-Stretchable-Je-1.jpg',
  'ratingCount': 32,
  'avg_rating': 4.1875,
  'description': "<ul> <li> Dark shade,  light fade black jeans </li> <li> Boyfriend fit,  mid-rise </li> <li> Clean look </li> <li> Stretchable </li> <li> 5 pocket </li> <li> Length: regular </li> </ul>99% cotton, 1% elastane<br>Machine washFit: Boyfriend Fit<br>Stretchable<br>The model (height 5'8) is wearing a size 28",
  'Category': 'Western',
  'Individual_category': 'jeans',
  'category_by_Gender': 'Women',
  'size': '26,28,30,32,34,36'},
 {'name': 'People Women Black Skinny Fit Slash Knee Jeans',
  'price': 999,
  'colour': 'Black',
  'brand': 'People',
  'img': 'http://assets.myntassets.com/assets/images/159118