In [50]:
import os
import pandas as pd
import numpy as np
import json
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector, MaxMarginalRelevanceExampleSelector
# from langchain.prompts.example_selector.semantic_similarity import EmbeddingBasedExampleSelector
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.output_parsers import JsonOutputParser
from langchain_groq import ChatGroq
from prompt import Prompt
from pydantic import BaseModel, Field
from typing import List
from tqdm import tqdm
from accuracy import AccuracyMetric
import copy

In [3]:
load_dotenv

<function dotenv.main.load_dotenv(dotenv_path: Union[str, ForwardRef('os.PathLike[str]'), NoneType] = None, stream: Optional[IO[str]] = None, verbose: bool = False, override: bool = False, interpolate: bool = True, encoding: Optional[str] = 'utf-8') -> bool>

# Read the api key

In [4]:
groq_api_key = os.getenv('GROQ_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')


In [5]:
pt = Prompt()

## Create the Prompt Template

In [6]:
class Label(BaseModel):
    parent: str
    child: str

class Review(BaseModel):
    # review: str = Field(
    #     description="The text of the review to be analyzed."
    # )
    labels: List[Label] = Field(
        description="The labels or categories assigned to the review."
    )


review_parser = JsonOutputParser(pydantic_object=Review)

In [7]:
review_prompt = PromptTemplate.from_template(
    pt.label_prompt(),
    # partial_variables={"format_instructions": review_parser.get_format_instructions()}
)

In [8]:
# pip install -r requirements.txt
# ! pip install langchain-groq
# ! pip install sentence-transformers

In [8]:
# !pip install langchain langchainhub faiss-cpu groq openai

## Read the Train data

In [9]:
train_df = pd.read_excel("data/bodywash-train.xlsx", \
                        sheet_name="cbw_kwbase",\
                        usecols=['Core Item', 'Level 1 (PARENT)', 'Level 2 (CHILD)'])

In [10]:
# rows_to_drop = train_data.sample(n=5, random_state=42)
# valid_df = train_data.loc[rows_to_drop.index]
# train_df = train_data.drop(rows_to_drop.index)

## Convert the data into JSON

In [11]:
def convert_to_json(data):
    results = []
    for text, group in data.groupby('Core Item'):
        labels = []
        for _, row in group.iterrows():
            obj = dict()
            obj['parent'] = row['Level 1 (PARENT)']
            obj['child'] = row['Level 2 (CHILD)']
            labels.append(obj)
        results.append(
            {
                "review":text,
                "labels":labels
            }
        )
    return results

In [12]:
train_results = convert_to_json(train_df)
print(f"Train Results:{len(train_results)}")

Train Results:2790


## Split into Train and Valid Set

In [13]:
import random
valid_data = random.sample(train_results, 250)
# Remove valid_data from results to get the new train_data
valid_set = set([json.dumps(obj, sort_keys=True) for obj in valid_data])
train_data = [obj for obj in train_results if json.dumps(obj, sort_keys=True) not in valid_set]
with open("data/valid_data.json", "w") as f:
    json.dump(valid_data, f, indent=2)
with open("data/train_data_new.json", "w") as f:
    json.dump(train_data, f, indent=2)

In [14]:
print(f"Len of train data: {len(train_data)}")
print(f"Len of valid data: {len(valid_data)}")

Len of train data: 2540
Len of valid data: 250


In [15]:
with open("data/train_data_new.json", "r") as json_file:
    train_data = json.load(json_file)

## Format the Data for creating embeddings in FAISS

In [16]:
def format_data(data):
    docs = []
    formatted_examples = []
    for entry in data:
        text = entry["review"]
        labels = "; ".join([f"{label['parent']} > {label['child']}" for label in entry["labels"]])
        metadata = {"labels": labels}
        formatted_examples.append({
            "review": text,
            "labels": labels
        })  
        # docs.append(Document(page_content=text, metadata=metadata))

    return formatted_examples

In [21]:
formatted_examples = format_data(train_data)

## Initialize Embedding Model

In [20]:
# embeddings = OpenAIEmbeddings(openai_api_key=groq_api_key, model="text-embedding-3-small")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [None]:
# ## Create a Faiss Index
# db = FAISS.from_documents(docs, embeddings)

In [None]:
# db.save_local("faiss_index")

In [22]:
# query = "Love this!-Both my wife and use this! nice neutral scent!"
# test_response = db.similarity_search(query, k=2)
# for r in test_response:
#     print("Review:", r.page_content)
#     print("Labels:", r.metadata["labels"])

## Use Example Selector as a context to LLM

In [23]:
## Usig Example Selector
example_selector = MaxMarginalRelevanceExampleSelector.from_examples(
    # The list of examples available to select from.
    formatted_examples,
    # The embedding class used to produce embeddings which are used to measure semantic similarity.
    embeddings,
    # The VectorStore class that is used to store the embeddings and do a similarity search over.
    FAISS,
    # The number of examples to produce.
    k=5,
)

In [24]:
example_prompt = PromptTemplate(
    input_variables=["review", "labels"],
    template="review: {review}\labels: {labels}",
)

In [25]:
similar_prompt = FewShotPromptTemplate(
    # We provide an ExampleSelector instead of examples.
    example_selector=example_selector,
    example_prompt=example_prompt,
    prefix="Give the labels for the following review:\n",
    suffix="review: {review}\nlabels:",
    input_variables=["review"],
)

In [26]:
def label_examples_rag_mmr(query):
    return similar_prompt.format(review=query)
# print(similar_prompt.format(review=query))

## Initialize LLM

In [27]:
llm = llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0
)

## Get the Lables for Review

In [28]:
def get_labels(review):
    """
    Get labels for a given review using the example selector and prompt.
    Returns the response as a parsed JSON object.
    """

    examples = label_examples_rag_mmr(review)
    
    label_chain = review_prompt | llm | review_parser
    response = label_chain.invoke({"review": review, 
                                   "examples":examples})
    
    return response

In [18]:
# query="Amazing and best one out there!-Awesome!"
# label_obj = get_labels(query)

In [29]:
def get_labels_for_valid_data(valid_data):
    valid_pred = []
    for valid_review in tqdm(valid_data):
        text = valid_review["review"]
        cleaned_txt = text.replace('\n', '').replace('\r', '')
        label_obj = get_labels(text)
        # Filter out label objects with missing or empty parent/child
        filtered_labels = [
            obj for obj in label_obj["labels"]
            if (
                "parent" in obj and "child" in obj and
                obj["parent"].strip() != "" and
                obj["child"].strip() != ""
            )
        ]
        label_obj["labels"] = filtered_labels
        label_obj["review"] = cleaned_txt
        valid_pred.append(label_obj)
    return valid_pred

In [31]:
valid_pred_rslt = get_labels_for_valid_data(valid_data)

100%|██████████| 250/250 [36:16<00:00,  8.71s/it]


In [None]:
with open("data/valid_pred.json", "w") as f:
    json.dump(valid_pred_rslt, f, indent=2)

In [8]:
with open("data/valid_pred.json", "r") as json_file:
    valid_pred_rslt = json.load(json_file)

In [9]:
with open("data/valid_data.json", "r") as json_file:
    valid_data = json.load(json_file)

In [10]:
print(len(valid_pred_rslt))
print(len(valid_data))

250
250


In [11]:
_, format_valid_pred = format_data(valid_pred_rslt)

In [12]:
_, format_valid_data = format_data(valid_data)

## Accuracy Metric for Validation Set

In [13]:
acc = AccuracyMetric()

## Jaccard Similarity

In [14]:
accuracy, per_sample = acc.compute_classification_accuracy(format_valid_pred, format_valid_data)

print(f"\n📊 Average Jaccard Accuracy: {accuracy:.2%}")

love this stuff-love the smell.so fresh and so clean
love this stuff-love the smell.
so fresh and so clean

this was a complimentary gift for joining harry's which i think is an amazing idea for them to allow you to pick a product you would like to try for free which i haven't seen a company do before. my husband and son love using this body wash! it smells really good and cleans really well.this was a complimentary gift for joining harry's which i think is an amazing idea for them to allow you to pick a product you would like to try for free which i haven't seen a company do before. my husband and son love using this body wash! it smells really good and cleans really well.body skin concerns:stretch marks
this was a complimentary gift for joining harry's which i think is an amazing idea for them to allow you to pick a product you would like to try for free which i haven't seen a company do before. my husband and son love using this body wash! it smells really good and cleans really wel

In [15]:
# ! pip install rouge-score

## Rouge Score

In [16]:
rouge1, rouge2 = acc.compute_rouge_n(format_valid_pred, format_valid_data)

In [17]:
print(f"📊 ROUGE-1 F1: {rouge1:.2%}")
print(f"📊 ROUGE-2 F1: {rouge2:.2%}")

📊 ROUGE-1 F1: 40.67%
📊 ROUGE-2 F1: 30.29%


## Test set Prediction

In [39]:
test_df = pd.read_excel("data/bodywash-test.xlsx", \
                        sheet_name="Sheet1",\
                        usecols=["Core Item"])
test_df.rename(columns={"Core Item": "review"}, inplace=True)

In [40]:
test_df.shape

(216, 1)

In [45]:
test_data = test_df.to_dict(orient='records')

In [47]:
test_data_rslt = get_labels_for_valid_data(test_data)

100%|██████████| 216/216 [1:27:49<00:00, 24.40s/it]   


In [51]:
def clean_labels(data):
    cleaned = copy.deepcopy(data)
    for entry in cleaned:
        entry['labels'] = [
            label for label in entry['labels']
            if label.get('parent') not in [None, '', 'None'] and
               label.get('child') not in [None, '', 'None']
        ]
    return cleaned



In [52]:
# Cleaned result
cleaned_tst_rslt = clean_labels(test_data_rslt)

In [54]:
def convert_into_df(data):
    rows = []
    for entry in cleaned_tst_rslt:
        review = entry['review']
        for label in entry['labels']:
            rows.append({
                "review": review,
                "parent": label["parent"],
                "child": label["child"]
            })

    df = pd.DataFrame(rows)
    return df


In [55]:
test_rslt_df = convert_into_df(cleaned_tst_rslt)
test_rslt_df.to_csv("data/test_rslt.csv", index=False)

In [56]:
test_rslt_df

Unnamed: 0,review,parent,child
0,100 % for men to attract hot and beautiful wom...,Efficacy,Desired Results
1,100 % for men to attract hot and beautiful wom...,Fragrance,Masculine Fragrance
2,100 % for men to attract hot and beautiful wom...,Companion Approval,Spouse / Partner
3,100 % for men to attract hot and beautiful wom...,Self Esteem,Self Confidence
4,100 % for men to attract hot and beautiful wom...,Price,Value Justification
...,...,...,...
10662,You might just fall in love with it!! I know I...,Skin Care,Skin Tightening
10663,You might just fall in love with it!! I know I...,Skin Texture Improvement,Softness
10664,You might just fall in love with it!! I know I...,Convenience,Multi-Purpose
10665,You might just fall in love with it!! I know I...,Convenience,Companion Approval
