In [None]:
# Install python dependencies. Run Once.
# MongoDB reccomends `python -m pip install "pymongo[srv]"==3.11` or whatever version of python you are using
%pip install pymongo
%pip install pypdf
%pip install langchain
%pip install langchain_community
%pip install langchain_openai
%pip install python-aiconfig


## Search intent prediction from the keywords 

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI

from langchain.llms import CTransformers

def get_embedding_model(model_name: str):

    if model_name.startswith("sentence-transformers/"):
        model_kwargs = {'device': 'cpu'}
        encode_kwargs = {'normalize_embeddings': False}
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs
        )
        return embeddings
    elif model_name == 'openAI':
        embeddings = OpenAIEmbeddings()
        return embeddings
    else:
        raise ValueError('Embedding model {} is not supported'.format(model_name))


def get_generative_model(model_name: str):

    if "Llama-2" in model_name:
        config = {'max_new_tokens': 100, 'temperature': 0.2, 'context_length': 2048}
        #config = {'max_new_tokens': 200, 'temperature': 0.2, 'context_length': 4096}
        model = CTransformers(model=model_name, config=config)
        return model
    elif model_name == 'openAI':
        model = OpenAI(temperature=0)
        return model
    else:
        raise ValueError('Generative model {} is not supported'.format(model_name))

## Step 1: Setup your MongoDB Database Connection

In [2]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Setup up MongoDB Atlas connection
# You can find this connection URI by going to your cluster and clicking connect > Drivers
# Connection String looks something like this: "mongodb+srv://ankush:ankush@cluster0.zqgx7e6.mongodb.net/?retryWrites=true&w=majority"
MONGODB_ATLAS_CLUSTER_URI = "mongodb+srv://<<user>>:<<password>>@mongo-big-house.jhrfrf8.mongodb.net/?retryWrites=true&w=majority"


# Create a new client and connect to the server
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


## Step 2: Configure Database Details
The database will contain vectorized embeddings of your original document (e.g. PDF or webpage). 

In [3]:
## Setup DB details.
## Create a new Database
DB_NAME = "aig-db"
db = client[DB_NAME]

## Create a new collection
COLLECTION_NAME = "products"
MONGODB_COLLECTION = db[COLLECTION_NAME]

## Index Name. Manually create this in the MongoDB web ui, add documents below, then create the search index in the same name
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

## Load the data

In [9]:
import pandas as pd
raw_data_df = pd.read_csv("../data/pricerunner_aggregate.csv")
# Dedupe the data based on the title
deduped_data_df = raw_data_df.drop_duplicates(['Title'])
cols = ['Title', 'Category_Label']
deduped_data_df['text'] = deduped_data_df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deduped_data_df['text'] = deduped_data_df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)


In [10]:
deduped_data_df

Unnamed: 0,PID,Title,Merchant_ID,Cluster_ID,Cluster_Label,Category_ID,Category_Label,text
0,1,apple iphone 8 plus 64gb silver,1,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones,apple iphone 8 plus 64gb silver Mobile Phones
1,2,apple iphone 8 plus 64 gb spacegrau,2,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones,apple iphone 8 plus 64 gb spacegrau Mobile Phones
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...
3,4,apple iphone 8 plus 64gb space grey,4,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones,apple iphone 8 plus 64gb space grey Mobile Phones
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...
...,...,...,...,...,...,...,...,...
35306,47350,smeg fab28 60cm retro style right hand hinge f...,59,47517,Smeg FAB28 Cream,2623,Fridges,smeg fab28 60cm retro style right hand hinge f...
35307,47351,smeg fab28 60cm retro style left hand hinge fr...,59,47518,Smeg FAB28 Red,2623,Fridges,smeg fab28 60cm retro style left hand hinge fr...
35308,47352,smeg fab28 60cm retro style left hand hinge fr...,59,47519,Smeg FAB28 Pink,2623,Fridges,smeg fab28 60cm retro style left hand hinge fr...
35309,47355,candy 60cm built under larder fridge cru160nek,125,47524,Candy CRU16.0,2623,Fridges,candy 60cm built under larder fridge cru160nek...


In [None]:
+----------------+
|Category_Label  |
+----------------+
|Microwaves      |
|Fridges         |
|TVs             |
|Washing Machines|
|Freezers        |
|Digital Cameras |
|CPUs            |
|Dishwashers     |
|Fridge Freezers |
|Mobile Phones   |
+----------------+

In [11]:
# Clean and concatenate selected attributes to create text field to be used to create embeddings
def prepare_selected_product_attributes_for_embedding(product_df):

    product_metadata = ( 
        product_df[['text']]
         .to_dict(orient='index')
    )
    
    # Clean the data by dropping values having nan as a value
    product_metadata_as_list = [*product_metadata.values()]
    # Drop attributes with nan values
    clean_data = [{k: my_dict[k] for k in my_dict if not pd.isna(my_dict[k])} for my_dict in product_metadata_as_list]
    
    ids_and_text_as_tuple_list = [ row['text'] for row in clean_data]
    
    return ids_and_text_as_tuple_list


In [12]:
products_as_list = prepare_selected_product_attributes_for_embedding(deduped_data_df)

In [13]:
products_as_list

['apple iphone 8 plus 64gb silver Mobile Phones',
 'apple iphone 8 plus 64 gb spacegrau Mobile Phones',
 'apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim free smartphone in gold Mobile Phones',
 'apple iphone 8 plus 64gb space grey Mobile Phones',
 'apple iphone 8 plus gold 5.5 64gb 4g unlocked sim free Mobile Phones',
 'apple iphone 8 plus 64 gb space grey Mobile Phones',
 'apple iphone 8 plus 5.5 single sim 4g 64gb silver Mobile Phones',
 'sim free iphone 8 plus 64gb by apple space grey Mobile Phones',
 'apple iphone 8 plus 64gb gold smartphone Mobile Phones',
 'apple iphone 8 plus 5.5 single sim 4g 64gb grey Mobile Phones',
 'apple iphone 8 plus silver 5.5 64gb 4g unlocked sim free Mobile Phones',
 'apple iphone 8 plus 64 gb silver Mobile Phones',
 'apple iphone 8 plus 64gb silver unlocked Mobile Phones',
 'apple iphone 8 plus 14 cm 5.5 64 gb 12 mp ios 11 silver Mobile Phones',
 'iphone 8 plus sim free water dust resistant 64gb silver by apple Mobile Phones',
 'iphone 8 plus 64gb spa

In [14]:
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
import os

# insert the documents in MongoDB Atlas with their embedding
vector_search = MongoDBAtlasVectorSearch.from_texts(
    texts=products_as_list,
    embedding=get_embedding_model('sentence-transformers/all-mpnet-base-v2'),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

  return self.fget.__get__(instance, owner)()


## Step 4: Define helper method to retrieve the right context (set of vectors) given a search query
This helper method works help us to retrieve the relevant vectors from the database given a user prompt. In this case, we use cosine-similarity to find the most relevant vectors to the user prompt. 

In [14]:
from langchain_core.documents import Document
from langchain_community.vectorstores import MongoDBAtlasVectorSearch

vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    MONGODB_ATLAS_CLUSTER_URI,
    DB_NAME + "." + COLLECTION_NAME,
    get_embedding_model('sentence-transformers/all-mpnet-base-v2'),
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

# Define helper method for Rag with AIConfig
def get_knn_context_from_query(query: str, k: int = 10) -> str:
    print(query)
    results: list[Document] = vector_search.similarity_search(
        query=query, k=k
    )
    #print(results)
    results_as_strings = [doc.page_content for doc in results]
    resulting_documents_as_a_string = "\n".join(results_as_strings)

    return resulting_documents_as_a_string

  return self.fget.__get__(instance, owner)()


## Step 5: Run prompts on the context retrieved from the database
We will the AIConfig JSON template to run prompts over the context retrieved from the database.

TODO: Replace the variables: user_question, style_guide, etc. with values/data relevant to your use case. 

In [15]:
model = get_embedding_model('sentence-transformers/all-mpnet-base-v2')
query_vector = model.embed_query("cool phone")
print(query_vector)

[-0.035676199942827225, -0.051950179040431976, -0.010245890356600285, 0.014635234139859676, 0.033233627676963806, 0.0214884914457798, -0.039112769067287445, -0.04421400651335716, -0.026802729815244675, 0.023767825216054916, -0.04850326478481293, 0.006135963834822178, 0.0011653304100036621, 0.045071717351675034, -0.0040868609212338924, 0.002218278357759118, -0.01888010837137699, 0.01601950079202652, 0.002179838716983795, 0.0025546210817992687, 0.029801109805703163, -0.0016484501538798213, 0.03372957929968834, -0.04730397090315819, -0.007309397216886282, -0.0017846199916675687, 0.023797014728188515, 0.017811112105846405, -0.03674238920211792, -0.06409444659948349, -0.0355306938290596, 0.0108564468100667, -0.048938386142253876, 0.060999542474746704, 1.67625944413885e-06, -0.021765120327472687, 0.02064700797200203, 0.006928539369255304, -0.03066718950867653, 0.014757233671844006, 0.037962187081575394, 0.04523419588804245, -0.045881595462560654, -0.0035568370949476957, -0.02351464331150055,

In [16]:
context = get_knn_context_from_query("cool phone")

cool phone


In [17]:
context

'rugged smartphone unlocked blackview bv9000pro 5.7 inch 18 9 fhd full display rugged dual sim phone unlocked with 6gb rom 128gb ram 8mp 13mp 5mp dual sony cameras 4180mah big battery outdo Mobile Phones\nrugged smartphone unlocked blackview bv9000pro 5.7 inch 18 9 fhd full display rugged dual sim phone unlocked with 6gb rom 128gb ram 8mp 13mp 5mp dual sony cameras 4180mah big battery outdo Mobile Phones\nnokia 150single sim mobile 6.1cm 2.4inches Mobile Phones\nnokia 150single sim mobile 6.1cm 2.4inches Mobile Phones\nnokia 7 plus 6 4g 4gb 64gb 3800mah black copper Mobile Phones\nnokia 7 plus 6 4g 4gb 64gb 3800mah black copper Mobile Phones\nsamsung galaxy young 2 Mobile Phones\nsamsung galaxy young 2 Mobile Phones\nbrand new kazam trooper x5.5 sim free smartphone black kazamtrooperx55 Mobile Phones\nsamsung galaxy trend plus Mobile Phones'

## Prompt Template for Llama-7b

In [18]:

template = '''
<s>[INST] <<SYS>>
You are an ecommerce search query analyzer and understands the intent from the query. You will receive user entered search query, few products matching the query and output is a list of categories with a level of confidence high, medium, and low, customer might be intersted in.
<s>
Customer Query: <<example_query>>
possible intents: <<intents>>
Products:
<<example_context>>
[\INST]
Customers intents: [{"intent": "Mobile Phones", "score": "high"}, {"intent": "Fridges", "score", "low"}]
<s>
[INST]
Customer Query: <<input_query>>
possible intents: <<intents>>
Products:
<<context>>
[\INST]
Customers intents:
'''


In [19]:
generative_model = get_generative_model("TheBloke/Llama-2-7B-Chat-GGML")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
example_query = 'cool iphone'
example_intents = '[{"intent": "Mobile Phones", "score": "high"}, {"intent": "Fridges", "score", "low"}]'
intents = '["Microwaves", "Fridges", "TVs", "Washing Machines", "Freezers", "Digital Cameras", "CPUs", "Dishwashers", "Fridge Freezers", "Mobile Phones"]'

input_query = 'silver phone'
prompts = []

example_products = '''
apple iphone 8 plus 64gb silver Mobile Phones
sim free iphone 8 plus 64gb by apple space grey Mobile Phones
iphone 8 plus sim free water dust resistant 64gb silver by apple Mobile Phones
siemens acool 213 x 60 x 61cm freezer icebox Freezers
'''

query_products = '''
apple iphone 8 plus 64gb silver Mobile Phones
apple iphone 8 plus 5.5 single sim 4g 64gb silver Mobile Phones
sim free iphone 8 plus 64gb by apple space grey Mobile Phones
samsung galaxy s8 artic silver 5.8 64gb 4g unlocked sim free Mobile Phones
smeg fab28qx1 50s style silver fridge Fridges
'''

example_products = get_knn_context_from_query(example_query)
query_products = get_knn_context_from_query(input_query)

prompt = template.replace('<<example_query>>', example_query)
prompt = template.replace('<<example_context>>', example_products)
prompt =  prompt.replace('<<intents>>', intents)
prompt =  prompt.replace('<<input_query>>', input_query)
prompt =  prompt.replace('<<context>>', query_products)

#print(prompt)

prompts.append(prompt)


llama2_response = generative_model.generate(prompts)
print(llama2_response)

cool iphone
silver phone
generations=[[Generation(text='[{"intent": "Mobile Phones", "score": "high"}, {"intent": "Fridges", "score": "low"}]\n<s>\nI understand that you are an ecommerce search query analyzer and can analyze user entered search queries to determine the intent behind them. Based on the customer query you provided, it seems that the customer is searching for mobile phones, specifically silver phone models. The possible intents associated with this query are "Mobile Phones"')]] llm_output=None run=[RunInfo(run_id=UUID('76908b43-c0b7-4347-9f76-e07f905a35e1'))]
