In [1]:
import fireworks.client
import os
import dotenv
import chromadb
import json
from tqdm.auto import tqdm
import pandas as pd
import random
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer

# Load variables from a local .env file if one exists (on Colab, expose the
# key through Colab secrets / the environment instead).
dotenv.load_dotenv()

# Credentials must never live in the notebook: read the Fireworks API key from
# the environment and fail early with a clear message when it is missing.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as leaked and revoked.
_fireworks_api_key = os.environ.get("FIREWORKS_API_KEY")
if not _fireworks_api_key:
    raise RuntimeError(
        "FIREWORKS_API_KEY is not set; add it to your .env file or environment."
    )
fireworks.client.api_key = _fireworks_api_key

In [7]:
def get_completion(prompt, model=None, max_tokens=50):
    """Request a completion from a Fireworks-hosted model at temperature 0.

    Args:
        prompt: Prompt text sent to the completion endpoint.
        model: Model name relative to ``accounts/fireworks/models/``;
            ``None`` selects ``llama-v2-7b``.
        max_tokens: Value forwarded as the request's ``max_tokens``.

    Returns:
        The ``text`` attribute of the first choice in the response.
    """
    model_name = "llama-v2-7b" if model is None else model

    response = fireworks.client.Completion.create(
        model="accounts/fireworks/models/" + model_name,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0,
    )

    first_choice = response.choices[0]
    return first_choice.text

In [None]:
# Load the reviews dataset (CSV with a header row) into a pandas DataFrame.
df_amazon = pd.read_csv('reviews.csv')

# Drop the metadata columns that are not used for retrieval, then remove rows
# whose title (Summary) or description (Text) is missing: later cells
# concatenate these two fields as strings to build the indexed document.
df_comments = (
    df_amazon
    .drop(columns=['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
                   'HelpfulnessDenominator', 'Score', 'Time'])
    .dropna(subset=['Summary', 'Text'])
)

In [None]:
# Process only the first 10,000 comments (positional slice).
df_comments_reduced = df_comments.iloc[:10000]

In [None]:
# Turn the reduced frame into a list with one dict per row, keyed by column
# name (the columns left after the drop above, e.g. Id, Summary and Text).
amazon_comments_dict = df_comments_reduced.to_dict("records")

##### We will use SentenceTransformer (all-MiniLM-L6-v2) to generate the embeddings and store them in a Chroma collection.

In [None]:
# Sentence-transformer model used to embed the comments.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``embedding_model``."""

    def __call__(self, input: Documents) -> Embeddings:
        """Encode ``input`` and return the vectors as plain Python lists."""
        vectors = embedding_model.encode(input)
        return vectors.tolist()

embed_fn = MyEmbeddingFunction()

# Open (or create) the on-disk Chroma database in ./chromadb.
client = chromadb.PersistentClient(path="./chromadb")

# Create the collection with the same embedding function that the query cell
# passes when it re-opens it, so the collection is always opened consistently.
collection = client.get_or_create_collection(
    name="amazon_ff_comments",
    embedding_function=embed_fn,
)

##### Generate embeddings in batches:

In [None]:
# Embed the comments with `embedding_model` and index them in batches.
batch_size = 50

for i in tqdm(range(0, len(amazon_comments_dict), batch_size)):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch = amazon_comments_dict[i : i + batch_size]

    # One document per comment: "<Summary>.<Text>". str() is applied to each
    # part so a non-string value cannot break the concatenation.
    batch_titles = [
        str(comment["Summary"]) + "." + str(comment["Text"]) for comment in batch
    ]
    batch_ids = [str(comment["Id"]) for comment in batch]

    # Embed exactly the documents that are stored alongside the vectors.
    batch_embeddings = embedding_model.encode(batch_titles)

    # Upsert ids, documents and their embeddings into the collection.
    collection.upsert(
        ids=batch_ids,
        documents=batch_titles,
        embeddings=batch_embeddings.tolist(),
    )

In [None]:
# Re-open the collection, attaching the embedding function used to embed
# text queries.
collection = client.get_or_create_collection(
    name="amazon_ff_comments", embedding_function=embed_fn
)

# Ask for the five results returned for the query text "dog".
retriever_results = collection.query(query_texts=["dog"], n_results=5)

print(retriever_results)

##### Let's try with the all-mpnet-base-v2 model for encoding

In [2]:
# Second sentence-transformer model, used for the comparison run.
embedding_model1 = SentenceTransformer('all-mpnet-base-v2')


class MyEmbeddingFunction1(EmbeddingFunction):
    """Chroma embedding function that delegates to ``embedding_model1``."""

    def __call__(self, input: Documents) -> Embeddings:
        """Encode ``input`` and return the vectors as plain Python lists."""
        vectors = embedding_model1.encode(input)
        return vectors.tolist()

# NOTE: this rebinds `embed_fn`; from here on it wraps `embedding_model1`.
embed_fn = MyEmbeddingFunction1()

# Open (or create) the on-disk Chroma database in ./chromadb.
client = chromadb.PersistentClient(path="./chromadb")

# Separate collection for the vectors produced by the second model.
collection = client.get_or_create_collection(
    name="amazon_ff_comments2", embedding_function=embed_fn
)

In [None]:
# Embed the comments with `embedding_model1` and index them in batches.
batch_size = 50

for i in tqdm(range(0, len(amazon_comments_dict), batch_size)):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch = amazon_comments_dict[i : i + batch_size]

    # One document per comment: "<Summary>.<Text>". str() is applied to each
    # part so a non-string value cannot break the concatenation.
    batch_titles = [
        str(comment["Summary"]) + "." + str(comment["Text"]) for comment in batch
    ]
    batch_ids = [str(comment["Id"]) for comment in batch]

    # Embed exactly the documents that are stored alongside the vectors.
    batch_embeddings1 = embedding_model1.encode(batch_titles)

    # Upsert ids, documents and their embeddings into the collection.
    collection.upsert(
        ids=batch_ids,
        documents=batch_titles,
        embeddings=batch_embeddings1.tolist(),
    )

In [5]:
# Re-open the second collection with the current embedding function.
collection = client.get_or_create_collection(
    name="amazon_ff_comments2", embedding_function=embed_fn
)

# Ask for the five results returned for the query text "gluten".
retriever_results = collection.query(query_texts=["gluten"], n_results=5)

print(retriever_results)



In [None]:
# Encode a free-text query with `embedding_model1` and keep a plain-list copy
# of the resulting vector.
query_text = "coffee"
encoded_user_query = embedding_model1.encode(query_text)
user_query = encoded_user_query.tolist()

In [None]:
# Peek at the query embedding instead of dumping every component.
print(encoded_user_query.shape)
encoded_user_query[:5].tolist()

In [None]:
# The query vector is produced by `embedding_model1`, whose vectors are indexed
# in "amazon_ff_comments2"; no cell in this notebook populates a collection
# named "amazon_ff_comments1", so opening that name would query an empty index.
collection = client.get_or_create_collection(
    name="amazon_ff_comments2",
    embedding_function=embed_fn,
)

In [None]:
# Query with the pre-computed embedding rather than with raw text.
retriever_results = collection.query(query_embeddings=user_query, n_results=5)

print(retriever_results)

In [9]:
# Open the collection that holds the `embedding_model1` vectors.
collection = client.get_or_create_collection(
    name="amazon_ff_comments2",
    embedding_function=embed_fn,
)

# Encode the user query with the same model that indexed this collection.
query_text = "dog food"
encoded_user_query = embedding_model1.encode(query_text)
user_query = encoded_user_query.tolist()

retriever_results = collection.query(
    query_embeddings=user_query,
    n_results=5,
)

# Join the retrieved comments into one newline-separated string; a single
# query was issued, so the first (only) list of documents is used.
results = '\n'.join(retriever_results['documents'][0])

prompt_template = f'''[INST]

Generate a summary of comments

Topic: {query_text}
Results: {results}

Comments:

[/INST]
'''
mistral_llm = "mistral-7b-instruct-4k"
# NOTE(review): max_tokens=10000 looks larger than the "4k" in the model name
# suggests — confirm how the API treats an oversized limit.
responses = get_completion(prompt_template, mistral_llm, max_tokens=10000)
# get_completion already returns the generated text as a single string, so no
# per-character re-joining is needed.
comment_summary = responses

# Print the summary followed by the exact prompt that produced it.
print("Summarised comments:")
print(comment_summary)
print("\n\n\nPrompt Template:")
print(prompt_template)

Summarised comments:

* The first commenter is happy with the new dog food they tried and wishes there were more coupons available for it.
* The second commenter tried the new dog food but found it to be unflavorful and not as easy to serve as they had hoped. They also criticized the lack of science behind the brand compared to higher-end brands like Royal Canine.
* The third commenter praises the new dog food as the only brand that their dog with allergies can eat.
* The fourth commenter mentions that the dogs on their Christmas list enjoyed the dog treats and ate them right away.
* The fifth commenter praises the new dog food as being healthy and good for digestion, as well as suitable for small puppies. They also mention that their dog eats their required amount at every feeding.



Prompt Template:
[INST]

Generate a summary of comments

Topic: dog food
Results: DOG FOOD.This was a new food for my dog, and he seems to have adjusted very well to this product. Thank you, and I wish t