In [43]:
import pandas as pd
import tiktoken
import os
import numpy as np
import together
import pymongo
from typing import List
import time
from tqdm import tqdm

from langchain_community.embeddings import HuggingFaceEmbeddings
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

# All secrets are read from the environment so that they never end up in the
# notebook source or its version history.
TOGETHER_API_KEY = os.environ["TOGETHER_API_KEY"]
together.api_key = TOGETHER_API_KEY

# The MongoDB connection string embeds a username and password, so it is
# treated like any other credential. Export MONGODB_URI before starting the
# kernel; a missing variable raises KeyError here rather than failing later
# with an opaque connection error.
# NOTE(review): a previous revision of this cell hardcoded the connection
# string (including the password) — that password should be rotated.
MONGODB_URI = os.environ["MONGODB_URI"]
client = pymongo.MongoClient(MONGODB_URI)


In [44]:
# df.columns[1:]
# Load the raw airline reviews; each row is later flattened into one text document.
df = pd.read_csv(
    filepath_or_buffer='./data/Airline_review.csv',
)

def generate_text(row):
    """Flatten one DataFrame row into a single ``"col:val; col:val"`` string.

    The first entry of the row is skipped; every remaining (label, value)
    pair is rendered as ``label:value`` and the pairs are joined with ``"; "``.

    Args:
        row: A pandas Series, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The joined string (empty when the row has one entry or fewer).
    """
    labels = row.index[1:]
    cells = row.values[1:]
    return '; '.join(f"{label}:{cell}" for label, cell in zip(labels, cells))
# _df = pd.DataFrame()
# _df["text"] =  df.apply(generate_text, axis=1)
# _df["tokens"] = _df.text.apply(lambda x: len(encoding.encode(x)))

# print(knowledge_document[0])
# print(len(knowledge_document))

In [57]:
# @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
# def generate_embedding(input_texts: List[str], model_api_string: str) -> List[List[float]]:
#     together_client = together.Together()
#     outputs = together_client.embeddings.create(
#         input=input_texts,
#         model=model_api_string,
#     )
#     return [x.embedding for x in outputs.data]


# Local sentence-embedding model shared by document indexing and query embedding.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

def generate_embedding(embedding_model, document_str):
    """Embed a single string with the given embedding model.

    Args:
        embedding_model: Any object exposing ``embed_documents(list[str])``
            (e.g. the ``HuggingFaceEmbeddings`` instance built in this notebook).
        document_str: The text to embed.

    Returns:
        A one-element list containing the embedding vector for
        ``document_str``; callers that want the vector itself index ``[0]``.
    """
    # Use the model that was passed in rather than the module-level
    # ``embeddings`` global, so the parameter is actually honoured.
    return embedding_model.embed_documents([document_str])

# embedding_model_string = 'togethercomputer/m2-bert-80M-8K-retrieval'
# vector_database_field_name = 'embedding_together_m2-bert-8k-retrieval'
# Name of the document field that stores the embedding vector.
hugginface_vector_database_field_name = 'all-MiniLM-L6-v2'

# Upper bound on how many reviews are embedded; currently every row of df.
NUM_DOC_LIMIT = len(df.index)

# One flattened text document per review row, truncated to the limit above.
knowledge_document = [*df.apply(generate_text, axis=1)][:NUM_DOC_LIMIT]

# Smoke-test the embedding model and report the vector dimensionality.
sample_output = generate_embedding(embeddings, "This is a test. ")
print(f"Embedding size is: {len(sample_output[0])}")

Embedding size is: 384


In [55]:
db = client.airlineDB

# Embed every review text and collect the rows in a plain list. The DataFrame
# is built once after the loop: growing it with pd.concat on every iteration
# copies all previous rows each time, which is quadratic in the number of
# documents.
embedding_records = []
for doc_str in tqdm(knowledge_document,desc = f"Generating Embeddings and Inserting into DB with NUM_DOC_LIMIT = {NUM_DOC_LIMIT}"):
    embedding_records.append({
        # generate_embedding returns a one-element list of vectors; store the
        # vector itself so each row holds exactly one embedding.
        hugginface_vector_database_field_name: generate_embedding(embeddings, doc_str)[0],
        "value": doc_str,
    })

# Explicit columns keep the frame's schema stable even when there are no documents.
embedding_df = pd.DataFrame(
    embedding_records,
    columns=[hugginface_vector_database_field_name, "value"],
)

# Nothing is written to MongoDB in this cell; the per-document
# db.collection.insert_one(...) call that used to live in the loop is disabled.

Generating Embeddings and Inserting into DB with NUM_DOC_LIMIT = 23171: 100%|████████████████████████████████| 23171/23171 [37:26<00:00, 10.32it/s]


In [None]:
# embedding_df.rename(columns = {vector_database_field_name:hugginface_vector_database_field_name}, inplace = True)
# db.collection.insert_many(embedding_df.to_dict('records'))
# embedding_df.to_csv('./embedding_data.csv', sep='\t', encoding='utf-8', index = False)


In [79]:
query = "What are the top three airlines with the highest number of positive reviews?"
# generate_embedding returns a one-element list of vectors; take the query vector.
query_result = generate_embedding(embeddings, query)[0]

# Number of documents to return from the vector search.
num_results = 3
# numCandidates must be a whole number: use integer division instead of "/",
# which produced a float (e.g. 2317.1). Atlas also requires
# limit <= numCandidates <= 10000, so clamp to that range.
num_candidates = max(num_results, min(NUM_DOC_LIMIT // 10, 10_000))

results = db.collection.aggregate([
    {
        "$vectorSearch": {
            "queryVector": query_result,
            "path": hugginface_vector_database_field_name,
            "numCandidates": num_candidates,
            "limit": num_results,
            "index": "SemanticSearch",
        }
    }
])

# Key the hits by their text; dict insertion order keeps the search ranking,
# and identical texts collapse into a single entry.
results_as_dict = {doc['value']: doc for doc in results}
print(f"Query: {query}")
print("\n".join(f"{rank}. {text}" for rank, text in enumerate(results_as_dict, start=1)))

Query: What are the top three airlines with the highest number of positive reviews?
1. Airline Name:El Al Israel Airlines; Overall_Rating:9; Review_Title:"Service was excellent"; Review Date:15th October 2018; Verified:True; Review: We just flew business, seats 2C&D on one of the new Boeing 787-9 Dreamliners and were extremely happy. Service was excellent. Lie flat seats excellent. VOD although not as good as Singapore Airlines nonetheless was very good. We've flown Singapore Airlines many times and El Al much to their credit has upped their game substantially and is on their way towards being a top 50 or even top 20 airlines! Food was delicious. El Al has ordered a total of 18 Dreamliners and as these become available the reviews will soar. The last time we flew on El Al was 10 years ago and at that time it was awful. Great to see it's turned it around big time! Boarding at JFK was unorganized and a joke. Boarding at Ben Gurion airport was smooth!; Aircraft:Boeing 787-9; Type Of Trave