In [None]:
import pandas as pd
import numpy as np
import chromadb
from chromadb.config import Settings
import ollama
import os
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:

# Folder that holds the Chroma data; it is created if it does not exist yet.
directory = '../data/vector_store'
os.makedirs(directory, exist_ok=True)

# Client configured for persistence in `directory`, with telemetry disabled.
client = chromadb.Client(
    Settings(is_persistent=True, anonymized_telemetry=False, persist_directory=directory)
)

# Fetch the review collection, creating it when it is not there yet.
collection = client.get_or_create_collection(name="airline_reviews")

In [3]:
# df_path = "../data/airlines_reviews.csv"
# text_column = "Reviews"
def create_chunks(df_path, text_column, airline_column="Airline", max_chunks=50):
    """Split every review in a CSV file into overlapping text chunks.

    Args:
        df_path: Path of the CSV file to read.
        text_column: Name of the column holding the review text.
        airline_column: Name of the column whose value is stored with each
            chunk under the ``"airline"`` key. Defaults to ``"Airline"``.
        max_chunks: Maximum number of chunks to return. Defaults to 50;
            pass ``None`` to return every chunk.

    Returns:
        A list of ``{"text": ..., "airline": ...}`` dicts in file order.

    Raises:
        ValueError: If a required column is missing from the CSV, or if
            ``max_chunks`` is negative.
    """
    if max_chunks is not None and max_chunks < 0:
        raise ValueError(f"max_chunks must be non-negative or None, got {max_chunks}")

    print("------ Creating chunks ------")
    df = pd.read_csv(df_path)

    for column in (text_column, airline_column):
        if column not in df.columns:
            raise ValueError(f"Column '{column}' not found in DataFrame. Available columns: {list(df.columns)}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100)

    all_chunks = []
    for review, airline in zip(df[text_column], df[airline_column]):
        # Missing reviews are read as NaN (a float); only strings can be split.
        if not isinstance(review, str):
            continue
        all_chunks.extend(
            {"text": chunk, "airline": airline}
            for chunk in text_splitter.split_text(review)
        )

    # The total is reported before the cap is applied.
    print(f"Total chunks: {len(all_chunks)}")
    if max_chunks is not None:
        all_chunks = all_chunks[:max_chunks]
    return all_chunks

In [4]:
# Build the chunk list that the later cells embed and store.
all_chunks = create_chunks('../data/airlines_reviews.csv','Reviews')
# Preview a few entries rather than dumping the whole list into the output.
all_chunks[:3]

------ Creating chunks ------
Total chunks: 18749


[{'text': 'Flight was amazing. The crew onboard this flight were very welcoming, and gave a good atmosphere. The crew serving my aisle goes by the initial “G”. She was very kind & helpful. Gave my mom a bday cake for a late celebration even though it was just a 1hr 45min flight. Seat is well sanitized, legroom is spacious. IFE onboard has many variety of shows, music, etc. Bathroom always kept clean by crew at all times. & Food was delicious, overall this flight is a 9/10',
  'airline': 'Singapore Airlines'},
 {'text': 'Booking an emergency exit seat still meant huge discomfort in a seat far too narrow and poor padding meaning back ache in 90 minute flight. The seats on this aircraft are dreadful. The headphones and sound on the entertainment system was dreadful.',
  'airline': 'Singapore Airlines'},
 {'text': 'Excellent performance on all fronts. I would definitely choose to use this airline again. The aircraft is well-maintained and the staff well-trained on hospitality. Food was ple

In [5]:
def store_chunks(all_chunks, collection):
    """Embed each chunk with Ollama and write it to the vector collection.

    Each chunk is stored under the id ``str(position)``. ``upsert`` is used
    so that running this again against a collection that already holds these
    ids overwrites the entries instead of adding them a second time.

    Args:
        all_chunks: Iterable of dicts with ``"text"`` and ``"airline"`` keys.
        collection: Collection object exposing ``upsert`` and ``count``.

    Returns:
        None. A summary line is printed once all chunks were attempted.
    """
    attempted = 0
    failed = 0
    for i, chunk in enumerate(all_chunks):
        attempted = i + 1
        try:
            embedding = ollama.embeddings(
                model="mxbai-embed-large",
                prompt=chunk["text"]
            )["embedding"]

            collection.upsert(
                ids=[str(i)],
                documents=[chunk["text"]],
                metadatas=[{"airline": chunk["airline"]}],
                embeddings=[embedding]
            )
        except Exception as e:
            # Best effort: report the failing chunk and carry on with the rest.
            failed += 1
            print(f"Error embedding chunk {i}: {e}")

    total = collection.count()
    if failed:
        # Do not claim success when some chunks were skipped.
        print(f"{failed} of {attempted} chunks failed to store. Total vectors: {total}")
    else:
        print("All chunks stored. Total vectors:", total)

In [6]:
# Embed the prepared chunks and write them into the collection.
store_chunks(all_chunks=all_chunks, collection=collection)

All chunks stored. Total vectors: 50
50


In [7]:
# def get_embeddings(text_list) -> np.ndarray:
#     print("------ Creating Embeddings ------")
#     embeddings = []
#     texts = [c["text"] for c in text_list]
#     for text in texts:
#         response = ollama.embed("mxbai-embed-large", input=text)
#         embeddings.append(response["embeddings"][0])
#     return np.array(embeddings)


In [8]:
# embeddings = get_embeddings(all_chunks)
# embeddings # Should be (num_chunks, embedding_dim)

In [9]:
def search(query, threshold = 0.8):
    """Return the stored chunk closest to ``query``, or a fallback message.

    The query is embedded with Ollama and looked up in the module-level
    ``collection``. The candidate with the smallest distance wins; it is
    returned only if that distance does not exceed ``threshold``.

    Args:
        query: Question to look up.
        threshold: Largest distance still accepted as a match.

    Returns:
        The text of the best-matching chunk, or the fallback message when
        the query is blank, the embedding call fails, the collection returns
        no candidates, or the best distance is above ``threshold``.
    """
    default_message = "The query provided is out of my context of my knowledge base. I don't have enough information to answer this."

    print("------ Generating Response ------")

    # Nothing to embed for a missing or whitespace-only query.
    if not isinstance(query, str) or not query.strip():
        return default_message

    try:
        query_embedding = ollama.embeddings(model="mxbai-embed-large", prompt=query)["embedding"]
    except Exception as e:
        # Without an embedding there is nothing to query with, so stop here
        # instead of falling through to an unbound variable.
        print(f"Error embedding query: {e}")
        return default_message

    results = collection.query(
        query_embeddings=[query_embedding],
    )
    documents = results["documents"][0]
    distances = results["distances"][0]

    # No candidates came back, so there is no best match to pick.
    if not documents:
        return default_message

    best_index = min(range(len(documents)), key=lambda i: distances[i])

    if distances[best_index] > threshold:
        return default_message

    return documents[best_index]

In [10]:

def preprocess_query(text: str) -> str:
    """Clean a raw query string before it is used for search.

    Steps, in order: strip HTML/XML-style tags, drop control characters that
    are not whitespace, collapse every run of whitespace into one space, and
    trim both ends.

    Args:
        text: Raw query text.

    Returns:
        The cleaned, single-spaced query.
    """
    text = re.sub(r"<[^>]+>", "", text)                 # Remove HTML/XML tags
    # Tab, newline, vertical tab, form feed and carriage return (\x09-\x0d)
    # are left for the whitespace pass; deleting them here would glue the
    # words on either side together ("a\nb" -> "ab").
    text = re.sub(r"[\x00-\x08\x0e-\x1f\x7f]", "", text)  # Remove non-whitespace control chars
    text = re.sub(r"\s+", " ", text)                    # Normalize whitespace
    return text.strip()

In [11]:
# A question the stored reviews should be able to answer.
search(query="What about the type of food being provided during the flight?")

------ Generating Response ------


'Pretty comfortable flight considering I was flying in economy class in one of the older aircraft in their fleet which is still kept in relatively good condition. The economy class cabin was in a 3-3-3 configuration and the flight was almost full. What amazed me most was how much food was available for economy passengers - 2 full meals and the rear galley was stocked with snacks. The crew came round regularly with trays of drinks and juice through the flight. I disembarked with a very full'

In [12]:
# An unrelated question, used to show the fallback message.
search(query="How much horsepower does the car make?")

------ Generating Response ------


"The query provided is out of my context of my knowledge base. I don't have enough information to answer this."