In [15]:
from typing import List, Optional

import numpy as np
import ollama
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

In [77]:

def create_chunks(
    df_path: str,
    text_column: str,
    chunk_size: int = 250,
    chunk_overlap: int = 50,
    max_chunks: Optional[int] = 500,
) -> List[str]:
    """Split the text column of a CSV file into overlapping chunks.

    Every non-missing value of ``text_column`` is split with a
    ``RecursiveCharacterTextSplitter``; the chunks of all rows are
    concatenated in row order and the first ``max_chunks`` are returned.
    The total number of chunks produced (before truncation) is printed.

    Args:
        df_path (str): Path of the CSV file to load.
        text_column (str): Name of the column holding the text.
        chunk_size (int): Maximum chunk length passed to the splitter.
        chunk_overlap (int): Overlap between consecutive chunks passed to
            the splitter.
        max_chunks (Optional[int]): Maximum number of chunks to return.
            ``None`` returns every chunk.

    Returns:
        List[str]: The first ``max_chunks`` chunks (all of them if
        ``max_chunks`` is ``None``).

    Raises:
        ValueError: If ``text_column`` is not a column of the CSV, or if
            ``max_chunks`` is negative.
    """
    # A negative limit would silently drop chunks from the END of the list
    # through slice semantics, which is never what the caller wants.
    if max_chunks is not None and max_chunks < 0:
        raise ValueError(f"max_chunks must be >= 0 or None, got {max_chunks}")

    df = pd.read_csv(df_path)

    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame. Available columns: {list(df.columns)}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Empty CSV cells are loaded as float NaN, which the splitter cannot
    # handle; drop them and make sure every remaining value is a string.
    reviews = df[text_column].dropna().astype(str)

    all_chunks: List[str] = []
    for review in reviews:
        all_chunks.extend(text_splitter.split_text(review))
    print(f"Total chunks: {len(all_chunks)}")

    if max_chunks is None:
        return all_chunks
    return all_chunks[:max_chunks]


In [78]:
# Build the chunk list from the airline reviews CSV, then show it as the cell output.
all_chunks = create_chunks(
    df_path='../data/airlines_reviews.csv',
    text_column='Reviews',
)
all_chunks

Total chunks: 35442


['Flight was amazing. The crew onboard this flight were very welcoming, and gave a good atmosphere. The crew serving my aisle goes by the initial “G”. She was very kind & helpful. Gave my mom a bday cake for a late celebration even though it was just',
 'for a late celebration even though it was just a 1hr 45min flight. Seat is well sanitized, legroom is spacious. IFE onboard has many variety of shows, music, etc. Bathroom always kept clean by crew at all times. & Food was delicious, overall this',
 'at all times. & Food was delicious, overall this flight is a 9/10',
 'Booking an emergency exit seat still meant huge discomfort in a seat far too narrow and poor padding meaning back ache in 90 minute flight. The seats on this aircraft are dreadful. The headphones and sound on the entertainment system was dreadful.',
 'Excellent performance on all fronts. I would definitely choose to use this airline again. The aircraft is well-maintained and the staff well-trained on hospitality. Food wa

In [107]:
def get_embeddings(
    text_list: List[str],
    model: str = "mxbai-embed-large",
    batch_size: int = 64,
) -> np.ndarray:
    """Generate embeddings for a list of strings and return them as a 2-D array.

    Texts are sent to ``ollama.embed`` in batches rather than one request
    per text, and the returned rows keep the order of ``text_list``.

    Args:
        text_list (List[str]): Chunks received after chunking. A single
            string is treated as a one-element list (instead of being
            iterated character by character).
        model (str): Name of the Ollama embedding model to use.
        batch_size (int): Number of texts sent per ``ollama.embed`` call.

    Returns:
        np.ndarray: Array with one row per input text. For empty input an
        array of shape ``(0, 0)`` is returned without calling Ollama.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If Ollama returns a different number of embeddings
            than the number of texts sent.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Guard against a bare string, which would otherwise be embedded one
    # character at a time.
    texts = [text_list] if isinstance(text_list, str) else list(text_list)
    if not texts:
        return np.empty((0, 0))

    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = ollama.embed(model=model, input=batch)
        vectors.extend(response["embeddings"])

    # Rows must stay aligned with the input texts, because callers index
    # the chunk list with row positions of this array.
    if len(vectors) != len(texts):
        raise RuntimeError(
            f"Expected {len(texts)} embeddings from Ollama, got {len(vectors)}"
        )
    return np.array(vectors)

In [108]:
# Embed every chunk; the resulting array is shown as the cell output.
embeddings = get_embeddings(text_list=all_chunks)
embeddings  # Should be (num_chunks, embedding_dim)

array([[ 0.03699813, -0.0028217 , -0.02952673, ..., -0.03115136,
         0.03033338, -0.04419744],
       [ 0.0214511 ,  0.0107396 , -0.01453557, ...,  0.00156704,
         0.0240572 , -0.0164731 ],
       [ 0.04968005,  0.02700745, -0.0034137 , ..., -0.00049105,
         0.05040184, -0.01111611],
       ...,
       [ 0.0409267 , -0.01466971, -0.00882639, ..., -0.01944758,
         0.0464426 , -0.03657843],
       [ 0.0340926 ,  0.02339297,  0.02171471, ..., -0.01902878,
         0.05332448, -0.02875262],
       [ 0.04833071,  0.03367324, -0.02373235, ..., -0.04052476,
         0.03385491, -0.0168135 ]], shape=(500, 1024))

In [109]:
def search(query: str) -> str:
    """Return the stored chunk most similar to ``query``.

    The query is embedded with the same Ollama endpoint and model used for
    the chunks, compared against the notebook-level ``embeddings`` array
    with cosine similarity, and the best-scoring entry of the
    notebook-level ``all_chunks`` list is returned.

    Args:
        query (str): Input query.

    Returns:
        str: Best match from the source.

    Raises:
        ValueError: If the query is empty, if no embeddings are stored, or
            if ``embeddings`` and ``all_chunks`` have different lengths
            (typically a sign that one of them is stale after re-running
            cells out of order).
        RuntimeError: If embedding the query fails; the original error is
            chained as the cause.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("Query must be a non-empty string.")
    if embeddings is None or len(embeddings) == 0:
        raise ValueError("No stored embeddings found!")
    # Row i of `embeddings` must describe all_chunks[i]; otherwise the
    # argmax below would point at the wrong chunk (or past the end).
    if len(embeddings) != len(all_chunks):
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but all_chunks has "
            f"{len(all_chunks)} items; recompute the embeddings."
        )

    # NOTE(review): a retrieval prompt prefix ("Represent this sentence for
    # searching relevant passages: ") was previously experimented with here;
    # it is not applied - confirm whether the model benefits from it.
    try:
        response = ollama.embed(model="mxbai-embed-large", input=query)
        query_embedding = response["embeddings"][0]
    except Exception as e:
        # Fail loudly instead of printing and then crashing later on an
        # undefined `query_embedding`.
        raise RuntimeError(f"Error embedding query: {e}") from e

    scores = cosine_similarity([query_embedding], embeddings)[0]
    best_index = int(np.argmax(scores))
    return all_chunks[best_index]

# Example query; the returned chunk is shown as the cell output.
search(query="How was the leg room?")

'when needed. Food was good although had the option to Book the Cook prior to flight. I know some airlines are starting to increase leg room to 40 inches SIN (38) which would be great when you consider the extra money that is paid. On our return from'

In [70]:
# Print the embedding matrix shape and the number of chunks, one per line.
print(embeddings.shape, len(all_chunks), sep="\n")

(50, 1024)
500
