In [5]:
%%capture
%pip install pandas langchain langchain-openai faiss-cpu langchain-community

In [6]:
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
print(openai_api_key)
if not openai_api_key:
    raise ValueError("❌ OPENAI_API_KEY is missing. Please check your .env file.")

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


In [7]:
class CompanyVectorDB:
    def __init__(self):
        self.base_path = os.path.abspath(os.path.join(os.getcwd(), "..", "Data"))
        self.vectorstore_path = os.path.join(self.base_path, "faiss_index")
        self.documents = []
        self.dataframes = {}
        

        #Error handling
        if os.path.exists(self.base_path):
            self.load_csvs(self.base_path)
        else:
            print(f"Data folder not found: {self.base_path}")

        self.vectorstore = self.load_or_create_vectorstore()


    def load_csvs(self, folder_path):
        for filename in os.listdir(folder_path):
            if filename.endswith(".csv"):
                path = os.path.join(folder_path, filename)
                try:
                    df = pd.read_csv(path)
                    df.columns = [col.strip().lower().replace(" ", "_").replace("-", "_") for col in df.columns]
                    self.dataframes[filename] = df

                    for _, row in df.iterrows():
                        metadata = row.to_dict()
                        content = "\n".join([f"{k}: {v}" for k, v in metadata.items()])
                        self.documents.append(Document(page_content=content, metadata=metadata))

                except Exception as e:
                    print(f"❌ Error reading {filename}: {e}")
    

    def load_or_create_vectorstore(self):
        embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)

        if os.path.exists(self.vectorstore_path):
            print("📦 Loading cached vectorstore...")
            return FAISS.load_local(
                folder_path=self.vectorstore_path,
                embeddings=embedding_model,
                allow_dangerous_deserialization=True
            )
        else:
            print("⚙️ Creating new vectorstore...")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
            docs = text_splitter.split_documents(self.documents)
            vs = FAISS.from_documents(docs, embedding_model)
            vs.save_local(self.vectorstore_path)
            return vs
        
    def search(self, query: str, k: int = 5):
        return self.vectorstore.similarity_search(query, k=k)

    def __repr__(self):
        return f"<CompanyVectorDB with {len(self.documents)} docs from {len(self.dataframes)} CSVs>"
                    

In [12]:
## Test the class
db = CompanyVectorDB()
retrieved_docs = db.vectorstore.similarity_search("Which movie is the oldest in the dataset?")
for i, doc in enumerate(retrieved_docs, 1):
    print(f"\n🔎 Result #{i}")
    print("🎬 Title:", doc.metadata.get("title", "N/A"))
    print("🎬 Director:", doc.metadata.get("director", "N/A"))
    print("📅 Release Year:", doc.metadata.get("release_year", "N/A"))
    print("📍 Country:", doc.metadata.get("country", "N/A"))
    print("📝 Description:", doc.metadata.get("description", "N/A"))


📦 Loading cached vectorstore...

🔎 Result #1
🎬 Title: The 101-Year-Old Man Who Skipped Out on the Bill and Disappeared
🎬 Director: Felix Herngren, Måns Herngren
📅 Release Year: 2016
📍 Country: Sweden
📝 Description: In need of money, an eccentric ex-spy and his younger octogenarian partner embark on an international quest for a valuable Soviet soda formula.

🔎 Result #2
🎬 Title: The Old Guard
🎬 Director: Gina Prince-Bythewood
📅 Release Year: 2020
📍 Country: United States
📝 Description: Four undying warriors who've secretly protected humanity for centuries become targeted for their mysterious powers just as they discover a new immortal.

🔎 Result #3
🎬 Title: A Mission in an Old Movie
🎬 Director: Ahmad El-Badri
📅 Release Year: 2012
📍 Country: nan
📝 Description: A young man struggles with his overbearing mother while looking for romance and a way to kick-start his show business career.

🔎 Result #4
🎬 Title: Fifty Year Old Teenager
🎬 Director: Fouad Al Shatti
📅 Release Year: 1996
📍 Country: