In [1]:
import pandas as pd
import numpy as np
import os
import argparse

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate

from dataclasses import dataclass
import os
import shutil

In [3]:
# Relative path of the directory holding the persisted Chroma database.
# save_to_chroma() deletes this directory (if present) and rebuilds it.
CHROMA_PATH = "backend/chroma"
# Relative path of the directory that load_documents() reads "*.txt" files from.
DATA_PATH = "backend/data"


def generate_data_store():
    """Rebuild the vector store end to end.

    Loads the source documents, splits them into chunks, and hands the
    chunks to ``save_to_chroma`` for persistence. Returns nothing.
    """
    save_to_chroma(split_text(load_documents()))


def load_documents():
    """Load the files matching ``*.txt`` under ``DATA_PATH``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for that directory
        (the loaded documents).
    """
    return DirectoryLoader(DATA_PATH, glob="*.txt").load()


def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print a short summary.

    Args:
        documents: The documents to split.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``
        (chunk_size=1200, chunk_overlap=100, measured with ``len``; each
        chunk is requested to carry its start index via
        ``add_start_index=True``).

    Side effects:
        Prints the document/chunk counts and, when at least one chunk
        exists, the content and metadata of one sample chunk.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Sanity-check preview. The sample used to be a hard-coded chunks[15],
    # which raised IndexError for any corpus yielding fewer than 16 chunks.
    # Clamp to the last available chunk and skip the preview when empty.
    if chunks:
        document = chunks[min(15, len(chunks) - 1)]
        print(document.page_content)
        print(document.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    """Replace the contents of ``CHROMA_PATH`` with a database built from ``chunks``.

    Args:
        chunks: The document chunks to store.

    Side effects:
        Removes the ``CHROMA_PATH`` directory tree if it exists, creates a
        new Chroma database there using ``OpenAIEmbeddings``, persists it,
        and prints how many chunks were saved.
    """
    # Start from a clean slate: drop any previously persisted database.
    store_exists = os.path.exists(CHROMA_PATH)
    if store_exists:
        shutil.rmtree(CHROMA_PATH)

    # Build the new database from the chunks and write it to disk.
    embedding_function = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(
        chunks,
        embedding_function,
        persist_directory=CHROMA_PATH,
    )
    vector_store.persist()

    saved_count = len(chunks)
    print(f"Saved {saved_count} chunks to {CHROMA_PATH}.")




In [4]:
# Run the full pipeline. Note: every run deletes the existing CHROMA_PATH
# directory and rebuilds the database from the files under DATA_PATH.
generate_data_store()

Split 3 documents into 38 chunks.
If the movie title cannot be found or if the release date is incorrect the program will output “Movie Not Found. Make Sure Spelling And Release Date Are Correct”. The release date of the movie is needed because there are many movies with the same title, the release date helps ensure that the correct movie is found within the movie dictionary. Once the movie index is found the genres and original language of the movie are also found. Next the program begins the rules based approach of recommending a movie. All movies with the same genre as the input movie are found and stored in a list. Then the movies are filtered based on the release year in which movies with a similar release year are added to a second list. Next the movies are filtered on language in which only the movies with the same original language are added to a third list. Lastly the remaining movies are added to a dictionary and sorted by popularity. The movie with the highest popularity, gi

  warn_deprecated(


Saved 38 chunks to backend/chroma.


  warn_deprecated(
