In [90]:
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings


In [54]:
import os
import getpass

# Prompt for the OpenAI API key without echoing it, then expose it through the
# OPENAI_API_KEY environment variable so the key is not hard-coded in a cell.
os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

In [91]:
class SourceDocumentLoader:
    """Load a transcript PDF and relabel each page's source with its track.

    Attributes:
        filepath: Path of the PDF handed to ``PyPDFLoader``.
        skip_pages: Number of leading pages that ``modify_sources`` leaves
            untouched (defaults to 7, the value previously hard-coded).
        pages: Documents produced by ``load_pdf``; ``None`` until it has run.
    """

    def __init__(self, filepath, skip_pages=7):
        self.filepath = filepath
        self.skip_pages = skip_pages
        self.pages = None

    def load_pdf(self):
        """Read the PDF into ``self.pages`` and then relabel the page sources."""
        loader = PyPDFLoader(self.filepath)
        self.pages = loader.load_and_split()

        print(f"Loaded {len(self.pages)} pages!")

        self.modify_sources()

    def modify_sources(self):
        """Overwrite ``metadata["source"]`` with ``"Track <n>"`` for each page.

        The first ``skip_pages`` pages keep their original source. After that,
        the counter goes up by one every time a page's text starts with
        ``"Track"``, and every page is tagged with the current counter value.
        Pages that come before the first such heading are tagged ``"Track 0"``.
        """
        track_number = 0
        for page in self.pages[self.skip_pages:]:
            if page.page_content.startswith("Track"):
                track_number += 1

            page.metadata["source"] = f"Track {track_number}"

    def get_pages(self):
        """Return the loaded pages (``None`` if ``load_pdf`` has not run)."""
        return self.pages

In [92]:
class TextSplitter:
    """Chunk a list of documents with a ``RecursiveCharacterTextSplitter``.

    The splitter is created once, at construction time, and reused for every
    call to ``_get_splits``.
    """

    def __init__(self, documents):
        self.documents = documents
        self.splitter = self.get_splitter()

    def get_splitter(self):
        """Return a new splitter configured with chunk_size=1000, chunk_overlap=200."""
        return RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    def _get_splits(self):
        """Run the stored documents through the splitter and return the chunks."""
        chunks = self.splitter.split_documents(self.documents)
        return chunks

In [107]:
class VectorStore:
    """Build a FAISS index for a set of documents, or reload a saved one.

    Attributes:
        vector_store: The vector-store class used to build/load the index
            (``FAISS``).
        db: The loaded or newly built index; ``None`` until ``get_indices``
            has run.
        documents: Documents to embed when no saved index is found.
        vector_store_path: Folder the index is saved to and loaded from.
        embeddings: Embedding model passed to the vector store.
    """

    # File written by ``save_local`` for the default index name. Its presence,
    # not the mere existence of the folder, means a saved index is available.
    INDEX_FILENAME = "index.faiss"

    def __init__(self, documents, vectorstore_path, embeddings):
        self.vector_store = FAISS
        self.db = None
        self.documents = documents
        self.vector_store_path = vectorstore_path
        self.embeddings = embeddings

        # Create the directory if it doesn't exist
        os.makedirs(self.vector_store_path, exist_ok=True)

    def get_indices(self):
        """Return the index, loading it from disk when one was saved earlier.

        The constructor always creates the target folder, so checking that the
        folder exists would always succeed and a first run would try to load
        an index that is not there. The saved index file is checked instead.
        When it is missing, the index is built from ``self.documents`` and
        saved into the folder for later runs.

        Returns:
            The loaded or newly created vector store (also kept in ``self.db``).
        """
        index_file = os.path.join(self.vector_store_path, self.INDEX_FILENAME)

        if os.path.isfile(index_file):
            print("Loading vectorstore from folder!")
            # NOTE(review): load_local unpickles the stored docstore, so only
            # load folders this notebook wrote itself. Recent langchain-community
            # releases also require allow_dangerous_deserialization=True here;
            # confirm against the installed version before adding it.
            self.db = self.vector_store.load_local(self.vector_store_path, self.embeddings)

        else:
            print("Creating new vectorstore!")
            self.db = self.vector_store.from_documents(self.documents, self.embeddings)
            self.db.save_local(self.vector_store_path)

        return self.db

In [125]:
class SpanishMaster:
    """Question-answering assistant over the Spanish course transcript.

    Call ``setup_spanish_master()`` once to load the PDF, split it, build or
    load the vector store, create the retriever and assemble the response
    chain. After that, ``get_response(question)`` streams an answer to stdout.

    Attributes:
        filepath: Path of the transcript PDF.
        vectorstore_path: Folder where the vector store is saved/loaded.
        pages: Documents loaded from the PDF (``None`` before setup).
        splits: Chunks produced from ``pages`` (``None`` before setup).
        db: Vector store built from ``splits`` (``None`` before setup).
        retriever: Multi-query retriever over ``db`` (``None`` before setup).
        chain: Runnable that answers a question (``None`` before setup).
        llm: Chat model used for query expansion and for answering.
        embeddings: Embedding model used by the vector store.
    """

    def __init__(
            self,
            embeddings = None,
            filepath:str = "assets/Complete+Spanish+transcript+-+2019+final.pdf",
            vectorstore_path:str = "assets/vectorstore",
            llm = None
        ):
        """Store the configuration; no document is loaded here.

        Args:
            embeddings: Embedding model; ``OpenAIEmbeddings()`` when omitted.
            filepath: Path of the transcript PDF.
            vectorstore_path: Folder for the persisted vector store.
            llm: Chat model; ``ChatOpenAI(temperature=0)`` when omitted.
        """
        self.filepath = filepath
        self.vectorstore_path = vectorstore_path
        self.pages = None
        self.splits = None
        self.db = None
        self.retriever = None
        self.chain = None

        # The default clients are created here rather than in the signature:
        # default argument values are evaluated once, when the class is
        # defined, which would build the clients before they are needed and
        # share them between every instance.
        self.llm = llm if llm is not None else ChatOpenAI(temperature=0)
        self.embeddings = embeddings if embeddings is not None else OpenAIEmbeddings()

    def _load_document(self):
        """Load the PDF at ``self.filepath`` into ``self.pages``."""
        document_loader = SourceDocumentLoader(filepath=self.filepath)
        document_loader.load_pdf()
        self.pages = document_loader.get_pages()

        print("Number of pages : ", len(self.pages))

    def _get_splits(self):
        """Split ``self.pages`` into chunks stored in ``self.splits``."""
        splitter = TextSplitter(documents=self.pages)
        self.splits = splitter._get_splits()

        print("Number of splits : ", len(self.splits))

    def _get_vectorstore(self):
        """Build or load the vector store for ``self.splits`` into ``self.db``."""
        vector_store = VectorStore(self.splits, self.vectorstore_path, self.embeddings)
        self.db = vector_store.get_indices()

    def _get_retriever(self):
        """Wrap the vector store's retriever in a ``MultiQueryRetriever``."""
        self.retriever = MultiQueryRetriever.from_llm(
           retriever=self.db.as_retriever(), llm=self.llm
        )

    def _get_prompt(self):
        """Return the chat prompt with ``{context}`` and ``{question}`` slots."""
        template = """
            Answer the Spanish language related question/questions in details, based ONLY on the following context:

            {context}

            The above context is from a "Language Transfer" course for learning Spanish. It is basically a sequence of interactions between a teacher and a student.
            Always supplement your answers with the source track/tracks number/numbers where the user can refer for more details if possible.

            Question: {question}
        """

        prompt = ChatPromptTemplate.from_template(template)

        return prompt

    def _define_response_chain(self):
        """Assemble ``self.chain`` from the retriever, prompt and chat model.

        Raises:
            RuntimeError: If the retriever has not been created yet.
        """
        if self.retriever is None:
            raise RuntimeError(
                "Retriever is not ready; call setup_spanish_master() first."
            )

        prompt = self._get_prompt()
        model = self.llm

        # The chain uses this instance's retriever; ``format_docs`` is the
        # module-level helper that joins the retrieved page contents.
        self.chain = (
            {"context": self.retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | model
            | StrOutputParser()
        )

    def setup_spanish_master(self):
        """Run the whole pipeline: load, split, index, retriever, chain."""
        self._load_document()
        self._get_splits()
        self._get_vectorstore()
        self._get_retriever()
        # The chain needs the retriever, so it is built last.
        self._define_response_chain()

    def get_response(self, question):
        """Stream the answer to ``question`` to stdout, chunk by chunk.

        Raises:
            RuntimeError: If ``setup_spanish_master()`` has not been called.
        """
        if self.chain is None:
            raise RuntimeError(
                "SpanishMaster is not set up; call setup_spanish_master() first."
            )

        for chunk in self.chain.stream(question):
            print(chunk, end="", flush=True)

In [122]:
def format_docs(docs, separator="\n\n"):
    """Concatenate the ``page_content`` of each document, joined by ``separator``."""
    contents = (doc.page_content for doc in docs)
    return separator.join(contents)

In [123]:
# Create the assistant with its default PDF path, vector-store folder,
# embedding model and chat model.
spanish_master = SpanishMaster()

In [119]:
# Load the PDF, split it into chunks, and prepare the vector store and retriever.
spanish_master.setup_spanish_master()

Loaded 511 pages!
Number of pages :  511
Number of splits :  1009
Loading vectorstore from folder!


In [127]:
# Ask a sample question; get_response prints the streamed answer chunk by chunk.
question = "Tell me the writing rules for spanish, with examples for each"
spanish_master.get_response(question)

The writing rules for Spanish are as follows:
1. If a word ends in a vowel, an n, or an s, the accent should be on the penultimate syllable (second last syllable). For example, "hablamos" and "importante" follow this rule.
2. If a word ends in any other consonant that is not n or s, the natural accent place is at the end of the word. For example, "comer" and "encontrar" follow this rule.
3. If a word breaks these rules, a written accent is used to show where the stress should be placed. For example, "tradición" breaks the rule and has a written accent on the last syllable.

(Source: Track 67)