In [4]:
import os
from langchain_text_splitters.character import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_chroma import Chroma
import transformers 
from transformers import AutoProcessor, AutoModel, AutoTokenizer
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS, Chroma
# from langchain_chroma import Chroma
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.llms import GPT4All
# from gpt4all import GPT4All
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableMap
import torch
from typing import Union
import pathlib
from path import *

In [5]:
def read_txt(filepath):
    with open(filepath, 'rb') as f:
        content = f.read().decode("utf-8")
    return content

In [6]:
device_name = 'NVIDIA GeForce RTX 3050 Laptop GPU'

In [13]:
novel_name = "The Wild Duck.txt"
# models_path = "C:\\Users\\SOUMEN\\Projects\\Python\\chat-with\\models"
instruction_model_path = "C:\\Users\\SOUMEN\\Projects\\Python\\chat-with-novel\\models\\tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
# instruction_model_path = "models\\tinyllama-1.1b-chat-v1.0.Q8_0.gguf"

In [14]:
class Novel:
    """
    Python Class that reads a novel, creates a vector db of that novel and 
    provides an option to query through the novel
    """
    
    def __init__(self, novels_dir: Union[str, pathlib.Path], 
                 novel_name: str, 
                 instruction_model_path: Union[str, pathlib.Path],
                 embedding_model_name: str="all-MiniLM-L6-v2.gguf2.f16.gguf",
                 device: str='NVIDIA GeForce RTX 3050 Laptop GPU'):
        """
        Constructor

        Args:
            novels_dir: Path to the directory which contains the novel.
            novel_name: Name of the novel
            embedding_model_name: Name of GPT4All or custom model that will be used for embedding. Including ".gguf" file extension is optional but encouraged.
            instruction_model_path: Path to the GPT4ALL or custom model that will be used for querying.
            device: The processing unit on which the embedding model will run. 
                    It can be cpu, gpu or gpu device name. To know which gpu devices are available, call `GPT4All.list_gpus()` to check.
        """
        
        self.novels_dir = novels_dir
        self.name = novel_name
        self.embedding_model_name = embedding_model_name
        self.instruction_model_path = instruction_model_path
        self.device = device
        
        self.instruction_model = None
        
        
        self.novel_path = os.path.join(self.novels_dir, self.name)
        
        self.loader = TextLoader(self.novel_path)
        self.documents = self.loader.load()
        # self.text_splitter = RecursiveCharacterTextSplitter(
        #     # Set a really small chunk size, just to show.
        #     chunk_size=1500,
        #     chunk_overlap=200,
        #     length_function=len,
        #     is_separator_regex=False,
        # )
        self.text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
        
        self.docs = self.text_splitter.split_documents(self.documents)
        
        self.novel_db = self.create_novel_db()
    
        self.template = """Answer the question based only on the following context:
        {context}

        Question: {question}
        """
        self.prompt = ChatPromptTemplate.from_template(self.template)
        self.output_parser = StrOutputParser()
        
    def create_novel_db(self):
        
        embeddings = GPT4AllEmbeddings(
                model_name=self.embedding_model_name,
                gpt4all_kwargs={'allow_download': 'True'}
            )
        novel_db = Chroma.from_documents(self.docs, embeddings)
        
        return novel_db
    
    def query(self, query):
        
        if self.instruction_model == None:
            self.instruction_model = GPT4All(model=self.instruction_model_path, device=self.device)
            
        retriever = self.novel_db.as_retriever(search_kwargs={"k": 2})
        chain = RunnableMap({
            "context": lambda x: retriever.get_relevant_documents(x["question"]),
            "question": lambda x: x["question"]
        }) | self.prompt | self.instruction_model | self.output_parser
        
        result = chain.invoke({"question": query})
        
        return result

In [15]:
# create the novel object
novel = Novel(NOVELS_DIR, novel_name, instruction_model_path)

#### Ask a question

In [16]:
novel.query("What does the duck eat?")

  warn_deprecated(


'Answer: The wild duck eats fish and other aquatic animals, such as frogs, turtles, and snakes.'