In [1]:
pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25ldone
[?25h  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=9dc08ff091d950d9ef9b20eff58b3755d0168c5089a8d3771669647b1a5af146
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
%%time
! pip install  -U -qq langchain tiktoken pypdf faiss-gpu
! pip install  -U -qq InstructorEmbedding sentence_transformers

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.3 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.12.1 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.12.1 which is incompatible.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2023.12.1 which is incompatible.
dask-cuda 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.12.1 which is incompatible.
dask-cuda 23.8.0 requires pandas<1.6.0

In [3]:
import re
import os
import torch
import random
import numpy as np
import pandas as pd
from operator import itemgetter
import langchain
from langchain.schema import format_document
from langchain.schema.messages import get_buffer_string
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough,RunnableParallel
from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

  from tqdm.autonotebook import trange


In [4]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Python reads PYTHONHASHSEED at interpreter start-up, so exporting it here
    # only influences processes spawned after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe on CPU-only machines: the call is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run and may select different
    # algorithms each time, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

import locale
# Force UTF-8 as the reported preferred encoding for the rest of the session.
# The replacement keeps the optional ``do_setlocale`` argument of the real
# function so callers using ``locale.getpreferredencoding(False)`` keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [15]:
class ChatBot:
    """Conversational question answering over a folder of PDFs.

    The bot wraps a HuggingFace text-generation pipeline in a LangChain
    ``HuggingFacePipeline`` and chains two prompts: one that condenses the
    chat history plus the follow-up into a standalone question, and one that
    answers that question from context. When a FAISS index is available the
    context is the retriever's result for the standalone question; otherwise
    the standalone question itself is passed as context.
    """

    def __init__(self,
                 model,
                 tokenizer,
                 embeddings,
                 pdf_path = None,
                 chat_history = None,
                 max_len = 1000,
                 temperature = 0,
                 top_p = 0.95,
                 repetition_penalty = 1.15,
                 split_chunk_size = 800,
                 split_overlap = 0,
                 k = 3,
                 device = "cuda",
                 do_sample = True,
                 ## vector db
                 vector_db_save_path = "faiss_index_hp",
                 vector_db_load_path = None,
                 ## templates
                 answer_template = None,
                 condensed_question_template = None,
                 pipe = None,
                ):
        """Store the configuration and build the LLM wrapper.

        Args:
            model: Causal language model handed to the generation pipeline.
            tokenizer: Tokenizer for ``model``; its pad token is set to the
                EOS token when it has none.
            embeddings: Embedding object used to build/load the FAISS index.
            pdf_path: Directory scanned for ``*.pdf`` files, or None.
            chat_history: Initial list of chat messages; copied, so the
                caller's list is never mutated. Defaults to an empty history.
            max_len: ``max_length`` passed to the generation pipeline.
            temperature: Sampling temperature. Only applied when ``do_sample``
                is true and the value is greater than zero; otherwise
                decoding is greedy.
            top_p: Nucleus-sampling threshold, applied together with
                ``temperature``.
            repetition_penalty: Repetition penalty passed to the pipeline.
            split_chunk_size: Chunk size for the text splitter.
            split_overlap: Chunk overlap for the text splitter.
            k: Number of documents requested from the retriever.
            device: Stored on the instance; not otherwise used by this class.
            do_sample: Whether sampling is requested (see ``temperature``).
            vector_db_save_path: Where a newly built FAISS index is saved.
            vector_db_load_path: Existing FAISS index to load instead of
                building one from ``pdf_path``.
            answer_template: Prompt with ``{context}`` and ``{question}``
                placeholders; a default is used when None.
            condensed_question_template: Prompt with ``{chat_history}`` and
                ``{question}`` placeholders; a default is used when None.
            pipe: Pre-built pipeline to wrap instead of creating one.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.embeddings = embeddings
        self.pdf_path = pdf_path
        # Copy so neither a shared default nor the caller's list is aliased.
        self.chat_history = list(chat_history) if chat_history is not None else []
        self.max_len = max_len
        self.temperature = temperature
        self.top_p = top_p
        self.repetition_penalty = repetition_penalty
        self.split_chunk_size = split_chunk_size
        self.split_overlap = split_overlap
        self.k = k
        self.device = device
        self.do_sample = do_sample
        self.vector_db_save_path = vector_db_save_path
        self.vector_db_load_path = vector_db_load_path

        # templates: fall back to the defaults, but always store the result so
        # caller-supplied templates are honoured as well.
        if answer_template is None:
            answer_template = """Answer the question based only on the following context:
                                        {context}

                                        Question: {question}
                                    """
        self.answer_template = answer_template
        self.answer_prompt = ChatPromptTemplate.from_template(self.answer_template)

        if condensed_question_template is None:
            condensed_question_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

                                                Chat History:
                                                {chat_history}
                                                Follow Up Input: {question}
                                                Standalone question:
                                                """
        self.condensed_question_template = condensed_question_template
        self.condensed_question_prompt = PromptTemplate.from_template(self.condensed_question_template)

        ## llm pipeline
        self.llm = self._build_llm(pipe)

    def _build_llm(self, pipe = None):
        """Wrap ``pipe`` (or a freshly built pipeline) in a HuggingFacePipeline."""
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        if pipe is None:
            # Sampling with a non-positive temperature is invalid, so the
            # default temperature of 0 keeps decoding greedy.
            use_sampling = bool(self.do_sample) and self.temperature is not None and self.temperature > 0
            if use_sampling:
                generation_kwargs = {
                    "do_sample": True,
                    "temperature": self.temperature,
                    "top_p": self.top_p,
                }
            else:
                generation_kwargs = {"do_sample": False}

            pipe = pipeline(
                task = "text-generation",
                model = self.model,
                tokenizer = self.tokenizer,
                pad_token_id = self.tokenizer.eos_token_id,
                max_length = self.max_len,
                repetition_penalty = self.repetition_penalty,
                **generation_kwargs
            )

        return HuggingFacePipeline(pipeline = pipe)

    def load_pdf(self):
        """Load every ``*.pdf`` under ``pdf_path`` and split it into chunks.

        Sets ``self.documents`` (loaded documents) and ``self.texts`` (chunks).
        """
        loader = DirectoryLoader(
            self.pdf_path,
            glob="./*.pdf",
            loader_cls=PyPDFLoader,
            show_progress=True,
            use_multithreading=True
        )

        self.documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = self.split_chunk_size,
            chunk_overlap = self.split_overlap
        )

        self.texts = text_splitter.split_documents(self.documents)

    def create_vector_db(self):
        """Build a FAISS index from ``self.texts`` and save it to disk."""
        self.vectordb = FAISS.from_documents(
            documents = self.texts,
            embedding = self.embeddings
        )

        self.vectordb.save_local(self.vector_db_save_path)

    def load_vector_db(self):
        """Load the FAISS index stored at ``vector_db_load_path``."""
        self.vectordb = FAISS.load_local(
            self.vector_db_load_path,
            self.embeddings
        )

    def create_chat_qa_chain(self):
        """Prepare the vector store (if any) and assemble ``self.chain``.

        An existing index at ``vector_db_load_path`` takes precedence;
        otherwise an index is built from ``pdf_path`` when one is given.
        Without either, the chain runs with no retriever.
        """
        if self.vector_db_load_path:
            self.load_vector_db()
        elif self.pdf_path is not None:
            self.load_pdf()
            self.create_vector_db()
            # The index just built is already in memory; remember where it was
            # saved so later calls load it instead of re-reading the PDFs.
            self.vector_db_load_path = self.vector_db_save_path

        retriever = None
        if hasattr(self,"vectordb"):
            # search_type is an argument of as_retriever itself; search_kwargs
            # is reserved for arguments of the search call such as k.
            retriever = self.vectordb.as_retriever(
                search_type = "similarity",
                search_kwargs = {"k": self.k}
            )

        # Step 1: rewrite history + follow-up into a standalone question.
        _inputs = RunnableParallel(
                    standalone_question=RunnablePassthrough.assign(
                        chat_history=lambda x: get_buffer_string(x["chat_history"])
                    )
                    | self.condensed_question_prompt
                    | self.llm
                    | StrOutputParser(),
                )

        # Step 2: attach context for the standalone question.
        if retriever is None:
            _context = {
                "context": itemgetter("standalone_question"),
                "question": lambda x: x["standalone_question"],
            }
        else:
            _context = {
                "context": itemgetter("standalone_question") | retriever,
                "question": lambda x: x["standalone_question"],
            }

        # Step 3: answer from the context.
        self.chain = (
                _inputs
                | _context
                | self.answer_prompt
                | self.llm
                |StrOutputParser()
        )

    def clean_response(self,response):
        """Remove every ``Answer:`` marker and surrounding whitespace."""
        return response.replace("Answer:","").strip()

    def start_chat(self):
        """Run an interactive console chat until the user types ``bye``.

        The chain is (re)built on entry. After each turn the history is cut
        to its last two messages and the new question/answer pair is
        appended, so at most two exchanges are carried forward.
        """
        self.create_chat_qa_chain()

        while True:
            user_input = input("You:")
            if user_input.lower().strip() == "bye":
                break
            answer = self.process_user_input(user_input)
            print("AI:",answer)
            # keep the previous exchange, then add the one just completed
            self.chat_history = self.chat_history[-2:]
            self.chat_history.extend(
                [
                    HumanMessage(content=user_input),
                    AIMessage(content=answer)
                ]
            )
            print()

    @staticmethod
    def _message_to_dict(message):
        """Convert a chat message into a JSON-serialisable dict."""
        return {"type": message.type, "content": message.content}

    @staticmethod
    def _message_from_dict(record):
        """Rebuild a chat message from a dict written by ``_message_to_dict``.

        Raises:
            ValueError: If the stored message type is not human, ai or system.
        """
        message_classes = {
            "human": HumanMessage,
            "ai": AIMessage,
            "system": SystemMessage,
        }
        message_type = record.get("type")
        if message_type not in message_classes:
            raise ValueError(f"Unsupported chat message type: {message_type!r}")
        return message_classes[message_type](content=record.get("content", ""))

    def save(self, save_directory):
        """Save the model, tokenizer, ``pdf_path`` and chat history.

        The history is written to ``attributes.json`` as a list of
        ``{"type", "content"}`` dicts, because message objects themselves are
        not JSON serialisable.

        Args:
            save_directory: Target directory; created if it does not exist.
        """
        # Imported locally so the method does not depend on cell order.
        import json

        os.makedirs(save_directory, exist_ok=True)
        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

        attributes_to_save = {
            "pdf_path": self.pdf_path,
            "chat_history": [self._message_to_dict(message) for message in self.chat_history],
        }

        with open(os.path.join(save_directory, "attributes.json"), "w") as f:
            json.dump(attributes_to_save, f)

    def load(self, load_directory):
        """Restore the model, tokenizer, ``pdf_path`` and chat history.

        The LLM wrapper is rebuilt around the loaded model and any existing
        chain is discarded, so later answers come from the loaded weights.

        Args:
            load_directory: Directory previously written by ``save``.
        """
        # Imported locally so the method does not depend on cell order.
        import json

        self.model = AutoModelForCausalLM.from_pretrained(load_directory)
        self.tokenizer = AutoTokenizer.from_pretrained(load_directory)

        with open(os.path.join(load_directory, "attributes.json"), "r") as f:
            loaded_attributes = json.load(f)

        self.pdf_path = loaded_attributes.get("pdf_path", None)
        self.chat_history = [
            self._message_from_dict(record)
            for record in loaded_attributes.get("chat_history", [])
        ]

        # The old pipeline still references the previous model object.
        self.llm = self._build_llm()
        self.__dict__.pop("chain", None)

    def process_user_input(self, user_input):
        """Answer one question using the current chat history.

        The chain is built on first use if it does not exist yet. The chat
        history is read but not modified.

        Args:
            user_input: The user's question.

        Returns:
            The cleaned model response.
        """
        if not hasattr(self, "chain"):
            self.create_chat_qa_chain()

        model_input = {
            "question": user_input,
            "chat_history": self.chat_history,
        }
        response = self.chain.invoke(model_input)
        return self.clean_response(response)
    
            

In [16]:
# Central settings for the notebook: model identifiers, data location,
# FAISS index paths and the target device.
config = dict(
    seed=42,
    model_path="mistralai/Mistral-7B-Instruct-v0.1",
    embeddings_model_path="sentence-transformers/all-MiniLM-L6-v2",
    pdf_path="/kaggle/input/pdfsubmission",
    vector_db_save_path="faiss_index_hp",
    vector_db_load_path="faiss_index_hp",
    device="cuda",
)

In [7]:
# Language model: half-precision weights, device placement chosen automatically.
model = AutoModelForCausalLM.from_pretrained(
    config["model_path"],
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map='auto',
)

# Tokenizer that matches the language model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(config["model_path"])

# Embedding model, placed on the configured device.
# NOTE(review): the configured checkpoint is a sentence-transformers model while the
# wrapper is the Instructor one - confirm this pairing is intended.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs={"device": config["device"]},
    model_name=config["embeddings_model_path"],
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [17]:
# Create the bot from the loaded model, tokenizer and embeddings; documents
# come from the PDF directory named in the config.
chat_bot = ChatBot(
    model=model,
    tokenizer=tokenizer,
    embeddings=embeddings,
    pdf_path=config["pdf_path"],
)

In [9]:
import json  # bound at notebook level so `json` is available as a global name

# Write the model, tokenizer and bot attributes to the working directory.
chat_bot.save(save_directory="/kaggle/working/save")

In [28]:
# Interactive session: builds the QA chain, then reads prompts with input()
# until the user types "bye".
chat_bot.start_chat()

100%|██████████| 4/4 [01:01<00:00, 15.40s/it]


You: financial disclosure document of bigb coffee


AI: The financial disclosure document for BigB Coffee is included in the Franchise Disclosure Document (FDD). Specifically, items 21 and 20 in the FDD include financial statements that can help determine the stability, growth, and competition in the franchise system. Additionally, item 12 describes the territory provisions in the franchise agreement that outline whether the franchisor and other franchisees can compete with each other.



You: Wahlburgers Franchising LLC


AI: The financial disclosure requirements for Wahlburgers Franchising LLC include providing Item 19 which gives information about outlet sales, costs, profits or losses. Additionally, it is recommended to obtain this information from others, such as current and former franchisees whose names and contact information can be found in Item 20 or Exhibit E.



You: For Bloomin Blinds


AI: The financial disclosure requirements for Blooming Blinds include providing financial statements (Exhibit E) and a list of all assets owned by the company (Exhibit A).



You: in detail


AI: The specific financial disclosure requirements for Blooming Blinds can be found in Exhibit A, which lists all assets of the debtor, both real and personal, tangible and intangible, existing or hereafter acquired. Additionally, financial statements can be found in Exhibit E.



You: whats tangible and intangible assets??


AI: Tangible assets include physical items such as vehicles, computer and office equipment, furniture and fixtures, and leasehold improvements. Intangible assets include non-physical items such as intellectual property, supporting obligations, contract rights, insurance claims, tort claims, and general intangibles.



You: okay any speciality about financial disclosure of amazing athelets, any special difference




AI: No, there are no specific financial disclosure requirements for professional athletes.



You: i am asking for Amazing Athletes Franchise Systems, LLC




AI: The financial disclosure requirement for Amazing Athletes Franchise Systems, LLC is that they must provide audited financial statements to the prospective franchisee 14 calendar days before signing a binding agreement with or making a payment to the franchisor or an affiliate in connection with the proposed franchise sale. Additionally, they must provide these financial statements as Exhibit E attached to the disclosure document.



You: now describe in detail about atomic wings




AI: Atomic Wings is a restaurant chain that has franchises in various locations. The document provided contains an agreement between Atomic Wings Franchisor Inc. and an area representative for one of their franchises.



You: in detail




AI: Based on the provided documents, Atomic Wings Franchisor Inc.'s agreement with an area representative for one of their franchises contains information about the franchisor's legal history, financial stability, growth potential, competition from other franchisees, and any restrictions on the territory they can operate in.



You: what about the THE MICHIGAN FRANCHISE INVESTMENT LAW




AI: According to the information provided in the PDF document, Michigan requires that Atomic Wings Franchisor Inc. provide a disclosure document to the area representative at least 10 business days before the execution of any binding franchise or other agreement or the payment of any consideration, whichever occurs first. If the disclosure document is not delivered on time or contains false or misleading statements or material omissions, a violation of federal and state law may have occurred and should be reported to the Federal Trade Commission and the state authority listed on Exhibit E. Additionally, the franchise agreement must be retained by the area representative, along with any other materials related to operating the franchised business, but no copies or records of these materials can be kept beyond what is necessary for compliance with any provision of law.



You: in a difference of all of the companies can you list out the differences in detail of their financial disclosure




AI: Atomic Wings Franchisor Inc. provides financial disclosures related to their actual or potential financial performance of their franchised and/or franchisor-owned outlets. They also provide an audit report which provides a reasonable basis for their opinion on the accuracy of their financial statements. The Burger Joint and The Pizza Place do not provide such detailed financial disclosures.



You: okay thank you so much'




AI: Atomic Wings Franchisor Inc. provides financial disclosures related to their actual or potential financial performance of their franchised and/or franchisor-owned outlets. They also provide an audit report that presents fairly, in all material respects, the financial position of Atomic Wings Franchisor Inc. as of December 31, 2018, and their ability to provide support to their business.
                                        
                                        On the other hand, The Burger Joint and The Pizza Place do not provide any financial disclosures related to their actual or potential financial performance of their franchised and/or franchisor-owned outlets. However, they both provide information about the stability, growth, and competition in their respective franchise systems.



You: bye


In [12]:
# Another interactive session; start_chat() rebuilds the QA chain on entry
# and exits when the user types "bye".
chat_bot.start_chat()

100%|██████████| 3/3 [00:05<00:00,  1.88s/it]


You: rcession on world and indian economy


AI: Based on the provided documents, it seems that there is concern about a potential recession affecting not only the United States but also the global economy. Additionally, India's economy is facing challenges such as rising unemployment despite recent growth. However, it is important to note that the analysis presented in the documents may not necessarily reflect the current state of the world and Indian economy.



You: fastest growing economic nation


AI: Some of the fastest-growing economies in the world include India, China, Germany, USA, Japan, and others. According to a report by the World Bank, India's share in world output is projected to jump from 5% as of today to 20.8% by 2040.



You: what about inflation


AI: The text does not provide information about the current inflation rate in India.



You: of the world


AI: Some of the fastest-growing economies in the world include India, China, Germany, USA, Japan, and others. According to a report by the World Bank, India's share in world output is projected to jump from 5% as of today to 20.8% by 2040.



You: inflation around the world


AI: The inflation rate for goods purchased online in the US slowed sharply last month while food prices increased at the fastest pace on record, according to the Adobe Digital Price Index. Meanwhile, European Commission forecasts inflation at 7.6% in 2022 and 2023 with a peak in Q3 2022 and a drop below 3% in Q4 2023. The German economy had strong H1 despite the Ukraine war and Russian gas deliveries uncertainty clouding the H2 outlook. Turkish industrial production was estimated at 0.3% mm and 80% yy in May. Swiss producer and import prices were estimated at 69% mm and 69% yy in June. The UK inflation came in hot, surging to a 40-year high of 94%.



You: some developments in big mncs




AI: Big multinational corporations are collaborating with other companies, making changes to their internal structures and processes, expanding into new markets, and investing heavily in technology and innovation. Some examples include Akzo Nobel's decision to prioritize home improvements, Barclays' aim to increase its presence in Germany, and Samsung's planned $200 billion investment in Austin. Additionally, there has been a shift towards sustainable practices and green initiatives among many companies, such as the Japanese company Loral Japan's collaboration with its Green Science Project.



You: okay thank you




AI: Big multinational corporations are collaborating with other companies, making changes to their internal structures and processes, expanding into new markets, and investing heavily in technology and innovation. Some examples include Akzo Nobel's decision to prioritize home improvements, Barclays' aim to increase its presence in Germany, and Samsung's planned $200 billion investment in Austin. Additionally, there has been a shift towards sustainable practices and green initiatives among many companies, such as the Japanese company Loral Japan's collaboration with its Green Science Project.



You: bye
