# RAG GenAI demo using Mistral 7b (local LLM)
Running this notebook requires a fairly large instance: more than 40 GB of RAM when running on CPU.
We used an ml.m5.4xlarge (no GPU) for this demo, which is slow but functional.

### Tutorial Outline
0. `pip install` and import relevant dependencies.
1. Download a public-facing pdf 
2. Split the document into chunks
3. Index the embeddings with FAISS and `all-MiniLM-l6-v2`.
4. Initialize our question, prompt and context (from FAISS similarity search)
5. Load the LLM (Llama 2 7B in this run; Mistral 7B Instruct and other models appear as commented-out alternatives)
6. Generate the output of the prompt and display

# 0. `pip install` and import relevant dependencies

In [2]:
pip install faiss-cpu PyPDF2 langchain sentence-transformers transformers accelerate bitsandbytes

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from getpass import getpass

# Hugging Face access token, passed to `from_pretrained` when the tokenizer and
# model are loaded. It is read from the HF_TOKEN environment variable, falling
# back to an interactive prompt, so the secret is never stored in the notebook.
# The token that used to be hard-coded here must be treated as leaked and revoked.
access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")


In [4]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from IPython.display import display, Markdown

In [5]:

#import torch
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.agents import create_sql_agent 
from langchain.agents.agent_toolkits import SQLDatabaseToolkit 
from langchain.sql_database import SQLDatabase 
from langchain.agents import AgentExecutor 
from langchain.agents.agent_types import AgentType
from time import time

In [6]:
# Hand-written description of the `artists` and `artworks` tables, keyed by table
# name. Each value is the table's CREATE TABLE statement followed by three sample
# rows (tab-separated, inside a SQL block comment). The dict is passed to
# `SQLDatabase.from_uri` as `custom_table_info` when `db_sql` is created.
custom_table_info = {
    "artists": """CREATE TABLE artists (
        artist_id integer NOT NULL,
        name character varying(200),
        nationality character varying(50),
        gender character varying(25),
        birth_year integer,
        death_year integer,
        CONSTRAINT artists_pk PRIMARY KEY (artist_id))

/*
3 rows from artists table:
"artist_id"	"name"	"nationality"	"gender"	"birth_year"	"death_year"
12	"Jüri Arrak"	"Estonian"	"Male"	1936	
19	"Richard Artschwager"	"American"	"Male"	1923	2013
22	"Isidora Aschheim"	"Israeli"	"Female"		
*/""",
    "artworks": """CREATE TABLE artworks (
        artwork_id integer NOT NULL,
        title character varying(500),
        artist_id integer NOT NULL,
        name character varying(500),
        date integer,
        medium character varying(250),
        dimensions text,
        acquisition_date text,
        credit text,
        catalogue character varying(250),
        department character varying(250),
        classification character varying(250),
        object_number text,
        diameter_cm text,
        circumference_cm text,
        height_cm text,
        length_cm text,
        width_cm text,
        depth_cm text,
        weight_kg text,
        durations integer,
        CONSTRAINT artworks_pk PRIMARY KEY (artwork_id))

/*
3 rows from artworks table:
"artwork_id"	"title"	"artist_id"	"name"	"date"	"medium"	"dimensions"	"acquisition_date"	"credit"	"catalogue"	"department"	"classification"	"object_number"	"diameter_cm"	"circumference_cm"	"height_cm"	"length_cm"	"width_cm"	"depth_cm"	"weight_kg"	"durations"
102312	"Watching the Game"	2422	"John Gutmann"	1934	"Gelatin silver print"	"9 3/4 x 6 7/16' (24.8 x 16.4 cm)"	"2006-05-11"	"Purchase"	"N"	"Photography"	"Photograph"	"397.2006"			"24.8"		"16.4"			
103321	"Untitled (page from Sump)"	25520	"Jerome Neuner"	1994	"Page with chromogenic color print and text"	"12 x 9 1/2' (30.5 x 24.1 cm)"	"2006-05-11"	"E.T. Harmax Foundation Fund"	"N"	"Photography"	"Photograph"	"415.2006.12"			"30.4801"		"24.13"			
10	"The Manhattan Transcripts Project, New York, New York, Episode 1: The Park"	7056	"Bernard Tschumi"		"Gelatin silver photograph"	"14 x 18' (35.6 x 45.7 cm)"	"1995-01-17"	"Purchase and partial gift of the architect in honor of Lily Auchincloss"	"Y"	"Architecture & Design"	"Architecture"	"3.1995.11"			"35.6"		"45.7"			
*/""",
}

In [7]:
from pathlib import Path

# Location of the SQLite database that the SQL chain queries.
sqlite_path = "../moma.db"

# SQLite creates a new, empty database when the target file does not exist, so a
# wrong path would only show up later as missing tables. Fail early instead.
if not Path(sqlite_path).is_file():
    raise FileNotFoundError(f"SQLite database not found: {Path(sqlite_path).resolve()}")

sqlite_uri = f"sqlite:///{sqlite_path}"
# `custom_table_info` supplies the hand-written schema text for the tables.
db_sql = SQLDatabase.from_uri(sqlite_uri, custom_table_info=custom_table_info)

# 1. Download a public-facing pdf

In [8]:
%%sh
# Download the PDF served at the URL below and save it as fannie-mf-commentary-oct-2023.pdf.
# NOTE(review): the chunking cell reads '../defaultRAG_schema3.pdf' instead; the line
# that would select this downloaded file is commented out there.
wget -O fannie-mf-commentary-oct-2023.pdf https://www.fanniemae.com/media/49331/display

--2024-04-02 13:02:18--  https://www.fanniemae.com/media/49331/display
Resolving www.fanniemae.com (www.fanniemae.com)... 104.18.27.25, 104.18.26.25, 2606:4700::6812:1b19, ...
Connecting to www.fanniemae.com (www.fanniemae.com)|104.18.27.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 249442 (244K) [application/pdf]
Saving to: ‘fannie-mf-commentary-oct-2023.pdf’

     0K .......... .......... .......... .......... .......... 20% 28.9M 0s
    50K .......... .......... .......... .......... .......... 41% 68.8M 0s
   100K .......... .......... .......... .......... .......... 61% 49.0M 0s
   150K .......... .......... .......... .......... .......... 82%  105M 0s
   200K .......... .......... .......... .......... ...       100%  121M=0.004s

2024-04-02 13:02:18 (56.5 MB/s) - ‘fannie-mf-commentary-oct-2023.pdf’ saved [249442/249442]



# 2. Split the document into chunks

In [9]:
# Read the PDF page by page and cut every non-empty page into chunks. Each chunk
# is tagged with the source file name and its 1-based page number.
# (The Fannie Mae PDF 'fannie-mf-commentary-oct-2023.pdf' is an alternative input.)
filename = '../defaultRAG_schema3.pdf'
reader = PdfReader(filename)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=10)

docs = []
for idx, page in enumerate(reader.pages):
    text = page.extract_text()
    if len(text) == 0:
        # nothing could be extracted from this page
        continue
    page_chunks = text_splitter.create_documents(
        texts=[text],
        metadatas=[{'filename': filename, 'page': idx + 1}],
    )
    docs += page_chunks

# number of chunks produced
len(docs)

2

In [10]:
# Display the chunked documents (page_content plus filename/page metadata).
docs

[Document(page_content='CREATE TABLE artists (\n        artist_id integer NOT NULL,\n        name character varying(200),\n        nationality character varying(50),\n        gender character varying(25),\n        birth_year integer,\n        death_year integer,\n        CONSTRAINT artists_pk PRIMARY KEY (artist_id))\n/*\n3 rows from artists table:\n"artist_id""name""nationality""gender""birth_year""death_year"\n12"Jüri Arrak""Estonian""Male"1936\n19"Richard Artschwager" "American""Male"19232013\n22"Isidora Aschheim" "Israeli""Female"\n*/"""', metadata={'filename': '../defaultRAG_schema3.pdf', 'page': 1}),
 Document(page_content='CREATE TABLE artworks (\n        artwork_id integer NOT NULL,\n        title character varying(500),\n        artist_id integer NOT NULL,\n        name character varying(500),\n        date integer,\n        medium character varying(250),\n        dimensions text,\n        acquisition_date text,\n        credit text,\n        catalogue character varying(250),\

# 3. Index the embeddings with FAISS and `all-MiniLM-l6-v2`

In [11]:
# Embedding model used for the vector index: runs on CPU, and the embeddings are
# not normalised.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-l6-v2',
    model_kwargs=dict(device='cpu'),
    encode_kwargs=dict(normalize_embeddings=False),
)

# Build the FAISS index from the document chunks using that embedding model.
db_faiss = FAISS.from_documents(docs, embeddings)

# 4. Initialize our question, prompt, and context (from FAISS similarity search)

In [12]:
# Question used for the FAISS similarity search in the next cells.
# The two example questions kept below refer to the Fannie Mae PDF downloaded in
# section 1, not to the artists/artworks schema that is indexed in this run:
#   "what is the multifamily market forecast for 2024?"
#   "what concerns are there going forward for the apartment industry?"
QUESTION_01= 'How many artists are there'#input()

In [13]:
# Look up the chunks most similar to QUESTION_01 in the FAISS index.
search_docs = db_faiss.similarity_search(QUESTION_01)
# keep only the first result returned (index 0), despite the plural variable name
top_search_docs = search_docs[0]
top_search_docs

Document(page_content='CREATE TABLE artists (\n        artist_id integer NOT NULL,\n        name character varying(200),\n        nationality character varying(50),\n        gender character varying(25),\n        birth_year integer,\n        death_year integer,\n        CONSTRAINT artists_pk PRIMARY KEY (artist_id))\n/*\n3 rows from artists table:\n"artist_id""name""nationality""gender""birth_year""death_year"\n12"Jüri Arrak""Estonian""Male"1936\n19"Richard Artschwager" "American""Male"19232013\n22"Isidora Aschheim" "Israeli""Female"\n*/"""', metadata={'filename': '../defaultRAG_schema3.pdf', 'page': 1})

In [14]:
# Same lookup for the artworks question; only the first result is kept.
# NOTE(review): the recorded output of this cell shows the *artists* schema chunk
# as the first hit for this artworks question — check the retrieval quality
# before relying on it.
QUESTION_02 = 'How many artworks are there'#input()
search_docs = db_faiss.similarity_search(QUESTION_02)

top_search_docs = search_docs[0]
top_search_docs

Document(page_content='CREATE TABLE artists (\n        artist_id integer NOT NULL,\n        name character varying(200),\n        nationality character varying(50),\n        gender character varying(25),\n        birth_year integer,\n        death_year integer,\n        CONSTRAINT artists_pk PRIMARY KEY (artist_id))\n/*\n3 rows from artists table:\n"artist_id""name""nationality""gender""birth_year""death_year"\n12"Jüri Arrak""Estonian""Male"1936\n19"Richard Artschwager" "American""Male"19232013\n22"Isidora Aschheim" "Israeli""Female"\n*/"""', metadata={'filename': '../defaultRAG_schema3.pdf', 'page': 1})

In [15]:
# Same search for QUESTION_02, but every hit comes back as a (Document, score) pair.
# NOTE(review): presumably the score is a distance (lower = more similar) — confirm
# against the FAISS vector store documentation.
docs_and_scores = db_faiss.similarity_search_with_score(QUESTION_02)
docs_and_scores

[(Document(page_content='CREATE TABLE artists (\n        artist_id integer NOT NULL,\n        name character varying(200),\n        nationality character varying(50),\n        gender character varying(25),\n        birth_year integer,\n        death_year integer,\n        CONSTRAINT artists_pk PRIMARY KEY (artist_id))\n/*\n3 rows from artists table:\n"artist_id""name""nationality""gender""birth_year""death_year"\n12"Jüri Arrak""Estonian""Male"1936\n19"Richard Artschwager" "American""Male"19232013\n22"Isidora Aschheim" "Israeli""Female"\n*/"""', metadata={'filename': '../defaultRAG_schema3.pdf', 'page': 1}),
  1.1001904),
 (Document(page_content='CREATE TABLE artworks (\n        artwork_id integer NOT NULL,\n        title character varying(500),\n        artist_id integer NOT NULL,\n        name character varying(500),\n        date integer,\n        medium character varying(250),\n        dimensions text,\n        acquisition_date text,\n        credit text,\n        catalogue characte

In [16]:
from langchain.prompts.prompt import PromptTemplate

# Prompt text for SQL generation. It has three placeholders: {dialect},
# {table_info} and {input}. It asks the model to answer in a
# Question / SQLQuery / SQLResult layout; the query cells at the end of the
# notebook extract the text between "SQLQuery:" and "SQLResult".
# NOTE(review): the wording names SQLite explicitly even though the dialect is a
# placeholder — confirm this is intended.
_DEFAULT_TEMPLATE = """Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.\n Unless the user specifies in the question a specific number of examples to obtain, query for at most 5 results using the LIMIT clause as per SQLite. You can order the results to return the most informative data in the database.\nNever query for all columns from a table. You must query only the columns that are needed to answer the question. Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.\nPay attention to use date('now') function to get the current date, if the question involves \"today\".
Use the following format:

Question: "Question here"
SQLQuery: "SQL Query to run"
SQLResult: "Result of the SQLQuery"

Only use the following tables:
{table_info}

If someone asks for the art table, they really mean the artworks table.

Only single quotation marks, not double quotation marks in the SQL statement (SQLQuery). Never use " in SQL statement (SQLQuery).

Question: {input}"""
# Wrap the text in a PromptTemplate that declares the three input variables.
PROMPT = PromptTemplate(
    input_variables=["input", "table_info", "dialect"], template=_DEFAULT_TEMPLATE
)

# 5. Load the LLM (llama 7b vs others)

In [17]:
#pip install accelerate

In [18]:
# Checkpoint used for SQL generation. Alternative ids kept for reference:
# 'google/gemma-7b', 'meta-llama/Llama-2-7b', 'llama-2-7b-hf/7B'
model_name = 'meta-llama/Llama-2-7b-hf'

# `token` is the current name of the authentication argument; the previously used
# `use_auth_token` is deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
# NOTE(review): the pipeline cell further down is given `model_name` (the id
# string), not this `model` object — confirm whether loading it here is needed.
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# NOTE(review): this block carries no `In [..]` execution marker, unlike the cells
# around it. If it is executed it rebinds `model_name`, `tokenizer` and `model`;
# the pipeline cell further down reads `model_name` and `tokenizer`. Confirm
# whether it is meant to run.
#model_name = 'microsoft/DialoGPT-small'
#model_name = #'mistralai/Mistral-7B-Instruct-v0.1'
model_name = 'jumpstart-dft-meta-textgeneration-llama-2-7b'

#'google/gemma-7b'#'meta-llama/Llama-2-7b'#'meta-llama/Llama-2-7b-hf' #'llama-2-7b-hf/7B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
#config = AutoConfig(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [19]:
# End-of-sequence token id of the loaded tokenizer (later passed to the pipeline).
tokenizer.eos_token_id

2

In [20]:
# End-of-sequence token of the loaded tokenizer, as text.
tokenizer.eos_token

'</s>'

In [21]:
# Special tokens the tokenizer defines (bos / eos / unk in the recorded output).
tokenizer.special_tokens_map

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}

In [22]:
#inputs = tokenizer.encode(QUESTION_01,return_tensors="pt")
#outputs = model.generate(inputs)
#print(tokenizer.decode(outputs[0]))

In [23]:
!pip install torch==2.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
import torch

In [25]:
from transformers import StoppingCriteria, StoppingCriteriaList
from torch import cuda, LongTensor, FloatTensor

def create_stopping_criteria(stop_words, tokenizer, device):
    """Build a ``StoppingCriteriaList`` that ends generation at any of ``stop_words``.

    Args:
        stop_words: Iterable of strings. Generation stops as soon as the text
            decoded from the most recent tokens equals one of them.
        tokenizer: Tokenizer of the generating model. It is called to encode
            each stop word, and its ``decode`` method is used for the comparison.
        device: Device the encoded stop words are moved to (e.g. ``'cpu'``).

    Returns:
        A ``StoppingCriteriaList`` holding one criterion that covers all stop words.
    """

    class StoppingCriteriaSub(StoppingCriteria):
        """Signals a stop when the tail of the first sequence decodes to a stop word."""

        def __init__(self, stops=None, device=device, encounters=1):
            super().__init__()
            # `None` is used as default instead of a shared mutable list.
            stops = [] if stops is None else stops
            # `reshape(-1)` turns every stop into a 1-D id tensor, including
            # single-token stops that arrive as 0-d tensors.
            self.stops = [stop.reshape(-1).to(device) for stop in stops]
            # Decode each stop once here rather than on every generation step.
            self.stop_texts = [tokenizer.decode(stop) for stop in self.stops]
            # Accepted for backward compatibility; not used by the check.
            self.encounters = encounters

        def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs) -> bool:
            # Only the first sequence of the batch is inspected.
            generated = input_ids[0]
            for stop, stop_text in zip(self.stops, self.stop_texts):
                width = stop.shape[0]
                if width == 0 or generated.shape[0] < width:
                    continue
                # Compare as many trailing tokens as the stop word has, so that
                # stop words spanning several tokens can match as well.
                if tokenizer.decode(generated[-width:]) == stop_text:
                    return True
            return False

    # Encode every stop word without special tokens, as a flat id tensor.
    stop_word_ids = [
        tokenizer(stop_word, return_tensors="pt", add_special_tokens=False)["input_ids"].reshape(-1)
        for stop_word in stop_words
    ]

    return StoppingCriteriaList([StoppingCriteriaSub(stops=stop_word_ids)])

# Use the current CUDA device when one is available, otherwise the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Strings at which generation should stop.
stop_words_list = [
    "Question",
    "Question:",
    " Question:",
    ".\n\nQuestion:",
    "\n\nQuestion:",
    " \n\nQuestion:",
    " \nQuestion:",
    " \n\n",
]

# Build the stopping criteria only when a stop-word list is configured.
stopping_criteria5 = (
    None
    if stop_words_list is None
    else create_stopping_criteria(stop_words_list, tokenizer, device)
)
stopping_criteria5

[<__main__.create_stopping_criteria.<locals>.StoppingCriteriaSub at 0x7fb8d4322380>]

In [26]:
# Build the text-generation pipeline and report how long construction takes.
# NOTE(review): `model=model_name` hands over the checkpoint id string rather than
# the `model` object created earlier; the recorded output shows another
# "Loading checkpoint shards" bar here — presumably the weights are loaded a
# second time. Confirm.
time_1 = time()

# Sampling / length settings forwarded to the pipeline.
generation_settings = dict(
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.001,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=2048,
    min_length=10,
)

query_pipeline = pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    device_map="auto",
    stopping_criteria=stopping_criteria5,
    **generation_settings,
)

time_2 = time()
print(f"Prepare pipeline: {round(time_2-time_1, 3)} sec.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prepare pipeline: 2.963 sec.


In [27]:
%%time
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline; `llm_hfp` is
# the LLM handed to the toolkit and to the SQL chain below.
llm_hfp = HuggingFacePipeline(pipeline=query_pipeline)

CPU times: user 745 µs, sys: 1.43 ms, total: 2.18 ms
Wall time: 1.79 ms


In [28]:
# Natural-language questions that are run through the SQL chain below.
# QUESTION_01 and QUESTION_02 were already defined for the FAISS search earlier
# (without the trailing '?'); they are rebound here.
QUESTION_01 = "How many artists are there?"
QUESTION_02 = "How many artworks are there?"
QUESTION_03 = "How many artists have French Nationality"
QUESTION_04 = "How many artists have Spanish Nationality"

In [29]:
%time
#output = llm_hfp(QUESTION_01)
#print(output[0:30])

CPU times: user 4 µs, sys: 8 µs, total: 12 µs
Wall time: 24.3 µs


In [30]:
#len(output)

In [31]:
#output

In [32]:
%time
toolkit = SQLDatabaseToolkit(db=db_sql, 
                             llm=llm_hfp)

CPU times: user 6 µs, sys: 11 µs, total: 17 µs
Wall time: 32.7 µs


In [33]:
#!pip install langchain_experimental

In [34]:
from sqlalchemy.exc import ProgrammingError


In [35]:
from langchain.sql_database import SQLDatabase
from langchain_experimental.sql import  SQLDatabaseChain , vector_sql
from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain
from langchain_experimental.sql import SQLDatabaseSequentialChain

In [36]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_experimental.sql.vector_sql import VectorSQLOutputParser

# Output parser for the vector-SQL chain, built from the same embedding model
# that backs the FAISS index.
output_parser = VectorSQLOutputParser.from_embeddings(model=embeddings)

In [37]:
from langchain.chains import LLMChain
from langchain.utilities.sql_database import SQLDatabase
from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain
from langchain_experimental.sql.prompt import MYSCALE_PROMPT

# SQL chain: the LLM is prompted with PROMPT, the result is parsed by
# `output_parser`, and the chain is bound to the SQLite database `db_sql`.
# `return_sql=True` and `return_direct=True` are set, matching how the query
# cells below post-process the returned text themselves.
chain = VectorSQLDatabaseChain(
    database=db_sql,
    llm_chain=LLMChain(prompt=PROMPT, llm=llm_hfp),
    sql_cmd_parser=output_parser,
    top_k=10,
    return_direct=True,
    return_sql=True,
)

In [38]:
import re

# Run QUESTION_01 through the chain and keep the raw text it returns.
ans_run1 = chain.run(QUESTION_01)
# Replace newlines with spaces so the pattern below can match across what were
# separate lines.
ans_run_clean = ans_run1.replace('\n',' ')

# Every span between "SQLQuery:" and the next "SQLResult"; the last one is used.
ans = re.findall(r'SQLQuery\:(.*?)SQLResult', ans_run_clean)
if not ans:
    # Report a readable error instead of a bare IndexError from `ans[-1]`.
    raise ValueError(f"No 'SQLQuery: ... SQLResult' section found in model output: {ans_run1!r}")
print(ans[-1])
# Execute the extracted SQL against the database and show the rows.
print(db_sql.run(ans[-1]))

  warn_deprecated(


 SELECT COUNT(*) FROM artists 
[(31084,)]


In [39]:
import re

# Run QUESTION_02 through the chain and keep the raw text it returns.
ans_run2 = chain.run(QUESTION_02)
# Replace newlines with spaces so the pattern below can match across what were
# separate lines.
ans_run_clean = ans_run2.replace('\n',' ')

# Every span between "SQLQuery:" and the next "SQLResult"; the last one is used.
ans = re.findall(r'SQLQuery\:(.*?)SQLResult', ans_run_clean)
if not ans:
    # Report a readable error instead of a bare IndexError from `ans[-1]`.
    raise ValueError(f"No 'SQLQuery: ... SQLResult' section found in model output: {ans_run2!r}")
print(ans[-1])
# Execute the extracted SQL against the database and show the rows.
print(db_sql.run(ans[-1]))

 SELECT COUNT(*) FROM artworks 
[(303746,)]


In [40]:
import re

# Run QUESTION_03 through the chain and keep the raw text it returns.
ans_run3 = chain.run(QUESTION_03)
# Replace newlines with spaces so the pattern below can match across what were
# separate lines.
ans_run_clean = ans_run3.replace('\n',' ')

# Every span between "SQLQuery:" and the next "SQLResult"; the last one is used.
ans = re.findall(r'SQLQuery\:(.*?)SQLResult', ans_run_clean)
if not ans:
    # Report a readable error instead of a bare IndexError from `ans[-1]`.
    raise ValueError(f"No 'SQLQuery: ... SQLResult' section found in model output: {ans_run3!r}")
print(ans[-1])
# Execute the extracted SQL against the database and show the rows.
print(db_sql.run(ans[-1]))

 SELECT COUNT(*) FROM artists WHERE nationality = 'French' 
[(1720,)]


In [41]:
import re

# Run QUESTION_04 through the chain and keep the raw text it returns.
ans_run4 = chain.run(QUESTION_04)
# Replace newlines with spaces so the pattern below can match across what were
# separate lines.
ans_run_clean = ans_run4.replace('\n',' ')

# Every span between "SQLQuery:" and the next "SQLResult"; the last one is used.
ans = re.findall(r'SQLQuery\:(.*?)SQLResult', ans_run_clean)
if not ans:
    # Report a readable error instead of a bare IndexError from `ans[-1]`.
    raise ValueError(f"No 'SQLQuery: ... SQLResult' section found in model output: {ans_run4!r}")
print(ans[-1])
# Execute the extracted SQL against the database and show the rows.
print(db_sql.run(ans[-1]))

 SELECT COUNT(*) FROM artists WHERE nationality = 'Spanish' 
[(318,)]
