In [1]:
%pip install "giskard[llm]>=2.0.0b" --upgrade

Collecting giskard[llm]>=2.0.0b
  Obtaining dependency information for giskard[llm]>=2.0.0b from https://files.pythonhosted.org/packages/64/d8/388e4b720f54418e86255c14dc41e1df19461764b35984c3c017b49d54c9/giskard-2.0.2-py3-none-any.whl.metadata
  Downloading giskard-2.0.2-py3-none-any.whl.metadata (13 kB)
Collecting cloudpickle>=1.1.1 (from giskard[llm]>=2.0.0b)
  Obtaining dependency information for cloudpickle>=1.1.1 from https://files.pythonhosted.org/packages/96/43/dae06432d0c4b1dc9e9149ad37b4ca8384cf6eb7700cd9215b177b914f0a/cloudpickle-3.0.0-py3-none-any.whl.metadata
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting zstandard>=0.10.0 (from giskard[llm]>=2.0.0b)
  Obtaining dependency information for zstandard>=0.10.0 from https://files.pythonhosted.org/packages/54/fc/c1b1a1e140451f3362789f546731b3ef36c78668be19d7fc6fbd4326b535/zstandard-0.22.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading zstandard-0.22.0-cp311-cp311-macosx_11_0_arm64.whl.meta

In [2]:
%pip install langchain pypdf faiss-cpu openai==0.27.0 tiktoken

Collecting pypdf
  Obtaining dependency information for pypdf from https://files.pythonhosted.org/packages/74/a9/5ccde1312650dd03e65350224fea85d9a430c182a01f056599cbb76f7390/pypdf-3.17.0-py3-none-any.whl.metadata
  Using cached pypdf-3.17.0-py3-none-any.whl.metadata (7.5 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.7.4-cp311-cp311-macosx_11_0_arm64.whl (2.7 MB)
Collecting openai==0.27.0
  Downloading openai-0.27.0-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting tiktoken
  Obtaining dependency information for tiktoken from https://files.pythonhosted.org/packages/fb/2a/3d02ef030f387c373acbeca6d5a2307405a1da735285ec12a9ed0b6302ea/tiktoken-0.5.1-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Using cached tiktoken-0.5.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Using cached pypdf-3.17.0-py3-none-any.whl (277 kB)
Using cached tiktoken-0.5.1-cp311-

In [15]:
import os
from pathlib import Path
import openai
import pandas as pd
from langchain.llms import openai
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains.base import Chain
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA, load_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

from giskard import Dataset, Model, scan, GiskardClient

In [4]:
# Never hard-code the API key in the notebook: take it from the environment
# and, only if it is missing, prompt for it without echoing the input.
# NOTE(review): a key literal used to live in this cell; treat it as leaked and revoke it.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [5]:
# Show full cell contents (no truncation) when rendering text columns.
pd.options.display.max_colwidth = None

In [6]:
# PDF of the IPCC AR6 Synthesis Report ("Longer Report"); this is the document
# that is loaded and split into chunks to build the retrieval context.
URL = "https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_LongerReport.pdf"

In [7]:
# OpenAI model identifier passed to the LLM wrapper through `model_name`.
LLM_NAME = "gpt-3.5-turbo"

In [8]:
# Presumably the name of the dataframe column that carries the user's question
# (TODO confirm against the cells that build the Giskard dataset).
TEXT_COLUMN_QUERY = "query"

In [11]:
# Prompt for the climate QA chain. `{context}` and `{question}` are the two
# placeholders filled in at run time; they must match the `input_variables`
# given to `PromptTemplate`.
PROMPT_TEMPLATE = """You are the climate assistant, a helpful assistant made by Giskard.
Your task is to answer common questions on climate change.
You will be given a question and relevant excerpts from the IPCC Climate Change Synthesis Report (2023).
Please provide short and clear answers based on the provided context. Be polite and helpful.

Context:
{context}

Question:
{question}

Your answer:
"""

In [14]:
def get_context_storage() -> FAISS:
    """Build a FAISS vector store from the PDF located at ``URL``.

    The document is loaded with ``PyPDFLoader`` and split with a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=100``, start index recorded per chunk); the resulting
    chunks are embedded with ``OpenAIEmbeddings``.

    Returns:
        The FAISS store created from the embedded chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        add_start_index=True,
    )
    chunks = PyPDFLoader(URL).load_and_split(splitter)
    return FAISS.from_documents(chunks, OpenAIEmbeddings())

In [18]:
# `gpt-3.5-turbo` is a chat model, so build it with the chat wrapper; the
# completion-style `OpenAI` class only warns and falls back to a deprecated shim.
llm = ChatOpenAI(temperature=0.1, model_name=LLM_NAME)



In [16]:
# Wrap the raw template; it is filled from the `context` and `question` variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=PROMPT_TEMPLATE,
)

In [20]:
def get_context_storage() -> FAISS:
    """Initialize a vector storage of embedded IPCC report chunks (context).

    Loads the PDF at ``URL``, splits it with a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=100``, start index recorded per chunk) and embeds the
    chunks with ``OpenAIEmbeddings``.

    Returns:
        The FAISS store created from the embedded chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, add_start_index=True)
    # `URL` is the report-location constant this notebook defines; the name
    # `IPCC_REPORT_URL` used here before does not exist and raised NameError.
    docs = PyPDFLoader(URL).load_and_split(text_splitter)
    db = FAISS.from_documents(docs, OpenAIEmbeddings())
    return db


# Assemble the retrieval QA chain from the LLM, the prompt and a retriever
# backed by the FAISS context storage.
llm = OpenAI(temperature=0)
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=PROMPT_TEMPLATE,
)
climate_qa_chain = RetrievalQA.from_llm(
    llm=llm,
    prompt=prompt,
    retriever=get_context_storage().as_retriever(),
)

# Smoke-test the chain with a sample question; the cell displays the result.
climate_qa_chain("Is sea level rise avoidable? When will it stop?")

In [None]:
# Define a custom Giskard model wrapper for the serialization.
class FAISSRAGModel(Model):
    """Giskard ``Model`` wrapping a retrieval QA chain whose retriever uses FAISS.

    Saving and loading are overridden so that both the chain definition and
    the FAISS vector store are persisted.
    """

    def model_predict(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the wrapped chain on every question in ``df``.

        Args:
            df: Input frame; questions are read from the column named by
                ``TEXT_COLUMN_QUERY``.

        Returns:
            The result of applying the chain's ``run`` to each question.
        """
        # Use the column-name constant the notebook actually defines
        # (`TEXT_COLUMN_NAME` is not defined anywhere and raised NameError).
        return df[TEXT_COLUMN_QUERY].apply(lambda x: self.model.run({"query": x}))

    def save_model(self, path: str):
        """Persist the chain and its FAISS store under the directory ``path``.

        Args:
            path: Destination directory; receives ``model.json`` and ``faiss``.
        """
        out_dest = Path(path)
        # Save the chain object
        self.model.save(out_dest.joinpath("model.json"))

        # Save the FAISS-based retriever
        db = self.model.retriever.vectorstore
        db.save_local(out_dest.joinpath("faiss"))

    @classmethod
    def load_model(cls, path: str) -> Chain:
        """Rebuild the chain from the files written by ``save_model``.

        Args:
            path: Directory containing ``model.json`` and ``faiss``.

        Returns:
            The loaded chain, wired to a retriever over the loaded FAISS store.
        """
        src = Path(path)

        # Load the FAISS-based retriever
        db = FAISS.load_local(src.joinpath("faiss"), OpenAIEmbeddings())

        # Load the chain, passing the retriever
        chain = load_chain(src.joinpath("model.json"), retriever=db.as_retriever())
        return chain


# Wrap the QA chain
# The feature/column name comes from `TEXT_COLUMN_QUERY`, the constant this
# notebook defines (`TEXT_COLUMN_NAME` does not exist and raised NameError).
giskard_model = FAISSRAGModel(
    model=climate_qa_chain,  # A prediction function that encapsulates all the data pre-processing steps and that could be executed with the dataset used by the scan.
    model_type="text_generation",  # Either regression, classification or text_generation.
    name="Climate Change Question Answering",  # Optional.
    description="This model answers any question about climate change based on IPCC reports",  # Is used to generate prompts during the scan.
    feature_names=[TEXT_COLUMN_QUERY]  # Default: all columns of your dataset.
)

# Optional: Wrap a dataframe of sample input prompts to validate the model wrapping and to narrow specific tests' queries.
giskard_dataset = Dataset(pd.DataFrame({
    TEXT_COLUMN_QUERY: [
        "According to the IPCC report, what are key risks in the Europe?",
        "Is sea level rise avoidable? When will it stop?"
    ]
}))
     

In [None]:
# Sanity check: run the wrapped model on the sample dataset and print its answers.
sample_predictions = giskard_model.predict(giskard_dataset).prediction
print(sample_predictions)

In [None]:
# Scan the wrapped model on the sample dataset, limited to "hallucination".
results = scan(
    giskard_model,
    giskard_dataset,
    only="hallucination",
)

In [None]:
# Render the scan results as the output of this cell.
display(results)

In [None]:

# Turn the scan results into a test suite, then execute it; the run result is
# the cell's output.
suite_name = "Test suite generated by scan"
test_suite = results.generate_test_suite(suite_name)
test_suite.run()
     