In [1]:
import wikipediaapi
from langchain import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain.llms import Ollama
from langchain.chains import RetrievalQA

In [2]:
from pypdf import PdfReader 
  
# creating a pdf reader object 
reader = PdfReader('2024-dbir-data-breach-investigations-report (1).pdf') 
  

  
# creating a page object 
page = reader.pages[10] 
  
# extracting text from page 
data=page.extract_text()

In [3]:
data = data.replace('\n'," ")
data

'11  2024 DBIR Results and analysisResults   and analysis:  Introduction Hello, friends, and welcome to the “Results and analysis” section. This is where we  cover the highlights we found in the data this year. This dataset is collected from a  variety of sources, including our own VTRAC investigators, reports provided by our  data contributors and publicly disclosed security incidents.1 Because data contributors come and go, one of our priorities is to make sure  we can get broad representation on different types of security incidents and the  countries where they occur. This ebb and flow of contributors obviously influences  our dataset, and we will do our best to provide context on those potential biases  where applicable. This year we onboarded a good number of new contributors and reached an  exciting milestone of more than 10,000 breaches analyzed in a single edition.2   It is an enormous amount of work to organize and analyze, but it is also incredibly  gratifying to be able to 

In [4]:
# wiki_wiki = wikipediaapi.Wikipedia(
#     user_agent='Giskard application',
#     language='en',
#     extract_format=wikipediaapi.ExtractFormat.WIKI
# )
# p_wiki = wiki_wiki.page('Virat_Kohli')
# data = p_wiki.text

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, add_start_index=True)

In [6]:
docs = text_splitter.create_documents([data])

In [7]:
type(docs)

list

In [8]:
embeddings = OllamaEmbeddings(
    model='nomic-embed-text'
)

In [9]:
import chromadb
from langchain_chroma import Chroma
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

client = chromadb.PersistentClient(path="./chroma_db")

db = Chroma.from_documents(
    docs,
    embeddings,
    persist_directory="./chroma_db",
    
)

In [10]:
retriever = db.as_retriever()

In [11]:
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [12]:
PROMPT_TEMPLATE = """You are a cybersecurity and ai expert.
Your task is to answer common questions from the report with the given context.
You will be given a question and relevant excerpts from the report.
Provide short and clear answers.

Context:
{context}

Question:
{question}

Your answer:
"""

llm = ChatOllama(model="llama3",temperature=0)
prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=["question", "context"])
qa_chain = RetrievalQA.from_llm(llm=llm, retriever=retriever, prompt=prompt)

# Test that everything works
qa_chain.invoke({"query": "what is the name of the report"})

{'query': 'what is the name of the report',
 'result': 'Based on the provided context, I can infer that the report is the "2023 DBIR" (Data Breach Investigations Report).'}

In [13]:
import giskard
from openai import OpenAI
from giskard.llm.client.openai import OpenAIClient
from giskard.llm.client.mistral import MistralClient

# Setup the Ollama client with API key and base URL
_client = OpenAI(base_url="http://localhost:11434/v1/", api_key="ollama")
oc = OpenAIClient(model="llama3", client=_client)
giskard.llm.set_default_client(oc)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
import pandas as pd


def model_predict(df: pd.DataFrame):
    """Wraps the LLM call in a simple Python function.

    The function takes a pandas.DataFrame containing the input variables needed
    by your model, and must return a list of the outputs (one for each row).
    """
    return [qa_chain.invoke({"query": question}) for question in df["question"]]


# Don’t forget to fill the `name` and `description`: they are used by Giskard
# to generate domain-specific tests.
giskard_model = giskard.Model(
    model=model_predict,
    model_type="text_generation",
    name="Data breach report Question Answering",
    description="This model answers any question from the given page of the report",
    feature_names=["question"],
)

2024-05-31 07:47:53,547 pid:15328 MainThread giskard.models.automodel INFO     Your 'prediction_function' is successfully wrapped by Giskard's 'PredictionFunctionModel' wrapper class.


In [15]:
examples = ["What is the milestone achieved?","What is the percentage increase in increase in the exploitation of vulnerabilities?"]
giskard_dataset = giskard.Dataset(pd.DataFrame({"question": examples}), target=None)

print(giskard_model.predict(giskard_dataset).prediction)

2024-05-31 07:47:53,603 pid:15328 MainThread giskard.datasets.base INFO     Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
2024-05-31 07:47:53,624 pid:15328 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
2024-05-31 07:49:49,774 pid:15328 MainThread giskard.utils.logging_utils INFO     Predicted dataset with shape (2, 1) executed in 0:01:56.165062
[{'query': 'What is the milestone achieved?', 'result': 'According to excerpt 2, the milestone achieved is passing the cumulative 1 million incident milestone.'}
 {'query': 'What is the percentage increase in increase in the exploitation of vulnerabilities?', 'result': 'According to the report, there has been a 180% increase in the exploitation of vulnerabilities as the critical path action to initiate a breach.'}]


In [16]:
# df.columns = df.columns.str.replace(' ', '')

In [17]:
report = giskard.scan(giskard_model, giskard_dataset, only="hallucination")

🔎 Running scan…
Estimated calls to your model: ~30
Estimated LLM calls for evaluation: 22

2024-05-31 07:49:49,827 pid:15328 MainThread giskard.scanner.logger INFO     Running detectors: ['LLMBasicSycophancyDetector', 'LLMImplausibleOutputDetector']
Running detector LLMBasicSycophancyDetector…
2024-05-31 07:53:02,371 pid:15328 MainThread httpx        INFO     HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-31 07:53:02,380 pid:15328 MainThread giskard.scanner.logger ERROR    Detector LLMBasicSycophancyDetector failed with error: "None of [Index(['question'], dtype='object')] are in the [columns]"
Traceback (most recent call last):
  File "c:\Users\sathi\OneDrive\Desktop\chatbot\.venv\lib\site-packages\giskard\scanner\scanner.py", line 152, in _run_detectors
    detected_issues = detector.run(model, dataset, features=features)
  File "c:\Users\sathi\OneDrive\Desktop\chatbot\.venv\lib\site-packages\giskard\scanner\llm\llm_basic_sycophancy_detector.p



In [18]:
full_report = giskard.scan(giskard_model, giskard_dataset)

🔎 Running scan…
Estimated calls to your model: ~365
Estimated LLM calls for evaluation: 148

2024-05-31 08:12:40,217 pid:15328 MainThread giskard.scanner.logger INFO     Running detectors: ['LLMBasicSycophancyDetector', 'LLMCharsInjectionDetector', 'LLMHarmfulContentDetector', 'LLMImplausibleOutputDetector', 'LLMInformationDisclosureDetector', 'LLMOutputFormattingDetector', 'LLMPromptInjectionDetector', 'LLMStereotypesDetector', 'LLMFaithfulnessDetector']
Running detector LLMBasicSycophancyDetector…
2024-05-31 08:15:36,970 pid:15328 MainThread httpx        INFO     HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-31 08:15:36,977 pid:15328 MainThread giskard.scanner.logger ERROR    Detector LLMBasicSycophancyDetector failed with error: "None of [Index(['question'], dtype='object')] are in the [columns]"
Traceback (most recent call last):
  File "c:\Users\sathi\OneDrive\Desktop\chatbot\.venv\lib\site-packages\giskard\scanner\scanner.py", line 152, i

In [None]:
display(full_report)

# Save it to a file
full_report.to_html("scan_report.html")