In [25]:
!pip install langchain-community faiss-cpu tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
Collecting regex>=2022.1.18
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (775 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m775.1/775.1 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m[31m5.2 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: regex, tiktoken
Successfully installed regex-2024.5.15 tiktoken-0.7.0


In [7]:
from PyPDF2 import PdfReader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS

from constants import gemini_key

In [8]:
pdfreader = PdfReader('/home/ronit/Ronit/Book/DSML.pdf')

In [10]:
from typing_extensions import Concatenate

# read text from pdf
raw_text = ''
for page in pdfreader.pages:
    raw_text += page.extract_text()

print(raw_text)

Data Science and Machine Learning
Mathematical and Statistical Methods
Dirk P. Kroese, Zdravko I. Botev, Thomas Taimre, Radislav Vaisman
8th May 2022To my wife and daughters: Lesley, Elise, and Jessica
—DPK
To Sarah, Soﬁa, and my parents
—ZIB
To my grandparents: Arno, Harry, Juta, and Maila
—TT
To Valerie
—RVCONTENTS
Preface xiii
Notation xvii
1 Importing, Summarizing, and Visualizing Data 1
1.1 Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1
1.2 Structuring Features According to Type . . . . . . . . . . . . . . . . . . 3
1.3 Summary Tables . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6
1.4 Summary Statistics . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7
1.5 Visualizing Data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 8
1.5.1 Plotting Qualitative Variables . . . . . . . . . . . . . . . . . . . . 9
1.5.2 Plotting Quantitative Variables . . . . . . . . . . . . . . . . . . . 9
1.5.3 Data Visualization in a B

In [11]:
# split text into character
splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=800,
    chunk_overlap=200,
    length_function=len
)

text = splitter.split_text(raw_text)

In [12]:
len(text)

1693

In [23]:
# get embeddings 
embeddings = GoogleGenerativeAIEmbeddings(google_api_key=gemini_key, model="models/embedding-001")

In [24]:
document_search = FAISS.from_texts(text, embeddings)

In [26]:
document_search

<langchain_community.vectorstores.faiss.FAISS at 0x721efbee7100>

In [27]:
from langchain.chains.question_answering import load_qa_chain
from langchain_google_genai import ChatGoogleGenerativeAI

In [30]:
chain = load_qa_chain(ChatGoogleGenerativeAI(google_api_key=gemini_key, model='gemini-pro'), chain_type='stuff')

In [34]:
query = "Give the detailed Summary About chapter Deep Learning."
docs = document_search.similarity_search(query)
answer = chain.run(input_documents=docs, question=query)

In [35]:
answer

"**Chapter: Deep Learning**\n\n**Introduction**\n\n* Deep learning refers to machine learning models with multiple layers of abstraction that can learn complex patterns and representations from data.\n\n**Artificial Neural Networks (ANNs)**\n\n* ANNs are the foundation of deep learning, mimicking the structure and function of the human brain.\n* They consist of interconnected layers of nodes (neurons) that process and transform data.\n* Each layer learns specific features from the input data, building up to complex representations at higher layers.\n\n**Convolutional Neural Networks (CNNs)**\n\n* CNNs are specialized ANNs designed for processing spatial data, such as images.\n* They use convolutional operations to extract features from the input data, preserving spatial information.\n* CNNs are widely used in image recognition, object detection, and facial recognition tasks.\n\n**Recurrent Neural Networks (RNNs)**\n\n* RNNs are ANNs that can process sequential data, such as text or tim