In [1]:
import os
from pathlib import Path
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.llms import Ollama
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv

In [2]:
# set goofle api key in os.env 
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") 
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")  
LANGCHAIN_PROJECT = os.getenv("LANGCHAIN_PROJECT")  

os.environ['GOOGLE_API_KEY'] =  GOOGLE_API_KEY
os.environ['LANGCHAIN_API_KEY'] =  LANGCHAIN_API_KEY
os.environ['LANGCHAIN_PROJECT'] =  LANGCHAIN_PROJECT

In [3]:
# Reading PDF file from resources 
file_path = Path(r"D:\SHUBHAM\git_repo\ask-your-pdf\resources\Shubham_Karale_DS_2025_v1.pdf")
pdf_reader= PdfReader(file_path)

In [4]:
# extract all text from pdf 
text=""

for page in pdf_reader.pages:
    # print(page.extract_text())
    text = text+page.extract_text()

print(text)

Shubham Karale 
Data Scientist 
Experienced Data Scientist with 
7 years of experience
, including 
3 years specializing in Data Science & MLOps
.
Currently at 
Capgemini
, optimizing ML models and 
migrating SAS-based ETL pipelines to PySpark
, reducing model
inference time by 
30%
 and improving accuracy by 
15%
. Proﬁcient in 
Python, PySpark, Jenkins, Cloudera, AWS, and
ML model testing
. Strong expertise in 
feature engineering, hyperparameter tuning, cloud deployment, and
MLOps pipelines
. Seeking to leverage expertise in data-driven solutions and cloud-based machine learning. 
skarale63@gmail.com 
+91 96896 49778 
pune, India 
linkedin.com/in/shubham-karale-b343a7221 
WORK EXPERIENCE 
Data Scientist 
Capgemini Technology Services India Limited 
05/2024 - Present
, 
 
Migrated 
SAS-based data pipelines
 to 
PySpark
, optimizing data extraction for ML
models. 
Extracted only relevant features
 from legacy SAS pipelines, reducing model
inference time by 
30%
 and improving accuracy

In [5]:
# Text convert into chunks 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
chunks = text_splitter.split_text(text)
chunks

['Shubham Karale \nData Scientist \nExperienced Data Scientist with \n7 years of experience\n, including \n3 years specializing in Data Science & MLOps\n.\nCurrently at \nCapgemini\n, optimizing ML models and \nmigrating SAS-based ETL pipelines to PySpark\n, reducing model\ninference time by \n30%\n and improving accuracy by \n15%\n. Proﬁcient in \nPython, PySpark, Jenkins, Cloudera, AWS, and\nML model testing\n. Strong expertise in \nfeature engineering, hyperparameter tuning, cloud deployment, and\nMLOps pipelines\n. Seeking to leverage expertise in data-driven solutions and cloud-based machine learning. \nskarale63@gmail.com \n+91 96896 49778 \npune, India \nlinkedin.com/in/shubham-karale-b343a7221 \nWORK EXPERIENCE \nData Scientist \nCapgemini Technology Services India Limited \n05/2024 - Present\n, \n \nMigrated \nSAS-based data pipelines\n to \nPySpark\n, optimizing data extraction for ML\nmodels. \nExtracted only relevant features\n from legacy SAS pipelines, reducing model\ninf

In [6]:
# apply embedding and store in faiss
embeddings = OllamaEmbeddings()
vector_store = FAISS.from_texts(chunks, embedding=embeddings)

vector_store

  embeddings = OllamaEmbeddings()


<langchain_community.vectorstores.faiss.FAISS at 0x1f26c3fc1a0>

In [7]:
# use LLM model for chat
llm=Ollama()
memory = ConversationBufferMemory(memory_key = "chat_history", return_messages=True)
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vector_store.as_retriever(), memory=memory)


  llm=Ollama()
  memory = ConversationBufferMemory(memory_key = "chat_history", return_messages=True)


In [30]:
import pandas as pd
from io import StringIO

In [None]:
# Takes user input and passes it to the LLM pipeline
user_question = "What is skill of shubham"
response = conversation_chain({'question': user_question})

In [12]:
response

{'question': 'What is skill of shubham',
 'chat_history': [HumanMessage(content='What is Name', additional_kwargs={}, response_metadata={}),
  AIMessage(content="Based on the provided context, I cannot determine the name of the person or organization mentioned. The text does not provide any clear clues or hints about the identity of the person or organization. Therefore, I'm afraid I cannot give you a helpful answer to this question.", additional_kwargs={}, response_metadata={}),
  HumanMessage(content='What is skill of shubham', additional_kwargs={}, response_metadata={}),
  AIMessage(content="Based on the provided context, Shubham Karale's skills include:\n\n1. Python\n2. PySpark\n3. Pandas\n4. NumPy\n5. MLOps\n6. Jenkins\n7. Cloudera\n8. AWS\n9. Git Bash\n10. Shell script\n11. Spark\n12. HDFS\n13. Data Exploration and Visualization\n14. PyCharm\n15. Data Visualization\n16. Data Warehouse\n17. ML Models deployments\n\nShubham is proficient in these skills and has experience working w

In [15]:
chatHistory = response["chat_history"]
chatHistory

[HumanMessage(content='What is Name', additional_kwargs={}, response_metadata={}),
 AIMessage(content="Based on the provided context, I cannot determine the name of the person or organization mentioned. The text does not provide any clear clues or hints about the identity of the person or organization. Therefore, I'm afraid I cannot give you a helpful answer to this question.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='What is skill of shubham', additional_kwargs={}, response_metadata={}),
 AIMessage(content="Based on the provided context, Shubham Karale's skills include:\n\n1. Python\n2. PySpark\n3. Pandas\n4. NumPy\n5. MLOps\n6. Jenkins\n7. Cloudera\n8. AWS\n9. Git Bash\n10. Shell script\n11. Spark\n12. HDFS\n13. Data Exploration and Visualization\n14. PyCharm\n15. Data Visualization\n16. Data Warehouse\n17. ML Models deployments\n\nShubham is proficient in these skills and has experience working with them in his professional projects.", additional_kwargs={}

In [None]:
# chat history of QaN
for i, message in enumerate(chatHistory):
    print(i, message)
    if i%2 == 0:
        print(f"User:  {message.content}")
    else:
        print(f"Reply:  {message.content}")

0 content='What is Name' additional_kwargs={} response_metadata={}
User:  What is Name
1 content="Based on the provided context, I cannot determine the name of the person or organization mentioned. The text does not provide any clear clues or hints about the identity of the person or organization. Therefore, I'm afraid I cannot give you a helpful answer to this question." additional_kwargs={} response_metadata={}
User:  Based on the provided context, I cannot determine the name of the person or organization mentioned. The text does not provide any clear clues or hints about the identity of the person or organization. Therefore, I'm afraid I cannot give you a helpful answer to this question.
2 content='What is skill of shubham' additional_kwargs={} response_metadata={}
User:  What is skill of shubham
3 content="Based on the provided context, Shubham Karale's skills include:\n\n1. Python\n2. PySpark\n3. Pandas\n4. NumPy\n5. MLOps\n6. Jenkins\n7. Cloudera\n8. AWS\n9. Git Bash\n10. Shell s

In [47]:
# save chart history in dict formate

question = []
answer = []

for i, message in enumerate(chatHistory):
    if i%2 == 0:
        question.append(message.content)
    else:
        answer.append(message.content)

qa_pairs = {"Question": question, "Answer": answer}

qa_pairs

{'Question': ['What is Name', 'What is skill of shubham'],
 'Answer': ["Based on the provided context, I cannot determine the name of the person or organization mentioned. The text does not provide any clear clues or hints about the identity of the person or organization. Therefore, I'm afraid I cannot give you a helpful answer to this question.",
  "Based on the provided context, Shubham Karale's skills include:\n\n1. Python\n2. PySpark\n3. Pandas\n4. NumPy\n5. MLOps\n6. Jenkins\n7. Cloudera\n8. AWS\n9. Git Bash\n10. Shell script\n11. Spark\n12. HDFS\n13. Data Exploration and Visualization\n14. PyCharm\n15. Data Visualization\n16. Data Warehouse\n17. ML Models deployments\n\nShubham is proficient in these skills and has experience working with them in his professional projects."]}

In [48]:
# convert into dataframe
df = pd.DataFrame(qa_pairs)


In [49]:
df

Unnamed: 0,Question,Answer
0,What is Name,"Based on the provided context, I cannot determ..."
1,What is skill of shubham,"Based on the provided context, Shubham Karale'..."


In [None]:
# sacving to csv formate
csv_path = Path(r"D:\SHUBHAM\git_repo\ask-your-pdf\notebook\qa_history.csv")
df.to_csv(csv_path, index=False)