In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
#import kagglehub
#yapwh1208_chatbot_ai_q_and_a_path = kagglehub.dataset_download('yapwh1208/chatbot-ai-q-and-a')

#print('Data source import complete.')


In [2]:
# Mount Google Drive at /content/drive (Colab-only); the dataset path used later lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install --upgrade langchain langchain-core langchain-community



In [4]:
!pip install faiss-cpu



In [5]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings; consider narrowing to specific categories.
warnings.filterwarnings("ignore")

In [6]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_classic.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline



In [7]:
# Absolute path of the dataset CSV on the mounted Google Drive (requires the drive.mount cell to have run).
file_path = '/content/drive/MyDrive/CHATBOT-DATASET/full_dataset.csv'
# Load the CSV into a DataFrame; later cells read its 'input' and 'target' columns.
data = pd.read_csv(file_path)

In [8]:
'''from datasets import load_dataset
data = load_dataset("bot-remains/student-assistance-chatbot")'''

'from datasets import load_dataset\ndata = load_dataset("bot-remains/student-assistance-chatbot")'

In [9]:
#print(data["train"])

In [10]:
# (rows, columns) of the loaded DataFrame.
data.shape

(22571, 2)

In [11]:
# Number of missing values in each column.
data.isna().sum()

Unnamed: 0,0
input,0
target,0


In [12]:
# Pull the two columns out as Series: 'input' (used as the question text below) and
# 'target' (used as the answer text below).
data_qs = data['input']
data_as = data['target']

In [13]:
# Preview the first ten entries of the 'input' column.
data_qs.head(10)

Unnamed: 0,input
0,Who did the first work generally recognized as...
1,What sources was drawn on the formation of the...
2,Who created the Hebbian learning rule?
3,When the first neural network is built?
4,What is the first neural network called?
5,"""Who introduced the Turing test"
6,Alan Turing prefer what method on creating hum...
7,Who presented the Logic Theorist (LT)?
8,What does General Problem Solver (GPS) is desi...
9,Which model was robably the first program to e...


In [14]:
# Preview the first ten entries of the 'target' column.
data_as.head(10)

Unnamed: 0,target
0,Warren McCulloch and Walter Pitts (1943).\n
1,knowledge of the basic physiology and function...
2,Donald Hebb (1949).\n
3,1950.\n
4,The SNARC.\n
5,machine learning
6,He prefer to develop learning algorithms and t...
7,Allen Newell and Herbert Simon from Carnegie T...
8,GPS was designed from the start to imitate hum...
9,General Problem Solver (GPS).\n


In [16]:
def _as_sentence(text) -> str:
    """Return ``text`` with surrounding whitespace removed and ending in sentence punctuation.

    A trailing "." is appended only when the stripped text does not already end in
    ".", "?" or "!".
    """
    cleaned = str(text).strip()
    if cleaned.endswith((".", "?", "!")):
        return cleaned
    return f"{cleaned}."


# Build one Document per question/answer pair.
# The raw answers frequently end in "\n" and the questions in "?", so blindly appending "."
# produced text such as "...rule?. Answer: Donald Hebb (1949).\n." in the indexed content.
# Normalising whitespace and punctuation first keeps the embedded text clean.
documents = [
    Document(page_content=f"Question: {_as_sentence(q)} Answer: {_as_sentence(a)}")
    for q, a in zip(data_qs, data_as)
]

In [None]:
'''text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)'''

In [17]:
# Keep only documents whose text has more than two whitespace-separated tokens.
# NOTE(review): if `documents` was built by the "Question: ... Answer: ..." cell, every entry
# already contains at least four tokens, so this filter drops nothing — confirm the intended
# threshold (it may have been written for the commented-out alternative dataset).
documents = [doc for doc in documents if len(doc.page_content.split()) > 2]

In [73]:
def doc_splitter(
    documents: "list[Document]",
    chunk_size: int = 300,
    chunk_overlap: int = 50,
) -> "list[Document]":
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``; defaults to 300, the value
            that was previously hard-coded.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``; defaults to 50, the
            value that was previously hard-coded.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for ``documents``.
    """
    # NOTE(review): a Q/A document longer than chunk_size is split, so a later chunk may
    # carry answer text without its question — confirm this is acceptable for retrieval.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

In [74]:
'''documents = []
for row in data['train']:
    content = row['output']
    documents.append(Document(page_content=content, metadata={'category':row['category']}))'''

"documents = []\nfor row in data['train']:\n    content = row['output']\n    documents.append(Document(page_content=content, metadata={'category':row['category']}))"

In [75]:
# Embedding model later handed to FAISS.from_documents to vectorise the document chunks.
embeddings_model = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")

In [76]:
# Chunk the documents using doc_splitter's default settings.
split_docs = doc_splitter(documents)

In [77]:
# Embed every chunk with embeddings_model and build a FAISS vector store from the results.
vector_store = FAISS.from_documents(split_docs, embeddings_model)

In [78]:
# Expose the store as a retriever; k=5 asks for five chunks per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

In [79]:
# Prompt given to the LLM for each query. It has two placeholders:
#   {context}  — filled by the retrieval chain (presumably with the retrieved chunks — confirm)
#   {question} — the user's query
# The instructions tell the model to rephrase only what is in the context and not add facts.
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template="""
Using the context below, answer the question in a clear and friendly way.
You may rephrase the information, but do NOT hallucinate or add facts that are not in the context.

Context:
{context}

Question:
{question}

Answer in a friendly, natural style:"""
)


In [80]:
# Tokenizer and seq2seq weights must come from the same checkpoint, so name it once.
LLM_CHECKPOINT = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_CHECKPOINT)

In [81]:
# Wrap the loaded model/tokenizer in a text2text-generation pipeline and expose it to LangChain.
#
# Decoding is deterministic (do_sample=False). The previous setting, do_sample=True with no
# seed, returned a different answer on every run and worked against the prompt's
# "do NOT hallucinate" instruction. To experiment with sampling again, set do_sample=True
# together with explicit top_p / temperature values and a fixed random seed.
PIPELINE = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=256,  # upper bound on the generated sequence length
    do_sample=False,
)
llm = HuggingFacePipeline(pipeline=PIPELINE)

Device set to use cpu


In [82]:
def create_retrieval_chain(llm, prompt, retriever):
    """Assemble a "stuff"-type RetrievalQA chain.

    Args:
        llm: Language model forwarded as ``llm``.
        prompt: Prompt forwarded inside ``chain_type_kwargs`` under the "prompt" key.
        retriever: Retriever forwarded as ``retriever``.

    Returns:
        The object returned by ``RetrievalQA.from_chain_type``.
    """
    chain_settings = {
        "llm": llm,
        "chain_type": "stuff",
        "retriever": retriever,
        "chain_type_kwargs": {"prompt": prompt},
    }
    return RetrievalQA.from_chain_type(**chain_settings)

# Build the chain instance that the query cells below invoke.
retrieval_chain=create_retrieval_chain(llm, prompt, retriever)

In [90]:
# Probe with a college-admission question.
# NOTE(review): this looks like a question for the commented-out student-assistance dataset
# rather than the CSV loaded in this notebook; the generic answer recorded below may not be
# grounded in retrieved context — inspect the retrieved chunks to confirm.
query_data = "How do I apply for admission to an engineering college in Rajasthan?"

# The chain takes the question under the "query" key; the generated answer is read from "result".
result = retrieval_chain.invoke({"query": query_data})
print(result["result"])

You can apply for admission to an engineering college in Rajasthan by filling out the application form.


In [84]:
# Probe with a general-science question.
# NOTE(review): unclear whether the dataset covers this topic; verify that the answer comes
# from retrieved context rather than the model's own knowledge.
query = "What is photosynthesis?"
result = retrieval_chain.invoke({"query": query})
print(result["result"])

Photosynthesis is a chemical change in a plant.


In [85]:
# Small-talk probe: checks how the chain responds to a non-factual question.
query = "How are you?"
result = retrieval_chain.invoke({"query": query})
print(result["result"])

I'm fine.


In [86]:
# In-domain probe: this exact question appears in the dataset preview (row 7 of data_qs[:10]),
# so the expected answer is the matching 'target' entry.
query = "Who presented the Logic Theorist (LT)?"
result = retrieval_chain.invoke({"query": query})
print(result["result"])

Allen Newell and Herbert Simon from Carnegie Tech.


In [89]:
# Medical-question probe.
# NOTE(review): the recorded answer looks ungrounded (SLE usually refers to systemic lupus
# erythematosus, an autoimmune disease) — check the retrieved context before trusting it.
query = "What is SLE disease?"
result = retrieval_chain.invoke({"query": query})
print(result["result"])

SLE disease is a disease that affects the nervous system.
