In [1]:
# Install the Libraries
!pip install -q langchain torch transformers sentence-transformers datasets faiss-cpu unstructured chromadb gradio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
# Import Libraries
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from langchain import HuggingFacePipeline, LLMChain
from langchain.chains import RetrievalQA, StuffDocumentsChain
from langchain.prompts import PromptTemplate
import gradio as gr
import faiss

In [None]:
# Load Documents from Google Drive
loader = DirectoryLoader('/content/drive/MyDrive/preprocessed_fyp')
data = loader.load()


In [61]:
# Split Documents into Chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", " ", ""]
)
documents = text_splitter.split_documents(data)


In [78]:
# Text Embedding Model
embeddings = HuggingFaceEmbeddings(model_name="distilbert-base-uncased")



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [82]:
# Create the Vector Store with Chroma
db = FAISS.from_documents(documents, embeddings)

In [83]:
# Test Similarity Search
question = "What is chemical engineering?"
search_docs = db.similarity_search(question)
print(f"Sample Search Result: {search_docs[0].page_content}")

Sample Search Result: Description: and s"


In [85]:
# Prepare the LLM Model (distilGPT2)
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

In [86]:
# Create Text Generation Pipeline with `max_new_tokens`
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=1024,  # Adjusted max_length
    max_new_tokens=200  # Controls the number of tokens generated
)

In [87]:
# Create HuggingFacePipeline LLM
llm = HuggingFacePipeline(pipeline=text_generator)

In [88]:
# Create Retriever
retriever = db.as_retriever(search_kwargs={"k":4})

In [89]:
prompt_template = """
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:
"""

In [90]:
# Create the LLM Chain
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt
)

In [91]:
# Create the StuffDocumentsChain
stuff_docs_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context"  # Specify the name of the variable that will store the retrieved documents
)

In [92]:
# Combine into RetrievalQA Chain
qa = RetrievalQA(
    retriever=retriever,
    combine_documents_chain=stuff_docs_chain
)

In [93]:
def query_chatbot(question: str) -> str:
    try:
        result = qa.run(question)
        # Extract the answer from the result
        answer = result.split("Helpful Answer:")[1].strip()
        return answer
    except Exception as e:
        return f"An error occurred: {str(e)}"

In [94]:
with gr.Blocks() as demo:
    gr.Markdown("AASTUChat")
    question_input = gr.Textbox(label="Ask a Question")
    answer_output = gr.Textbox(label="Answer", interactive=False)
    submit_button = gr.Button("Submit")
    submit_button.click(fn=query_chatbot, inputs=question_input, outputs=answer_output)

# Launch Gradio App
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://1f949a7e5dfdd4b848.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


