In [1]:
!pip install langchain==0.3.1 langchain-community==0.3.1 langchain-text-splitters langchain-huggingface faiss-cpu PyPDF2 python-docx sentence-transformers




In [2]:
import os
import faiss
import numpy as np
from io import BytesIO
from PyPDF2 import PdfReader
from docx import Document

from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import WebBaseLoader

# Conversational chain (IMPORTANT)
from langchain.chains import ConversationalRetrievalChain

# ChatModel (THIS IS THE FIX)
from langchain_huggingface import ChatHuggingFace




In [3]:
from getpass import getpass

huggingface_api_key = getpass("Enter your HuggingFace API key: ")
os.environ['HUGGINGFACEHUB_API_TOKEN'] = huggingface_api_key


Enter your HuggingFace API key: ··········


In [4]:
def process_input(input_type, input_data):
    text = ""

    if input_type == "PDF":
        for f in input_data:
            pdf = PdfReader(f)
            for page in pdf.pages:
                t = page.extract_text()
                if t:
                    text += t + "\n"

    elif input_type == "Text":
        text = input_data

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(text)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cpu"}
    )

    dim = len(embeddings.embed_query("hello"))
    index = faiss.IndexFlatL2(dim)

    vectorstore = FAISS(
        embedding_function=embeddings.embed_query,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={}
    )

    vectorstore.add_texts(chunks)
    return vectorstore


In [5]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain.chains import ConversationalRetrievalChain

def build_llama_conversational_chain(vectorstore):

    endpoint = HuggingFaceEndpoint(
        repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
        task="conversational",
        temperature=0.5,
        max_new_tokens=512,
        token=huggingface_api_key
    )

    llm = ChatHuggingFace(
        llm=endpoint   # REQUIRED
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=False
    )
    return chain



In [6]:
def chat_with_pdf(chain):
    chat_history = []

    while True:
        q = input("Ask: ")
        if q.lower() == "exit":
            break

        result = chain({"question": q, "chat_history": chat_history})
        ans = result["answer"]

        print("\nAI:", ans)
        chat_history.append((q, ans))



In [7]:
from google.colab import files

uploaded = files.upload()
pdf_files = [BytesIO(data) for data in uploaded.values()]

vs = process_input("PDF", pdf_files)
chain = build_llama_conversational_chain(vs)
chat_with_pdf(chain)


Saving ShivamyadavCV.pdf to ShivamyadavCV.pdf


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

                    token was transferred to model_kwargs.
                    Please make sure that token is what you intended.


Ask: What is email in pdf


  result = chain({"question": q, "chat_history": chat_history})



AI: I don't know.
Ask: What is project mentioned in pdf

AI: I don't have information about a project mentioned in a PDF. However, according to the given context, Shivam Yadav has mentioned some projects in his profile:

1. Fake News Detection System
2. Questing Answering Gen-AI System-NLP

Please let me know if any of these projects match the one mentioned in the PDF or if you need further assistance.
Ask: yes both are matched

AI: Based on the information provided in the profile, I can confirm that the projects "Fake News Detection System" and "Questing Answering Gen-AI System-NLP" are indeed the projects mentioned in the profile of Shivam Yadav.

However, I need the content of the PDF to compare it with the projects in the profile. Since I don't have the content of the PDF, I cannot confirm whether the projects in the PDF match the ones in the profile or not.
Ask: Tell me the acheivement of shivam yadav

AI: Based on the provided information, the achievements of Shivam Yadav are:



KeyboardInterrupt: Interrupted by user