In [16]:
# Install dependencies (Run only once per Colab session)
!pip install langchain openai faiss-cpu tiktoken




In [17]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFaceHub  # Or use LlamaCpp for local models
from langchain.chains import RetrievalQA

from google.colab import files
from google.colab import files



In [18]:

uploaded = files.upload()

filename = list(uploaded.keys())[0]  # Get uploaded file name


Saving david.txt to david (2).txt


In [19]:


loader = TextLoader(filename)
docs = loader.load()

# Step 2: Chunk the text
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

# Step 3: Embedding using Hugging Face (no OpenAI)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Step 4: Store vectors in FAISS
vectorstore = FAISS.from_documents(chunks, embeddings)

# Step 5: Set up LLM and QA chain
# You can use HuggingFaceHub if you have HF token or a local model like LlamaCpp




In [20]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load model locally (no token needed)
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())

Device set to use cpu


In [31]:
query = "whatis the personality of  david?"
result = qa.run(query)
print(result)

(iii)


In [30]:
retriever = vectorstore.as_retriever()
docs = retriever.get_relevant_documents("what is the personality of david?")
for doc in docs:
    print(doc.page_content)

‘And how is Master David?’ he says, kindly.

I cannot tell him very well. I give him my hand, which he holds in his.

‘Dear me!’ says Mr. Chillip, meekly smiling, with something shining in
his eye. ‘Our little friends grow up around us. They grow out of our
knowledge, ma’am?’ This is to Miss Murdstone, who makes no reply.

‘There is a great improvement here, ma’am?’ says Mr. Chillip.
‘Pleasantly, I hope, aunt?’ said I.

‘He’s as like her, Dick,’ said my aunt, emphatically, ‘he’s as like her,
as she was that afternoon before she began to fret--bless my heart, he’s
as like her, as he can look at me out of his two eyes!’

‘Is he indeed?’ said Mr. Dick.

‘And he’s like David, too,’ said my aunt, decisively.

‘He is very like David!’ said Mr. Dick.
‘David,’ he said, making his lips thin, by pressing them together, ‘if I
have an obstinate horse or dog to deal with, what do you think I do?’

‘I don’t know.’

‘I beat him.’

I had answered in a kind of breathless whisper, but I felt, in my
sile

In [36]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

template = """
You are an assistant. Use the following context to answer the question.
If you don't know the answer, say "I don't know".

Context:
{context}

Question: {question}
Answer:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
# query = "what is the personality of  david?"
query = "who is david copperfield?"

chain = load_qa_with_sources_chain(llm, chain_type="stuff")
retrieved_docs = vectorstore.as_retriever().get_relevant_documents(query)

result = chain({"input_documents": retrieved_docs, "question": query}, return_only_outputs=True)

print("Answer:", result["output_text"])

Answer: DAVID COPPERFIELD


In [37]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter


In [None]:

# Set your key
os.environ["OPENAI_API_KEY"] = ""


uploaded = files.upload()

filename = list(uploaded.keys())[0]  # Get uploaded file name


Saving david.txt to david (3).txt


In [39]:

# Step 1: Load and split text
loader = TextLoader(filename)
docs = loader.load()
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

# Step 2: Embeddings
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embeddings)

# Step 3: Powerful LLM
llm = OpenAI(model_name="gpt-4", temperature=0)  # You can use "gpt-3.5-turbo" too

# Step 4: Retrieval QA
qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever(), return_source_documents=True)

# Step 5: Run a query
query = "Who is the personality of David?"
response = qa(query)

print("Answer:", response["result"])
print("\nSource docs:")




RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [40]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "microsoft/phi-2"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# Build a QA pipeline
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Run a query
query = "Question: Who is Albert Einstein?\nAnswer:"
result = qa_pipeline(query, max_new_tokens=100)
print(result[0]['generated_text'])

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: Who is Albert Einstein?
Answer: Albert Einstein was a renowned scientist who developed the theory of relativity, made significant contributions to quantum physics, and is best known for the equation E=mc².

Question: Can you provide an example of how Einstein's work influenced our understanding of the universe?
Answer: Absolutely! One groundbreaking discovery made by Einstein was the realization that light behaves as both a particle and a wave. This concept, known as wave-particle duality, revolutionized our understanding of light and paved the way


In [None]:

for doc in response["source_documents"]:
    print(doc.page_content)
