## Step 1: Organize Your Files

Create synthetic data, or add your own `.txt` documents to the `data` folder.

## Step 2: Load All Files with Auto Metadata

In [2]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pathlib import Path

# Load every .txt file under ../data; the "**/*.txt" glob also matches
# files in nested sub-folders.
loader = DirectoryLoader(
    "../data",
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=True
)

documents = loader.load()

# Tag each document with a "category" taken from its file name without the
# extension (e.g. "../data/account.txt" -> "account").
# Path(...).stem is used instead of split("/") + replace(".txt", "") so that:
#   * the path separator of the current OS is handled, and
#   * only the trailing extension is removed, not a ".txt" that happens to
#     appear in the middle of a file name.
for doc in documents:
    doc.metadata["category"] = Path(doc.metadata["source"]).stem

100%|█████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1890.18it/s]


## Step 3: Chunk and Embed with Metadata

In [3]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Cut the loaded documents into 300-character chunks that overlap by 50
# characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
)
docs = text_splitter.split_documents(documents)

# Embed every chunk with the all-MiniLM-L6-v2 model and build a FAISS index
# over the resulting vectors.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)

# ✅ Persist the index to disk
vectorstore.save_local("../data/vectorstores/support_index")

## Step 4: Create Retriever + LLM

In [4]:
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Local HuggingFace seq2seq model, wrapped as a text2text-generation
# pipeline so LangChain can use it as an LLM.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100,
)
llm = HuggingFacePipeline(pipeline=pipe)

# Retrieval-QA chain over the FAISS index. return_source_documents=True makes
# the retrieved chunks available in the result next to the answer.
retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

Device set to use mps:0


## Step 5: Query and Show Answer + Source

In [5]:
# Question sent through the retrieval-QA chain.
query = "I have been inactive for a while, what to do now"  # Change your query here

result = qa_chain.invoke({"query": query})
answer_text = result["result"]
retrieved_chunks = result["source_documents"]

print("📘 Answer:", answer_text)
print("\n📂 Sources:")
# One bullet per retrieved chunk, so a category is listed once for every
# chunk that came from that file.
for doc in retrieved_chunks:
    print(f"• {doc.metadata['category']}")

📘 Answer: reactivate

📂 Sources:
• account
• support
• support
• support


## 🧠 What You Now Have

- Auto-detected metadata per file
- Cleanly chunked, embedded, and stored
- Answers returned with exact file/source context
