In [1]:
# Install required libraries (uncomment if needed)

#pip install langchain openai faiss-cpu chromadb tiktoken unstructured pymupdf


In [2]:
#pip install langchain-community
#pip install transformers
#pip install sentence_transformers
#pip install accelerate

In [3]:
# 1. Imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS, Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline

In [4]:
from langchain.schema import Document

In [5]:
# 2. Load Documents

documents = """Problem: ValueError: The feature names should match those that were passed during fit. 
Summarizing: The error ValueError: The feature names should match those that were passed during fit. usually means that the columns in your x DataFrame do not exactly match the feature names that your model or transformer was trained on. 
Possible causes: The columns in x after one-hot encoding do not exactly match feature_names (missing or extra columns). The order of columns may be different. Some expected columns are missing from your current data. 
How to debug/fix: Add a check to compare x.columns and feature_names before reindexing. You can also fill missing columns with zeros to ensure all expected features are present."""

print(f"Loaded {len(documents)} documents")

Loaded 722 documents


In [6]:
# 3. Chunk Documents

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,     # size of each chunk
    chunk_overlap=20,   # overlap between chunks for context
    length_function=len,
    separators=["\n\n", "\n"]
)

In [7]:
chunks = text_splitter.split_text(documents)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, 1):
    print(f"--- Chunk {i} ---")
    print(chunk)
    print()

Number of chunks: 4

--- Chunk 1 ---
Problem: ValueError: The feature names should match those that were passed during fit.

--- Chunk 2 ---

Summarizing: The error ValueError: The feature names should match those that were passed during fit. usually means that the columns in your x DataFrame do not exactly match the feature names that your model or transformer was trained on. 

--- Chunk 3 ---

Possible causes: The columns in x after one-hot encoding do not exactly match feature_names (missing or extra columns). The order of columns may be different. Some expected columns are missing from your current data. 

--- Chunk 4 ---

How to debug/fix: Add a check to compare x.columns and feature_names before reindexing. You can also fill missing columns with zeros to ensure all expected features are present.



In [8]:
documents = [Document(page_content=chunk, metadata={"source": "custom_text"}) 
             for chunk in chunks]

In [9]:
# 4. Create Embeddings + Vector Store

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents, embeddings)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [10]:
vectorstore = FAISS.from_documents(documents, embeddings)

In [11]:
# 5. Create Retriever

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [15]:
# 6. Build RAG Chain
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="tiiuae/falcon-7b-instruct",
    device_map="auto",
    max_new_tokens=512,
    temperature=0.1
)

llm = HuggingFacePipeline(pipeline=generator)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [17]:
# 8. Query the RAG System

query = "How to fix the error ValueError: The feature names should match those that were passed during fit."
result = qa_chain(query)

print("\n Question:", query)
print("\n Answer:", result["result"])


 Question: How to fix the error ValueError: The feature names should match those that were passed during fit.

 Answer: Add a check to compare x.columns and feature_names before reindexing. You can also fill missing columns with zeros to ensure all expected features are present. 
