In [1]:
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS








In [4]:
# ------------------------------
# Step 1: Build Vector Store from Documents
# ------------------------------

# Use HuggingFaceEmbeddings with a lightweight model (no API key needed)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Specify the directory containing your text files
data_dir = "./Big Star Collectibles"
files = os.listdir(data_dir)
documents = []

for file in files:
    file_path = os.path.join(data_dir, file)
    # Skip items that are not files (directories, etc.)
    if not os.path.isfile(file_path):
        continue

    # Open file with errors ignored to avoid UnicodeDecodeError
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read().strip()
    print(f"Reading {file_path}, length: {len(text)} characters")
    if not text:
        print(f"Warning: {file_path} is empty.")
        continue

    # Split text into manageable chunks
    text_splitter = CharacterTextSplitter(chunk_size=128, chunk_overlap=32, separator="\n")
    chunks = text_splitter.split_text(text)
    print(f"Got {len(chunks)} chunks from {file_path}")
    
    # Wrap each chunk into a Document object
    for chunk in chunks:
        documents.append(Document(page_content=chunk))

if not documents:
    raise ValueError("No text chunks found. Please check your file reading and splitting logic.")


Created a chunk of size 1498, which is longer than the specified 128
Created a chunk of size 2516, which is longer than the specified 128
Created a chunk of size 369, which is longer than the specified 128
Created a chunk of size 177, which is longer than the specified 128
Created a chunk of size 163, which is longer than the specified 128
Created a chunk of size 129, which is longer than the specified 128
Created a chunk of size 184, which is longer than the specified 128
Created a chunk of size 409, which is longer than the specified 128
Created a chunk of size 164, which is longer than the specified 128
Created a chunk of size 175, which is longer than the specified 128
Created a chunk of size 191, which is longer than the specified 128
Created a chunk of size 137, which is longer than the specified 128
Created a chunk of size 455, which is longer than the specified 128
Created a chunk of size 154, which is longer than the specified 128
Created a chunk of size 180, which is longer t

Reading ./Big Star Collectibles/.DS_Store, length: 6065 characters
Got 4 chunks from ./Big Star Collectibles/.DS_Store
Reading ./Big Star Collectibles/Product Mantras.txt, length: 5291 characters
Got 40 chunks from ./Big Star Collectibles/Product Mantras.txt
Reading ./Big Star Collectibles/Disclaimer.txt, length: 428 characters
Got 1 chunks from ./Big Star Collectibles/Disclaimer.txt
Reading ./Big Star Collectibles/Our Story.txt, length: 875 characters
Got 3 chunks from ./Big Star Collectibles/Our Story.txt
Reading ./Big Star Collectibles/FAQ.txt, length: 3162 characters
Got 20 chunks from ./Big Star Collectibles/FAQ.txt
Reading ./Big Star Collectibles/Our Team.txt, length: 1539 characters
Got 4 chunks from ./Big Star Collectibles/Our Team.txt
Reading ./Big Star Collectibles/Careers.txt, length: 1320 characters
Got 10 chunks from ./Big Star Collectibles/Careers.txt
Reading ./Big Star Collectibles/Contact Us.txt, length: 256 characters
Got 1 chunks from ./Big Star Collectibles/Contact U

In [5]:
# Create a FAISS vector store from the documents
vector_store = FAISS.from_documents(documents, embedding=embeddings)
print("Vector store created successfully!")

# Create a retriever from the vector store
retriever = vector_store.as_retriever()

Vector store created successfully!


In [6]:
# ------------------------------
# Step 2: Setup the Prompt and LLM
# ------------------------------

from langchain.prompts import ChatPromptTemplate

template = """You are a helpful assistant. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# Define a Hugging Face LLM using a transformers pipeline
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# Use GPT-2, but increase max_length and set max_new_tokens to allow for additional output.
# You can adjust these parameters as needed.
pipeline_llm = pipeline(
    "text-generation", 
    model="gpt2", 
    max_length=200,          # Increase overall max_length
    max_new_tokens=50,       # Allow 50 new tokens to be generated
    truncation=True          # Explicitly enable truncation
)
llm = HuggingFacePipeline(pipeline=pipeline_llm)

Device set to use mps:0
  llm = HuggingFacePipeline(pipeline=pipeline_llm)


In [7]:
# We'll also import the output parser to clean up the result
from langchain_core.output_parsers import StrOutputParser

# ------------------------------
# Step 3: Retrieve Context and Generate an Answer
# ------------------------------

def get_answer(question: str) -> str:
    # Retrieve relevant documents for the question
    docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in docs)
    # Format the prompt with the question and retrieved context
    formatted_prompt = prompt.format(question=question, context=context)
    # Generate the LLM output
    llm_output = llm(formatted_prompt)
    # Parse the output to get a clean string answer
    answer = StrOutputParser().parse(llm_output)
    return answer


In [8]:

# ------------------------------
# Example Usage
# ------------------------------

question = "What is Big Star Collectibles about?"
result = get_answer(question)
print("\nChain output:")
print(result)


  docs = retriever.get_relevant_documents(question)
  llm_output = llm(formatted_prompt)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Chain output:
Human: You are a helpful assistant. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: What is Big Star Collectibles about? 
Context: What are Big Star Collectibles products printed on?
From the start, Big Star Collectibles has been about quality rather than quantity. We strive to give our customers the absolute best of the best, and we also acknowledge that creating limited editions enhances and adds to the value of each product, now and in the future.
Big Star Collectibles has grown over the years to include memorabilia, contests, events, appraisals, and consultation services.
For a fee, our experts can assist you in finding a particular Big Star Collectibles item that you have been looking for. Big Star Collectibles can also broker sales and trades among our customers. 
Answer:
Big Star Collectibles is committed to deliverin