In [3]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader("murderofroger.txt", encoding='utf-8')  # or try 'latin-1' if utf-8 doesn't work
text_documents = loader.load()
text_documents



In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents=text_splitter.split_documents(text_documents)
documents[:5]

[]

In [22]:
# First, let's check if we actually have content in text_documents
print("Length of text:", len(text_documents))
print("First few characters:", text_documents[0].page_content[:100])

# Try adjusting the text splitter parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False
)
documents = text_splitter.split_documents(text_documents)
print("Number of chunks created:", len(documents))

Length of text: 1
First few characters: 
Number of chunks created: 0


In [20]:
documents

[]

In [23]:
# Let's try reading the file directly first to check its contents
with open("murderofroger.txt", 'r', encoding='utf-8') as f:
    content = f.read()
    print("Raw file content length:", len(content))
    print("First 200 characters:", content[:200])

# Then try the TextLoader again with explicit encoding
loader = TextLoader("murderofroger.txt", encoding='utf-8-sig')  # utf-8-sig handles BOM if present
text_documents = loader.load()
print("\nDocument content length:", len(text_documents[0].page_content))

Raw file content length: 393931
First 200 characters: 
Agatha Christie
The Murder of Roger Ackroyd
A Hercule Poirot Mystery
To Punkie who likes an orthodox detective story, murder, inquest, and
suspicion falling on everyone in turn!
Contents
Cover
Title 

Document content length: 393931


In [24]:
# Adjust text splitter with more appropriate parameters for a book
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  # Increased chunk size
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],  # Explicit separators
    length_function=len
)
documents = text_splitter.split_documents(text_documents)
print(f"Number of chunks created: {len(documents)}")
if documents:  # Let's verify the content of first chunk
    print("\nFirst chunk preview:")
    print(documents[0].page_content[:200])

Number of chunks created: 219

First chunk preview:
Agatha Christie
The Murder of Roger Ackroyd
A Hercule Poirot Mystery
To Punkie who likes an orthodox detective story, murder, inquest, and
suspicion falling on everyone in turn!
Contents
Cover
Title P


In [25]:
documents

[Document(metadata={'source': 'murderofroger.txt'}, page_content='Agatha Christie\nThe Murder of Roger Ackroyd\nA Hercule Poirot Mystery\nTo Punkie who likes an orthodox detective story, murder, inquest, and\nsuspicion falling on everyone in turn!\nContents\nCover\nTitle Page\nDedication\n1. Dr. Sheppard at the Breakfast Table\n2. Who’s Who in King’s Abbot\n3. The Man Who Grew Vegetable Marrows\n4. Dinner at Fernly\n5. Murder\n6. The Tunisian Dagger\n7. I Learn My Neighbour’s Profession\n8. Inspector Raglan is Confident\n9. The Goldfish Pond\n10. The Parlourmaid\n11. Poirot Pays a Call\n12. Round the Table\n13. The Goose Quill\n14. Mrs. Ackroyd\n15. Geoffrey Raymond\n16. An Evening at Mah Jong\n17. Parker 189\n18. Charles Kent\n19. Flora Ackroyd\n20. Miss Russell\n21. The Paragraph in the Paper\n22. Ursula’s Story\n23. Poirot’s Little Reunion\n24. Ralph Paton’s Story\n25. The Whole Truth\n26. And Nothing But The Truth\n27. Apologia\nAbout the Author\nOther Books by Agatha Christie\nCop

In [27]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en")
model = AutoModel.from_pretrained("BAAI/bge-small-en")

# Function to create embeddings for a batch of texts
def get_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [31]:
import faiss
import numpy as np

# Sample texts for embedding
documents = [str(doc) for doc in documents]
# Generate embeddings
embeddings = get_embeddings(documents).cpu().numpy()

# Initialize FAISS index
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

# Add embeddings to the index
index.add(embeddings)

# Save index (optional)
faiss.write_index(index, "faiss_index.bin")


In [39]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np

# Load the BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # BERT base uncased
model = AutoModel.from_pretrained("bert-base-uncased")

# Function to create embeddings for a batch of texts using BERT
def get_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling: average over the tokens in the sequence (ignoring padding)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Load your document
loader = TextLoader("murderofroger.txt", encoding='utf-8')  # or try 'latin-1' if utf-8 doesn't work
text_documents = loader.load()

# Split the documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  # Increased chunk size
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],  # Explicit separators
    length_function=len
)
documents = text_splitter.split_documents(text_documents)

# Convert the documents into a list of text strings
documents = [doc.page_content for doc in documents]

# Generate embeddings for the document chunks
embeddings = get_embeddings(documents).cpu().numpy()

# Initialize FAISS index
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

# Add embeddings to the index
index.add(embeddings)

# Save the FAISS index (optional)
faiss.write_index(index, "faiss_index.bin")

# Function to query the database and retrieve the most relevant document chunks



KeyboardInterrupt: 

In [42]:
def query_db(query, index, documents, k=5):
    query_embedding = get_embeddings([query]).cpu().numpy()
    D, I = index.search(query_embedding, k=k)  # Retrieve top k matches
    
    results = []
    for idx in I[0]:
        # Shorten the output by extracting the first 200 characters and highlighting keywords
        snippet = documents[idx]
        highlighted = snippet.replace(query, f"**{query}**")  # Basic highlight by query term
        results.append(highlighted)
    
    return results

# Example query
query = "What was Ursula’s reaction to her father’s death?"

result_snippets = query_db(query, index, documents)

# Display the results
for snippet in result_snippets:
    print(f"Result: {snippet}")

Result: “He was poisoned,” said Ackroyd.
He spoke in a dull heavy voice.
“Who by?” I asked sharply.
“His wife.”
“How do you know that?”
“She told me so herself.”
“When?”
“Yesterday! My God! yesterday! It seems ten years ago.”
I waited a minute, then he went on.
“You understand, Sheppard, I’m telling you this in confidence. It’s to go
no further. I want your advice—I can’t carry the whole weight by myself.
As I said just now, I don’t know what to do.”
“Can you tell me the whole story?” I said. “I’m still in the dark. How
did Mrs. Ferrars come to make this confession to you?”
“It’s like this. Three months ago I asked Mrs. Ferrars to marry me. She
refused. I asked her again and she consented, but she refused to allow me to
make the engagement public until her year of mourning was up. Yesterday I
called upon her, pointed out that a year and three weeks had now elapsed
since her husband’s death, and that there could be no further objection to
making the engagement public property. I had not