In [1]:
print("OK")

OK


In [2]:
print("Current directory:")
%pwd


Current directory:


'c:\\Projects\\mental health chatbot\\mental health chatbot\\backend\\src\\notebook'

In [3]:
import os 
os.chdir("../")

print("\nProject root directory:")
%pwd


Project root directory:


'c:\\Projects\\mental health chatbot\\mental health chatbot\\backend\\src'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter, TokenTextSplitter
import re

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def load_pdf_files(data_path):
    """Load all PDF files from the knowledge base directory"""
    loader = DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    
    documents = loader.load()
    print(f"✅ Loaded {len(documents)} PDF pages")
    return documents

In [6]:
import os

# Ensure the directory exists before loading files
data_path = "data/knowledge_base"
os.makedirs(data_path, exist_ok=True)

extracted_data = load_pdf_files(data_path)

✅ Loaded 1104 PDF pages


In [7]:
extracted_data

[Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2024-10-28T14:24:53+00:00', 'source': 'data\\knowledge_base\\Nursing-Mental-Health-and-Community-Concepts-Elizabeth-Christman-Kimberly-Ernstmeyer.pdf', 'total_pages': 1104, 'page': 0, 'page_label': '1'}, page_content='Nursing: Mental Health and Community Concepts'),
 Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2024-10-28T14:24:53+00:00', 'source': 'data\\knowledge_base\\Nursing-Mental-Health-and-Community-Concepts-Elizabeth-Christman-Kimberly-Ernstmeyer.pdf', 'total_pages': 1104, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2024-10-28T14:24:53+00:00', 'source': 'data\\knowledge_base\\Nursing-Mental-Health-and-Community-Concepts-Elizabeth-Christman-Kimberly-Ernstmeyer.pdf', 'total_pages': 1104, 'page': 2, 'page_label': '3'}, page_co

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def create_semantic_chunks_mental_health(documents, chunk_size=800, chunk_overlap=150):
    """
    Create semantically coherent chunks optimized for mental health content.
    Uses a sophisticated splitting strategy that preserves context and meaning.
    
    Args:
        documents: List of loaded documents
        chunk_size: Target size for chunks
        chunk_overlap: Amount of overlap between chunks
    
    Returns:
        List of document chunks with metadata
    """
    # Create text splitter with mental health optimized settings
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=[
            "\n\n\n",    # Section breaks
            "\n\n",      # Paragraphs
            "\n",        # Lines
            ". ",        # Sentences
            "? ",        # Questions
            "! ",        # Exclamations
            "; ",        # Semi-colons
            ", ",        # Clauses
            " ",         # Words
            ""          # Characters
        ],
        keep_separator=True
    )
    
    # Process each document
    chunks = []
    for doc in documents:
        # Extract content and add metadata about section type
        content = doc.page_content
        
        # Check for crisis-related content
        crisis_keywords = ['crisis', 'emergency', 'suicide', 'self-harm', 'urgent', '911', '988']
        contains_crisis = any(keyword in content.lower() for keyword in crisis_keywords)
        
        if contains_crisis:
            # Keep crisis information in a single chunk
            chunks.append(Document(
                page_content=content,
                metadata={
                    **doc.metadata,
                    'chunk_type': 'crisis',
                    'requires_immediate_attention': True
                }
            ))
        else:
            # Split normal content
            doc_chunks = text_splitter.split_text(content)
            
            # Add metadata to each chunk
            for chunk in doc_chunks:
                # Analyze chunk content for categorization
                chunk_lower = chunk.lower()
                is_educational = any(word in chunk_lower for word in ['learn', 'understand', 'overview', 'introduction'])
                is_treatment = any(word in chunk_lower for word in ['treatment', 'therapy', 'medication', 'intervention'])
                is_symptoms = any(word in chunk_lower for word in ['symptom', 'sign', 'experience', 'feel'])
                
                # Determine content type
                if is_educational:
                    content_type = 'educational'
                elif is_treatment:
                    content_type = 'treatment'
                elif is_symptoms:
                    content_type = 'symptoms'
                else:
                    content_type = 'general'
                
                chunks.append(Document(
                    page_content=chunk,
                    metadata={
                        **doc.metadata,
                        'chunk_type': content_type,
                        'requires_immediate_attention': False
                    }
                ))
    
    print(f"✅ Created {len(chunks)} chunks from {len(documents)} documents")
    return chunks


def display_chunk_info(chunks, num_samples=5):
    """Display detailed information about chunks"""
    print(f"\n📊 CHUNK STATISTICS")
    print(f"{'='*80}")
    print(f"Total chunks: {len(chunks)}")
    
    # Calculate chunk size statistics
    chunk_lengths = [len(chunk.page_content) for chunk in chunks]
    print(f"Average chunk size: {sum(chunk_lengths) / len(chunk_lengths):.0f} characters")
    print(f"Smallest chunk: {min(chunk_lengths)} characters")
    print(f"Largest chunk: {max(chunk_lengths)} characters")
    
    # Count chunk types
    chunk_types = {}
    crisis_chunks = 0
    for chunk in chunks:
        chunk_type = chunk.metadata.get('chunk_type', 'unknown')
        chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1
        if chunk.metadata.get('requires_immediate_attention', False):
            crisis_chunks += 1
    
    print(f"\n📑 CHUNK TYPES")
    for chunk_type, count in chunk_types.items():
        print(f"   {chunk_type}: {count} chunks")
    print(f"   Crisis-related chunks: {crisis_chunks}")
    
    print(f"\n📄 SAMPLE CHUNKS")
    print(f"{'='*80}\n")
    
    for i, chunk in enumerate(chunks[:num_samples]):
        print(f"🔹 CHUNK {i+1}")
        print(f"   Source: {chunk.metadata.get('source', 'Unknown')}")
        print(f"   Type: {chunk.metadata.get('chunk_type', 'unknown')}")
        print(f"   Urgent: {chunk.metadata.get('requires_immediate_attention', False)}")
        print(f"   Length: {len(chunk.page_content)} characters")
        print(f"   Content Preview:")
        print(f"   {'-'*76}")
        
        content = chunk.page_content[:300]  # First 300 chars
        lines = content.split('\n')
        for line in lines[:5]:  # First 5 lines
            if line.strip():
                print(f"   {line[:76]}")
        
        print(f"   {'-'*76}\n")


def show_chunk_distribution(chunks):
    """Show distribution of chunks across sources and types"""
    print(f"\n📚 CHUNK DISTRIBUTION")
    print(f"{'='*80}")
    
    from collections import Counter
    
    # Distribution by source
    sources = [chunk.metadata.get('source', 'Unknown') for chunk in chunks]
    source_counts = Counter(sources)
    
    print(f"\n📄 BY SOURCE DOCUMENT:")
    for source, count in source_counts.items():
        print(f"   {source}: {count} chunks")
    
    # Distribution by type
    types = [chunk.metadata.get('chunk_type', 'unknown') for chunk in chunks]
    type_counts = Counter(types)
    
    print(f"\n🏷️ BY CONTENT TYPE:")
    for chunk_type, count in type_counts.items():
        print(f"   {chunk_type}: {count} chunks")
    
    # Crisis chunks
    crisis_count = sum(1 for c in chunks if c.metadata.get('requires_immediate_attention', False))
    print(f"\n🚨 CRISIS INFORMATION:")
    print(f"   Chunks marked as urgent: {crisis_count}")


# ==============================================================================
# EXECUTE CHUNKING
# ==============================================================================

print("🔄 Starting mental health-optimized chunking...\n")

# Create chunks with optimized parameters
document_chunks = create_semantic_chunks_mental_health(
    extracted_data,
    chunk_size=800,      # Larger chunks for better context
    chunk_overlap=150    # Good overlap for coherence
)

# Display chunk information
display_chunk_info(document_chunks, num_samples=5)

# Show distribution
show_chunk_distribution(document_chunks)

print("\n✅ Chunking complete! Ready for embedding generation.")

# ==============================================================================
# OPTIONAL: Save chunks for later use
# ==============================================================================

# Uncomment if you want to save the chunks
# import pickle
# with open('mental_health_chunks.pkl', 'wb') as f:
#     pickle.dump(document_chunks, f)
# print("💾 Chunks saved to 'mental_health_chunks.pkl'")

🔄 Starting mental health-optimized chunking...

✅ Created 2340 chunks from 1104 documents

📊 CHUNK STATISTICS
Total chunks: 2340
Average chunk size: 746 characters
Smallest chunk: 18 characters
Largest chunk: 2917 characters

📑 CHUNK TYPES
   general: 759 chunks
   educational: 282 chunks
   crisis: 268 chunks
   symptoms: 432 chunks
   treatment: 599 chunks
   Crisis-related chunks: 268

📄 SAMPLE CHUNKS

🔹 CHUNK 1
   Source: data\knowledge_base\Nursing-Mental-Health-and-Community-Concepts-Elizabeth-Christman-Kimberly-Ernstmeyer.pdf
   Type: general
   Urgent: False
   Length: 45 characters
   Content Preview:
   ----------------------------------------------------------------------------
   Nursing: Mental Health and Community Concepts
   ----------------------------------------------------------------------------

🔹 CHUNK 2
   Source: data\knowledge_base\Nursing-Mental-Health-and-Community-Concepts-Elizabeth-Christman-Kimberly-Ernstmeyer.pdf
   Type: general
   Urgent: False
   Lengt

In [9]:
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
    
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs
    )
    return embeddings

print("\n🔄 Loading HuggingFace embeddings model...")
embedding_model = download_embeddings()
print("✅ Embedding model loaded successfully.")
print("\n--- Pipeline Complete ---")
print("Variables available: 'extracted_data', 'document_chunks', 'embedding_model'")


🔄 Loading HuggingFace embeddings model...


  embeddings = HuggingFaceEmbeddings(


✅ Embedding model loaded successfully.

--- Pipeline Complete ---
Variables available: 'extracted_data', 'document_chunks', 'embedding_model'


In [10]:
embedding_model


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
vector = embedding_model.embed_query("Hello World")

In [12]:
vector

[-0.03447727486491203,
 0.03102312609553337,
 0.006734980270266533,
 0.026108933612704277,
 -0.03936205804347992,
 -0.16030246019363403,
 0.06692394614219666,
 -0.006441438104957342,
 -0.047450482845306396,
 0.014758863486349583,
 0.07087534666061401,
 0.05552757531404495,
 0.019193356856703758,
 -0.02625126577913761,
 -0.01010954286903143,
 -0.026940442621707916,
 0.022307462990283966,
 -0.02222665585577488,
 -0.14969263970851898,
 -0.017493024468421936,
 0.007676282897591591,
 0.054352231323719025,
 0.0032544038258492947,
 0.03172588348388672,
 -0.08462139964103699,
 -0.029405992478132248,
 0.051595550030469894,
 0.048124078661203384,
 -0.003314835485070944,
 -0.05827915295958519,
 0.04196925833821297,
 0.022210702300071716,
 0.1281888633966446,
 -0.022338951006531715,
 -0.011656239628791809,
 0.06292837113142014,
 -0.03287634998559952,
 -0.09122604131698608,
 -0.03117534890770912,
 0.052699536085128784,
 0.04703483358025551,
 -0.08420310169458389,
 -0.030056182295084,
 -0.0207448396

In [13]:
len(vector)

384

In [14]:
print("\n🔄 Vectorizing document chunks...")

# The embed_documents method takes a list of strings and returns a list of vectors.
# We extract the page_content from each chunk to create the list of texts.
chunk_texts = [chunk.page_content for chunk in document_chunks]
chunk_vectors = embedding_model.embed_documents(chunk_texts)


🔄 Vectorizing document chunks...


In [15]:
print(f"\n🔬 DISPLAYING FIRST 3 FULL VECTORS")
print(f"{'='*80}")

# Ensure we don't try to display more vectors than we have
# WARNING: This will produce a lot of output. Displaying 3 samples.
num_to_display = min(3, len(chunk_vectors))

if num_to_display > 0:
    # Get the vector dimension from the first vector
    vector_dim = len(chunk_vectors[0])
    
    for i in range(num_to_display):
        # Get the corresponding text chunk and the full vector
        text_chunk = document_chunks[i].page_content
        full_vector = chunk_vectors[i]
        
        print(f"\n--- Vector {i+1} ---")
        print(f"Corresponding Text: '{text_chunk[:80].replace(chr(10), ' ')}...'")
        print(f"Full Vector (all {vector_dim} dimensions):")
        print(full_vector)
else:
    print("No vectors to display.")

print(f"\n{'-'*80}")
print("\n--- Pipeline Complete ---")
print("All steps executed successfully.")


🔬 DISPLAYING FIRST 3 FULL VECTORS

--- Vector 1 ---
Corresponding Text: 'Nursing: Mental Health and Community Concepts...'
Full Vector (all 384 dimensions):
[0.0959155485033989, -0.03631218895316124, -0.11650706082582474, 0.08614123612642288, -0.05868690833449364, 0.030680939555168152, 0.023577218875288963, -0.09857787191867828, -0.0303410105407238, -0.03617561236023903, 0.01780286245048046, -0.005508528556674719, -0.10223645716905594, 0.005977744236588478, -0.0029672905802726746, -0.023380300030112267, -0.025420064106583595, -0.0006149439141154289, 0.032046105712652206, 0.11525987088680267, -0.06891768425703049, 0.021203182637691498, -0.018917573615908623, 0.0873158723115921, 0.021387264132499695, 0.116522878408432, 0.0015850603813305497, -0.04018966481089592, 0.050223205238580704, 0.10808158665895462, -0.010606200434267521, 0.04600377008318901, 0.060310669243335724, 0.07644859701395035, 0.03514194115996361, 0.024871826171875, -0.008428707718849182, 0.0619034618139267, -0.00087323616

In [16]:

# Check if the number of vectors matches the number of chunks
if len(document_chunks) == len(chunk_vectors):
    print("✅ The number of vectors matches the number of text chunks.")
else:
    print("❌ MISMATCH: The number of vectors does NOT match the number of text chunks.")

if chunk_vectors:
    # Check the dimensionality of the vectors
    vector_dim = len(chunk_vectors[0])
    print(f"Vector dimensionality (the length of each vector): {vector_dim}")
    
    # Print a small sample of the first vector
    print("\n🔬 Sample of the first vector (showing first 10 numbers):")
    print(chunk_vectors[0][:10])
    
print(f"{'='*80}")

✅ The number of vectors matches the number of text chunks.
Vector dimensionality (the length of each vector): 384

🔬 Sample of the first vector (showing first 10 numbers):
[0.0959155485033989, -0.03631218895316124, -0.11650706082582474, 0.08614123612642288, -0.05868690833449364, 0.030680939555168152, 0.023577218875288963, -0.09857787191867828, -0.0303410105407238, -0.03617561236023903]


In [17]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [18]:
CHROMA_API_KEY=os.getenv("CHROMA_API_KEY")
GEMINI_API_KEY=os.getenv("GEMINI_API_KEY")

os.environ["CHROMA_API_KEY"] = CHROMA_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY


In [19]:
from langchain_chroma import Chroma

CHROMA_DB_PATH = "data/chroma_db"

print(f"🔄 Creating ChromaDB with {len(document_chunks)} chunks...")
print(f"⏳ This will take 10-20 minutes. Please wait...\n")

# CREATE the database (not just load it)
vectordb = Chroma.from_documents(
    documents=document_chunks,
    embedding=embedding_model,
    persist_directory=CHROMA_DB_PATH,
    collection_name="mental_health"
)

print(f"\n✅ Database CREATED successfully!")
print(f"📊 Documents stored: {vectordb._collection.count()}")

🔄 Creating ChromaDB with 2340 chunks...
⏳ This will take 10-20 minutes. Please wait...


✅ Database CREATED successfully!
📊 Documents stored: 22116

✅ Database CREATED successfully!
📊 Documents stored: 22116


In [20]:
from langchain_community.vectorstores import Chroma

CHROMA_DB_PATH = "data/chroma_db"

# Load the existing database
vectordb = Chroma(
    persist_directory=CHROMA_DB_PATH,
    embedding_function=embedding_model,
    collection_name="mental_health"
)

  vectordb = Chroma(


In [21]:
print(f"📊 Number of documents in database: {vectordb._collection.count()}")


📊 Number of documents in database: 22116


In [40]:
results = vectordb.similarity_search("i feel lonely and useless", k=3)
print(f"\n🔍 Sample search results:")
for i, doc in enumerate(results, 1):
    print(f"{i}. {doc.page_content[:10000]}...")


🔍 Sample search results:
1. phone calls or video chats can help you and your loved ones feel less
lonely or isolated. Connect with your community or faith-based
organizations.
• Use the techniques described in the...
2. Social 
Isolation 
Aloneness experienced by the 
individual and perceived as imposed by 
others and as a negative or threatening 
state. 
• Absence of support 
system 
• Flat or sad affect 
• Feeling different from 
others 
• Values incongruent 
with social norms 
Chronic Low 
Self-Esteem 
Negative evaluation and/or feelings 
about one’s own capabilities, lasting at 
least three months. • Repeatedly 
unsuccessful in life 
events 
• Underestimates 
ability to deal with 
situation...
3. Social 
Isolation 
Aloneness experienced by the 
individual and perceived as imposed by 
others and as a negative or threatening 
state. 
• Absence of support 
system 
• Flat or sad affect 
• Feeling different from 
others 
• Values incongruent 
with social norms 
Chronic Low 
Self-Esteem