In [1]:
!pip install -q sentence-transformers boto3 pandas numpy scikit-learn

In [2]:
import json
import boto3
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
from sklearn.metrics.pairwise import cosine_similarity

s3 = boto3.client('s3')
bucket_name = 'medical-rag-data-b01015847' 

print("Environment setup complete!")

Environment setup complete!


## Loading the Data Function

In [3]:
def load_medical_data_from_s3():
    """Load all JSON files from S3 and create structured chunks"""
    
    print("loading medical data from S3...")
    
    response = s3.list_objects_v2(
        Bucket=bucket_name,
        Prefix='raw-data/'
    )
    
    documents = []
    file_count = 0
    
    if 'Contents' not in response:
        print("no files found in raw-data folder!")
        return documents
    
    for obj in response['Contents']:
        if obj['Key'].endswith('.json'):
            try:
                file_response = s3.get_object(Bucket=bucket_name, Key=obj['Key'])
                data = json.loads(file_response['Body'].read().decode('utf-8'))
                
                articles_to_process = []
                
                if isinstance(data, list):
                    articles_to_process = data
                    print(f"Found list with {len(data)} articles in {obj['Key']}")
                elif isinstance(data, dict) and 'name' in data:
                    articles_to_process = [data]
                else:
                    print(f"Skipping {obj['Key']} - unknown format")
                    continue
                
                for article_data in articles_to_process:
                    if not isinstance(article_data, dict):
                        continue
                        
                    filename = obj['Key'].split('/')[-1].replace('.json', '')
                    doc_id = article_data.get('name', filename).replace(' ', '_').lower()
                    
                    sections = ['overview', 'symptoms', 'causes', 'diagnosis', 
                               'treatment', 'prognosis', 'prevention', 'complications']
                    
                    for section in sections:
                        if section in article_data and article_data[section] and str(article_data[section]).strip():
                            chunk = {
                                'doc_id': doc_id,
                                'section': section,
                                'content': str(article_data[section]).strip(),
                                'url': article_data.get('url', ''),
                                'title': article_data.get('name', filename),
                                'chunk_id': f"{doc_id}_{section}"
                            }
                            documents.append(chunk)
                    
                    combined_content = ' '.join([
                        str(article_data.get(s, '')) for s in sections 
                        if article_data.get(s) and str(article_data.get(s)).strip()
                    ])
                    
                    if combined_content.strip():
                        documents.append({
                            'doc_id': doc_id,
                            'section': 'full_article',
                            'content': combined_content.strip(),
                            'url': article_data.get('url', ''),
                            'title': article_data.get('name', filename),
                            'chunk_id': f"{doc_id}_full"
                        })
                    
                    file_count += 1
                    if file_count % 500 == 0:
                        print(f"Processed {file_count} articles...")
                        
            except Exception as e:
                print(f"Error processing {obj['Key']}: {str(e)}")
                continue
    
    print(f" Loaded {file_count} articles, created {len(documents)} chunks")
    return documents

medical_documents = load_medical_data_from_s3()

loading medical data from S3...
Found list with 4203 articles in raw-data/all_articles.json
Processed 500 articles...
Processed 1000 articles...
Processed 1500 articles...
Processed 2000 articles...
Processed 2500 articles...
Processed 3000 articles...
Processed 3500 articles...
Processed 4000 articles...
Processed 4500 articles...
Processed 5000 articles...
 Loaded 5201 articles, created 24910 chunks


In [4]:
if medical_documents:
    print(f" Data Analysis:")
    print(f"Total chunks: {len(medical_documents)}")
    
    section_counts = {}
    for doc in medical_documents:
        section = doc['section']
        section_counts[section] = section_counts.get(section, 0) + 1
    
    print(f"\nChunks by section:")
    for section, count in sorted(section_counts.items()):
        print(f"  {section}: {count}")

    print(f"\n Sample document chunk:")
    sample = medical_documents[0]
    print(f"Title: {sample['title']}")
    print(f"Section: {sample['section']}")
    print(f"Content preview: {sample['content'][:200]}...")
    
else:
    print(" No documents loaded!")

 Data Analysis:
Total chunks: 24910

Chunks by section:
  causes: 3144
  complications: 2564
  diagnosis: 2020
  full_article: 5178
  overview: 4604
  prevention: 913
  symptoms: 2818
  treatment: 3669

 Sample document chunk:
Title: A1C test
Section: overview
Content preview: A1C is a lab test that shows the average level of blood sugar (glucose) over the previous 3 months. It shows how well you are controlling your blood sugar to help prevent complications from diabetes....


## Checking S3 contents

In [5]:
response = s3.list_objects_v2(Bucket=bucket_name, Prefix='raw-data/')

json_files = []
for obj in response['Contents']:
    if obj['Key'].endswith('.json'):
        json_files.append({
            'filename': obj['Key'],
            'size': obj['Size']
        })

print(f" Files in S3 bucket:")
print(f"Total JSON files: {len(json_files)}")

json_files_sorted = sorted(json_files, key=lambda x: x['size'], reverse=True)

print(f"\n Largest files (likely the combined file):")
for file in json_files_sorted[:5]:
    size_mb = file['size'] / (1024 * 1024)
    print(f"  {file['filename']}: {size_mb:.2f} MB")

print(f"\n First 10 files:")
for file in json_files[:10]:
    print(f"  {file['filename']}")

 Files in S3 bucket:
Total JSON files: 999

 Largest files (likely the combined file):
  raw-data/all_articles.json: 6.34 MB
  raw-data/congenital_heart_defect_-_corrective_surgery.json: 0.01 MB
  raw-data/aging_changes_in_the_heart_and_blood_vessels.json: 0.01 MB
  raw-data/covid-19_vaccine_-_what_you_need_to_know.json: 0.01 MB
  raw-data/communicating_with_patients.json: 0.01 MB

 First 10 files:
  raw-data/a1c_test.json
  raw-data/a_guide_to_clinical_trials_for_cancer.json
  raw-data/a_guide_to_help_children_understand_cancer.json
  raw-data/a_guide_to_herbal_remedies.json
  raw-data/aarskog_syndrome.json
  raw-data/aase_syndrome.json
  raw-data/abdomen_-_swollen.json
  raw-data/abdominal_aortic_aneurysm.json
  raw-data/abdominal_aortic_aneurysm_repair_-_open.json
  raw-data/abdominal_aortic_aneurysm_repair_-_open_-_discharge.json


## Loading Clean Data

In [6]:
def load_from_combined_file_only():
    """Load only from the combined all_articles.json file"""
    
    print("Loading from combined file only...")
    
    try:
        file_response = s3.get_object(Bucket=bucket_name, Key='raw-data/all_articles.json')
        articles_list = json.loads(file_response['Body'].read().decode('utf-8'))
        
        print(f"Found {len(articles_list)} articles in combined file")
        
        documents = []
        
        for i, article_data in enumerate(articles_list):
            if not isinstance(article_data, dict):
                continue

            doc_id = article_data.get('name', f'article_{i}').replace(' ', '_').lower()

            sections = ['overview', 'symptoms', 'causes', 'diagnosis', 
                       'treatment', 'prognosis', 'prevention', 'complications']
            
            for section in sections:
                if section in article_data and article_data[section] and str(article_data[section]).strip():
                    chunk = {
                        'doc_id': doc_id,
                        'section': section,
                        'content': str(article_data[section]).strip(),
                        'url': article_data.get('url', ''),
                        'title': article_data.get('name', f'Article {i}'),
                        'chunk_id': f"{doc_id}_{section}"
                    }
                    documents.append(chunk)
            
            if (i + 1) % 1000 == 0:
                print(f"Processed {i + 1} articles...")
        
        print(f"Created {len(documents)} chunks from {len(articles_list)} articles")
        return documents
        
    except Exception as e:
        print(f"Error: {str(e)}")
        return []

medical_documents_clean = load_from_combined_file_only()

Loading from combined file only...
Found 4203 articles in combined file
Processed 1000 articles...
Processed 2000 articles...
Processed 3000 articles...
Processed 4000 articles...
Created 15906 chunks from 4203 articles


## Analyzing Clean Data

In [7]:
if medical_documents_clean:
    print(f"Clean Data Analysis:")
    print(f"Total articles: 4203")
    print(f"Total chunks: {len(medical_documents_clean)}")
    print(f"Average chunks per article: {len(medical_documents_clean)/4203:.1f}")
    
    section_counts = {}
    for doc in medical_documents_clean:
        section = doc['section']
        section_counts[section] = section_counts.get(section, 0) + 1
    
    print(f"\n Chunks by section:")
    for section, count in sorted(section_counts.items()):
        print(f"  {section}: {count:,}")
    
    print(f"\n Sample document chunk:")
    sample = medical_documents_clean[0]
    print(f"Title: {sample['title']}")
    print(f"Section: {sample['section']}")
    print(f"Content length: {len(sample['content'])} characters")
    print(f"Content preview: {sample['content'][:200]}...")
    
    processed_data = json.dumps(medical_documents_clean, indent=2)
    s3.put_object(
        Bucket='medical-rag-processed-b01015847', 
        Key='processed_chunks.json',
        Body=processed_data
    )
    print(f"\n Saved processed chunks to S3!")
    
else:
    print(" No clean documents loaded!")

Clean Data Analysis:
Total articles: 4203
Total chunks: 15906
Average chunks per article: 3.8

 Chunks by section:
  causes: 2,537
  complications: 2,063
  diagnosis: 1,632
  overview: 3,700
  prevention: 736
  symptoms: 2,282
  treatment: 2,956

 Sample document chunk:
Title: A guide to clinical trials for cancer
Section: overview
Content length: 348 characters
Content preview: If you havecancer, a clinical trial may be an option for managing your cancer. A clinical trial is a study using people who agree to participate in new tests or treatments. Clinical trials help resear...

 Saved processed chunks to S3!


## Generating Embeddings

In [8]:
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully!")

def generate_embeddings_batch(documents, batch_size=100):
    """Generate embeddings for documents in batches"""
    
    print(f"Generating embeddings for {len(documents)} chunks...")
    
    embeddings_data = []
    
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        
        batch_content = [doc['content'] for doc in batch]

        batch_embeddings = model.encode(batch_content, show_progress_bar=False)
        
        for j, doc in enumerate(batch):
            embedding_record = {
                'chunk_id': doc['chunk_id'],
                'doc_id': doc['doc_id'],
                'section': doc['section'],
                'title': doc['title'],
                'content': doc['content'],
                'url': doc['url'],
                'embedding': batch_embeddings[j].tolist() 
            }
            embeddings_data.append(embedding_record)

        processed = min(i + batch_size, len(documents))
        if processed % 1000 == 0 or processed == len(documents):
            print(f"  Generated embeddings for {processed:,}/{len(documents):,} chunks")
    
    return embeddings_data

embeddings_data = generate_embeddings_batch(medical_documents_clean)

print(f" Generated {len(embeddings_data)} embeddings")
print(f" Embedding dimension: {len(embeddings_data[0]['embedding'])}")

Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully!
Generating embeddings for 15906 chunks...
  Generated embeddings for 1,000/15,906 chunks
  Generated embeddings for 2,000/15,906 chunks
  Generated embeddings for 3,000/15,906 chunks
  Generated embeddings for 4,000/15,906 chunks
  Generated embeddings for 5,000/15,906 chunks
  Generated embeddings for 6,000/15,906 chunks
  Generated embeddings for 7,000/15,906 chunks
  Generated embeddings for 8,000/15,906 chunks
  Generated embeddings for 9,000/15,906 chunks
  Generated embeddings for 10,000/15,906 chunks
  Generated embeddings for 11,000/15,906 chunks
  Generated embeddings for 12,000/15,906 chunks
  Generated embeddings for 13,000/15,906 chunks
  Generated embeddings for 14,000/15,906 chunks
  Generated embeddings for 15,000/15,906 chunks
  Generated embeddings for 15,906/15,906 chunks
 Generated 15906 embeddings
 Embedding dimension: 384


## Saving embeddings and creating search function

In [9]:
print("Saving embeddings to S3...")

embeddings_json = json.dumps(embeddings_data, indent=2)
s3.put_object(
    Bucket='medical-rag-processed-b01015847',
    Key='medical_embeddings.json',
    Body=embeddings_json
)

import pickle
with open('/tmp/embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_data, f)

s3.upload_file('/tmp/embeddings.pkl', 'medical-rag-processed-b01015847', 'medical_embeddings.pkl')

print("Embeddings saved to S3!")

def search_medical_information(query, top_k=5):
    """Search for relevant medical information using semantic similarity"""
    
    print(f"Searching for: '{query}'")

    query_embedding = model.encode([query])[0]
    
    similarities = []
    
    for i, embedding_record in enumerate(embeddings_data):
        chunk_embedding = np.array(embedding_record['embedding'])
        similarity = cosine_similarity([query_embedding], [chunk_embedding])[0][0]
        
        similarities.append({
            'score': similarity,
            'title': embedding_record['title'],
            'section': embedding_record['section'],
            'content': embedding_record['content'],
            'url': embedding_record['url']
        })
    
    similarities.sort(key=lambda x: x['score'], reverse=True)
    
    return similarities[:top_k]

print("Search function is ready")

Saving embeddings to S3...
Embeddings saved to S3!
Search function is ready


## Testing the RAG System

In [10]:
test_queries = [
    "What are the symptoms of diabetes?",
    "How to treat high blood pressure?",
    "What causes heart disease?",
    "Chest pain symptoms",
    "How to prevent stroke?"
]

print("Testing Medical RAG System")
print("=" * 50)

for query in test_queries:
    print(f"\nQuery: {query}")
    print("-" * 40)
    
    results = search_medical_information(query, top_k=3)
    
    for i, result in enumerate(results, 1):
        print(f"\n{i}. {result['title']} ({result['section']})")
        print(f"   Relevance: {result['score']:.3f}")
        print(f"   Content: {result['content'][:150]}...")
        if result['url']:
            print(f"   Source: {result['url']}")
    
    print("\n" + "="*50)

Testing Medical RAG System

Query: What are the symptoms of diabetes?
----------------------------------------
Searching for: 'What are the symptoms of diabetes?'

1. Managing your blood sugar (symptoms)
   Relevance: 0.685
   Content: Symptoms of this condition include:

• The time of day
• Your blood sugar level
• The amount of carbohydrates you ate
• The type and dose of your diab...
   Source: https://medlineplus.gov/ency/patientinstructions/000086.htm

2. Diabetes (symptoms)
   Relevance: 0.656
   Content: Symptoms of this condition include:

• Blurry vision
• Excess thirst
• Fatigue
• Frequent urination
• Hunger
• Weight loss...
   Source: https://medlineplus.gov/ency/article/001214.htm

3. Type 2 diabetes - self-care (symptoms)
   Relevance: 0.647
   Content: Symptoms of this condition include:

• Hunger
• Thirst
• Urinating a lot, getting up more often than usual at night to urinate
• Blurry vision
• More ...
   Source: https://medlineplus.gov/ency/patientinstructions/000328.ht

## Interactive search function

In [11]:
def interactive_medical_search():
    """Interactive search function for testing"""
    
    print("Medical Information RAG System")
    print("Ask questions about medical conditions, symptoms, treatments, etc.")
    print("Type 'quit' to exit")
    print("-" * 50)
    
    while True:
        user_query = input("\nEnter your medical question: ").strip()
        
        if user_query.lower() in ['quit', 'exit', 'q']:
            print("Thank you for using the Medical RAG System!")
            break
        
        if not user_query:
            print("Please enter a valid question.")
            continue
        
        try:
            results = search_medical_information(user_query, top_k=3)
            
            print(f"\nResults for: '{user_query}'")
            print("-" * 40)
            
            for i, result in enumerate(results, 1):
                print(f"\n{i}.{result['title']}")
                print(f" Section: {result['section']}")
                print(f" Relevance: {result['score']:.3f}")
                print(f" Content: {result['content'][:200]}...")
                if result['url']:
                    print(f" Source: {result['url']}")
            
        except Exception as e:
            print(f"Error: {str(e)}")

# uncomment the line below to start interactive mode
interactive_medical_search()

Medical Information RAG System
Ask questions about medical conditions, symptoms, treatments, etc.
Type 'quit' to exit
--------------------------------------------------



Enter your medical question:  quit


Thank you for using the Medical RAG System!


## System performance analysis

In [12]:
import time

def analyze_system_performance():
    """Analyze search performance and accuracy"""
    
    print("System Performance Analysis")
    print("=" * 40)
    
    test_query = "diabetes symptoms treatment"
    
    start_time = time.time()
    results = search_medical_information(test_query, top_k=5)
    search_time = time.time() - start_time
    
    print(f"Search Speed:")
    print(f" Query: '{test_query}'")
    print(f" Time: {search_time:.3f} seconds")
    print(f" Chunks searched: {len(embeddings_data):,}")
    print(f" Speed: {len(embeddings_data)/search_time:.0f} chunks/second")
    
    print(f"\nResult Quality:")
    for i, result in enumerate(results[:3], 1):
        print(f"   {i}. {result['title']} - Score: {result['score']:.3f}")
    
    print(f"\nSystem Statistics:")
    print(f" Total medical articles: 4,203")
    print(f" Total searchable chunks: {len(embeddings_data):,}")
    print(f" Embedding dimensions: 384")
    print(f" Model: all-MiniLM-L6-v2")
    print(f" Average search time: ~{search_time:.3f} seconds")

analyze_system_performance()

System Performance Analysis
Searching for: 'diabetes symptoms treatment'
Search Speed:
 Query: 'diabetes symptoms treatment'
 Time: 8.142 seconds
 Chunks searched: 15,906
 Speed: 1954 chunks/second

Result Quality:
   1. Drug-induced low blood sugar - Score: 0.632
   2. Hypothalamus - Score: 0.623
   3. Type 1 diabetes - Score: 0.618

System Statistics:
 Total medical articles: 4,203
 Total searchable chunks: 15,906
 Embedding dimensions: 384
 Model: all-MiniLM-L6-v2
 Average search time: ~8.142 seconds


## Uploading embeddings to DynamoDB

In [13]:
import boto3
from decimal import Decimal
import json

dynamodb = boto3.resource('dynamodb')
embeddings_table = dynamodb.Table('MedicalEmbeddings')

def deduplicate_embeddings(embeddings_data):
    """Remove duplicate chunk_ids from embeddings data"""
    
    print("Checking for duplicates...")
    
    seen_chunk_ids = set()
    deduplicated_data = []
    duplicates_found = 0
    
    for embedding_record in embeddings_data:
        chunk_id = embedding_record['chunk_id']
        
        if chunk_id not in seen_chunk_ids:
            seen_chunk_ids.add(chunk_id)
            deduplicated_data.append(embedding_record)
        else:
            duplicates_found += 1
    
    print(f"Original: {len(embeddings_data):,} items")
    print(f"Duplicates removed: {duplicates_found:,}")
    print(f"Final unique items: {len(deduplicated_data):,}")
    
    return deduplicated_data

def upload_embeddings_to_dynamodb(embeddings_data, batch_size=20):
    """Upload embeddings to DynamoDB in smaller batches"""
    
    clean_data = deduplicate_embeddings(embeddings_data)
    
    print(f"Uploading {len(clean_data)} unique embeddings to DynamoDB...")
    
    success_count = 0
    error_count = 0
    
    for i in range(0, len(clean_data), batch_size):
        try:
            batch = clean_data[i:i + batch_size]
            
            with embeddings_table.batch_writer() as batch_writer:
                for embedding_record in batch:
                    embedding_list = [Decimal(str(float(x))) for x in embedding_record['embedding']]
                    
                    item = {
                        'chunk_id': embedding_record['chunk_id'],
                        'doc_id': embedding_record['doc_id'],
                        'section': embedding_record['section'],
                        'title': embedding_record['title'],
                        'content': embedding_record['content'][:4000],
                        'url': embedding_record['url'],
                        'embedding': embedding_list
                    }
                    
                    batch_writer.put_item(Item=item)
            
            success_count += len(batch)

            if success_count % 1000 == 0 or success_count >= len(clean_data):
                print(f" Uploaded {success_count:,}/{len(clean_data):,} embeddings")
                
        except Exception as e:
            error_count += len(batch)
            print(f" Error in batch {i//batch_size + 1}: {str(e)}")
            continue
    
    print(f"Upload complete! Success: {success_count:,}, Errors: {error_count}")
    return success_count

success_count = upload_embeddings_to_dynamodb(embeddings_data)

Checking for duplicates...
Original: 15,906 items
Duplicates removed: 6
Final unique items: 15,900
Uploading 15900 unique embeddings to DynamoDB...
 Uploaded 1,000/15,900 embeddings
 Uploaded 2,000/15,900 embeddings
 Uploaded 3,000/15,900 embeddings
 Uploaded 4,000/15,900 embeddings
 Uploaded 5,000/15,900 embeddings
 Uploaded 6,000/15,900 embeddings
 Uploaded 7,000/15,900 embeddings
 Uploaded 8,000/15,900 embeddings
 Uploaded 9,000/15,900 embeddings
 Uploaded 10,000/15,900 embeddings
 Uploaded 11,000/15,900 embeddings
 Uploaded 12,000/15,900 embeddings
 Uploaded 13,000/15,900 embeddings
 Uploaded 14,000/15,900 embeddings
 Uploaded 15,000/15,900 embeddings
 Uploaded 15,900/15,900 embeddings
Upload complete! Success: 15,900, Errors: 0


## Verifying upload and testing retrieval 

In [14]:
import time

def verify_dynamodb_upload():
    """Verifying embeddings are properly stored in DynamoDB"""
    
    print("Verifying DynamoDB Upload:")
    
    dynamodb_client = boto3.client('dynamodb')
    response = dynamodb_client.describe_table(TableName='MedicalEmbeddings')
    print(f"   Table status: {response['Table']['TableStatus']}")
    
    sample_chunk_id = embeddings_data[0]['chunk_id']
    response = embeddings_table.get_item(Key={'chunk_id': sample_chunk_id})
    
    if 'Item' in response:
        item = response['Item']
        print(f"Sample retrieval successful:")
        print(f"   Title: {item['title']}")
        print(f"   Section: {item['section']}")
        print(f"   Content length: {len(item['content'])} chars")
        print(f"   Embedding dimensions: {len(item['embedding'])}")
        print(f"   URL: {item['url']}")
        
        return True
    else:
        print("Sample retrieval failed")
        return False

def create_dynamodb_search_function():
    """Creating search function that uses DynamoDB instead of memory"""
    
    def search_medical_information_dynamodb(query, top_k=5):
        """Search using DynamoDB stored embeddings"""
        
        print(f"Searching DynamoDB for: '{query}'")
        start_time = time.time()
        
        query_embedding = model.encode([query])[0]
 
        print(" Scanning DynamoDB table...")
        response = embeddings_table.scan()
        items = response['Items']
        
        while 'LastEvaluatedKey' in response:
            response = embeddings_table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
            items.extend(response['Items'])
        
        print(f" Retrieved {len(items):,} items from DynamoDB")
        
        similarities = []
        for item in items:
            item_embedding = [float(x) for x in item['embedding']]
            
            similarity = cosine_similarity([query_embedding], [item_embedding])[0][0]
            
            similarities.append({
                'score': similarity,
                'title': item['title'],
                'section': item['section'],
                'content': item['content'],
                'url': item['url']
            })
        
        similarities.sort(key=lambda x: x['score'], reverse=True)
        
        search_time = time.time() - start_time
        print(f" Search completed in {search_time:.2f} seconds")
        
        return similarities[:top_k]
    
    return search_medical_information_dynamodb

verify_success = verify_dynamodb_upload()

if verify_success:
    search_dynamodb = create_dynamodb_search_function()
    print("\nDynamoDB search function ready!")
else:
    print("\nUpload verification failed!")

Verifying DynamoDB Upload:
   Table status: ACTIVE
Sample retrieval successful:
   Title: A guide to clinical trials for cancer
   Section: overview
   Content length: 348 chars
   Embedding dimensions: 384
   URL: https://medlineplus.gov/ency/patientinstructions/000823.htm

DynamoDB search function ready!


In [15]:
if verify_success:
    print("Testing DynamoDB Search System")
    print("=" * 50)
    
    test_query = "diabetes treatment options"
    results = search_dynamodb(test_query, top_k=3)
    
    print(f"\nResults for: '{test_query}'")
    print("-" * 40)
    
    for i, result in enumerate(results, 1):
        print(f"\n{i}. {result['title']} ({result['section']})")
        print(f"   Relevance: {result['score']:.3f}")
        print(f"   Content: {result['content'][:150]}...")
        if result['url']:
            print(f"   Source: {result['url']}")
    
    print(f"\nDynamoDB vector database is working perfectly!")
    
else:
    print("Cannot test - upload verification failed")

Testing DynamoDB Search System
Searching DynamoDB for: 'diabetes treatment options'
 Scanning DynamoDB table...
 Retrieved 15,900 items from DynamoDB
 Search completed in 57.49 seconds

Results for: 'diabetes treatment options'
----------------------------------------

1. Diabetes tests and checkups (treatment)
   Relevance: 0.786
   Content: Treatment options include:

• Ask your health care provider questions
• Learn more about your diabetes and what you can do to keep your blood sugar in...
   Source: https://medlineplus.gov/ency/patientinstructions/000082.htm

2. Weight-loss surgery and children (treatment)
   Relevance: 0.747
   Content: Treatment options include:

• Better control of diabetes
• Lower cholesterol and blood pressure
• Fewer sleep problems...
   Source: https://medlineplus.gov/ency/patientinstructions/000356.htm

3. Drug-induced low blood sugar (treatment)
   Relevance: 0.742
   Content: Treatment options include:

• Drinking alcohol
• Getting more activity than usu

## Installing and loading hugging face model

In [16]:
!pip install -q transformers torch

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

print("Loading Hugging Face model for text generation...")

try:
    summarizer = pipeline(
        "summarization", 
        model="facebook/bart-large-cnn",
        device=0 if torch.cuda.is_available() else -1
    )
    print("BART summarization model loaded successfully!")
    
except Exception as e:
    print(f" Error loading BART: {e}")
    
    try:
        summarizer = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=-1  # CPU only
        )
        print("DistilBART model loaded successfully!")
    except Exception as e2:
        print(f"Error loading fallback model: {e2}")
        summarizer = None

if summarizer:
    test_text = "Diabetes is a group of metabolic disorders characterized by high blood sugar levels over a prolonged period."
    test_summary = summarizer(test_text, max_length=50, min_length=20)
    print(f"Test summary: {test_summary[0]['summary_text']}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading Hugging Face model for text generation...


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 50, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


BART summarization model loaded successfully!
Test summary: Diabetes is a group of metabolic disorders characterized by high blood sugar levels over a prolonged period.


## Enhanced medical RAG with LLM

In [17]:
def enhanced_medical_search_with_llm(query, top_k=3):
    """Enhanced search that combines retrieval with LLM generation"""
    
    print(f"Enhanced search for: '{query}'")
    
    if 'search_dynamodb' in globals():
        retrieved_results = search_dynamodb(query, top_k=top_k)
    else:
        retrieved_results = search_medical_information(query, top_k=top_k)
    
    if not retrieved_results or not summarizer:
        return {
            'query': query,
            'retrieved_results': retrieved_results,
            'generated_response': "LLM not available, showing retrieved results only.",
            'method': 'retrieval_only'
        }
    
    combined_content = ""
    sources = []
    
    for result in retrieved_results:
        combined_content += f"\n{result['title']} ({result['section']}):\n{result['content'][:300]}...\n"
        sources.append({
            'title': result['title'],
            'section': result['section'],
            'url': result.get('url', ''),
            'relevance': result.get('score', 0)
        })

    try:
        medical_prompt = f"""Medical Question: {query}

Relevant Medical Information:
{combined_content}

Please provide a clear, accurate summary based on the medical information provided above."""
        
        if len(medical_prompt) > 1000:
            medical_prompt = medical_prompt[:1000] + "..."
        
        generated = summarizer(
            medical_prompt,
            max_length=200,
            min_length=50,
            do_sample=False
        )
        
        generated_response = generated[0]['summary_text']
        
    except Exception as e:
        print(f"LLM generation failed: {e}")
        generated_response = f"Based on medical sources: {retrieved_results[0]['content'][:200]}..."
    
    return {
        'query': query,
        'generated_response': generated_response,
        'retrieved_results': retrieved_results,
        'sources': sources,
        'method': 'retrieval_plus_generation',
        'timestamp': time.time()
    }

print("Enhanced Medical RAG with LLM ready!")

Enhanced Medical RAG with LLM ready!


## Testing Enhanced medical RAG system

In [18]:
test_queries = [
    "What are the symptoms of diabetes?",
    "How to treat high blood pressure?", 
    "What causes heart disease?",
    "Chest pain symptoms and causes"
]

print("Testing Enhanced Medical RAG System with LLM")
print("=" * 60)

for query in test_queries:
    print(f"\nQuery: {query}")
    print("-" * 50)
    
    enhanced_result = enhanced_medical_search_with_llm(query, top_k=3)
    
    print(f"**Generated Response:**")
    print(f"{enhanced_result['generated_response']}")
    
    print(f"\n**Retrieved Sources:**")
    for i, source in enumerate(enhanced_result['sources'][:2], 1):
        print(f"   {i}. {source['title']} ({source['section']})")
        if 'relevance' in source:
            print(f"      Relevance: {source['relevance']:.3f}")
    
    print(f"\nMethod: {enhanced_result['method']}")
    print("\n" + "="*60)

Testing Enhanced Medical RAG System with LLM

Query: What are the symptoms of diabetes?
--------------------------------------------------
Enhanced search for: 'What are the symptoms of diabetes?'
Searching DynamoDB for: 'What are the symptoms of diabetes?'
 Scanning DynamoDB table...
 Retrieved 15,900 items from DynamoDB
 Search completed in 57.14 seconds
**Generated Response:**
Diabetes (symptoms) includes: Urinating a lot, getting up more often than usual at night to urinate, blurry vision and weight loss. Diabetes self-care includes: Hunger, Fatigue, Urination, Blurry vision, Red skin rashes and Tinglin.

**Retrieved Sources:**
   1. Managing your blood sugar (symptoms)
      Relevance: 0.685
   2. Diabetes (symptoms)
      Relevance: 0.656

Method: retrieval_plus_generation


Query: How to treat high blood pressure?
--------------------------------------------------
Enhanced search for: 'How to treat high blood pressure?'
Searching DynamoDB for: 'How to treat high blood pressure?'

Your max_length is set to 200, but your input_length is only 189. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)


 Search completed in 55.17 seconds
**Generated Response:**
Please provide a clear, accurate summary based on the medical information provided above. Medical Question: How to treat high blood pressure? How do you get low-salt diet? What are your treatment options for polycystic kidney disease? What is your treatment for the aneurysm?

**Retrieved Sources:**
   1. Atherosclerosis (treatment)
      Relevance: 0.705
   2. Polycystic kidney disease (treatment)
      Relevance: 0.691

Method: retrieval_plus_generation


Query: What causes heart disease?
--------------------------------------------------
Enhanced search for: 'What causes heart disease?'
Searching DynamoDB for: 'What causes heart disease?'
 Scanning DynamoDB table...
 Retrieved 15,900 items from DynamoDB
 Search completed in 54.74 seconds
**Generated Response:**
Heart failure in children (causes): Infection from a virus or bacteria that causes damage to the heart muscle or heart valves. Drugs used for other illnesses, most oft

## Comparing systems

In [19]:
comparison_query = "diabetes treatment and management"

print("SYSTEM COMPARISON")
print("=" * 50)

print("Traditional Retrieval Results:")
traditional_results = search_medical_information(comparison_query, top_k=3)
for i, result in enumerate(traditional_results[:2], 1):
    print(f"{i}. {result['title']}: {result['content'][:100]}...")

print("\n" + "-"*50)

print("Enhanced with LLM:")
enhanced_result = enhanced_medical_search_with_llm(comparison_query, top_k=3)
print(f"Generated Response: {enhanced_result['generated_response']}")


SYSTEM COMPARISON
Traditional Retrieval Results:
Searching for: 'diabetes treatment and management'
1. Diabetes tests and checkups: Treatment options include:

• Ask your health care provider questions
• Learn more about your diabet...
2. Drug-induced low blood sugar: Treatment options include:

• Drinking alcohol
• Getting more activity than usual
• Intentionally or...

--------------------------------------------------
Enhanced with LLM:
Enhanced search for: 'diabetes treatment and management'
Searching DynamoDB for: 'diabetes treatment and management'
 Scanning DynamoDB table...
 Retrieved 15,900 items from DynamoDB


Your max_length is set to 200, but your input_length is only 188. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)


 Search completed in 56.06 seconds
Generated Response: Please provide a clear, accurate summary based on the medical information provided above. Medical Question: diabetes treatment and management. Relevant Medical Information: diabetes tests and checkups (treatment), drug-induced low blood sugar (treatment) and managing blood sugar during exercise.


## Improved LLM Prompting

In [20]:
def enhanced_medical_search_with_improved_llm(query, top_k=3):
    """Enhanced search with better LLM prompting"""
    
    print(f"Enhanced search for: '{query}'")
    
    if 'search_dynamodb' in globals():
        retrieved_results = search_dynamodb(query, top_k=top_k)
    else:
        retrieved_results = search_medical_information(query, top_k=top_k)
    
    if not retrieved_results or not summarizer:
        return {
            'query': query,
            'retrieved_results': retrieved_results,
            'generated_response': "Showing retrieved medical information.",
            'method': 'retrieval_only'
        }
    
    medical_facts = []
    sources = []
    
    for result in retrieved_results:
        content = result['content'].strip()
        if content:
            medical_facts.append(f"From {result['title']} ({result['section']}): {content[:200]}")
            sources.append({
                'title': result['title'],
                'section': result['section'],
                'url': result.get('url', ''),
                'relevance': result.get('score', 0)
            })
    
    try:
        combined_facts = "\n\n".join(medical_facts[:3])
        
        medical_content = f"""Medical information about {query}:

{combined_facts}

Based on this medical information, here are the key points:"""
        
        generated = summarizer(
            medical_content,
            max_length=min(150, len(medical_content.split()) + 50),
            min_length=30,
            do_sample=False,
            no_repeat_ngram_size=3
        )
        
        generated_response = generated[0]['summary_text']
        
        if generated_response.startswith("Medical information about"):
            generated_response = generated_response.split("Based on this medical information, here are the key points:")[-1].strip()
        
    except Exception as e:
        print(f"LLM generation failed: {e}")
        generated_response = f"According to {retrieved_results[0]['title']}: {retrieved_results[0]['content'][:150]}..."
    
    return {
        'query': query,
        'generated_response': generated_response,
        'retrieved_results': retrieved_results,
        'sources': sources,
        'method': 'enhanced_rag',
        'timestamp': time.time()
    }

print("Improved Medical RAG ready!")

Improved Medical RAG ready!


## Testing improved system

In [21]:
test_query = "How to treat high blood pressure?"

print("Testing Improved Medical RAG")
print("=" * 50)

improved_result = enhanced_medical_search_with_improved_llm(test_query)

print(f"Query: {test_query}")
print(f"\nEnhanced Response:")
print(f"{improved_result['generated_response']}")

print(f"\nSources Used:")
for i, source in enumerate(improved_result['sources'], 1):
    print(f"   {i}. {source['title']} ({source['section']}) - Relevance: {source['relevance']:.3f}")


Testing Improved Medical RAG
Enhanced search for: 'How to treat high blood pressure?'
Searching DynamoDB for: 'How to treat high blood pressure?'
 Scanning DynamoDB table...
 Retrieved 15,900 items from DynamoDB
 Search completed in 54.21 seconds
Query: How to treat high blood pressure?

Enhanced Response:
Do not stop or change high blood pressure medicines without talking to your provider. Eat a heart-healthy diet. Get regular exercise. Stop smoking (if you smoke).

Sources Used:
   1. Atherosclerosis (treatment) - Relevance: 0.705
   2. Polycystic kidney disease (treatment) - Relevance: 0.691
   3. Abdominal aortic aneurysm repair - open - discharge (treatment) - Relevance: 0.633


## Test -Check available diabetes content

In [22]:
def find_diabetes_content():
    """Finding all diabetes-related content in database"""
    
    diabetes_content = []
    
    for doc in medical_documents_clean:
        title = doc['title'].lower()
        content = doc['content'].lower()
        
        if any(term in title or term in content for term in ['diabetes', 'diabetic', 'insulin', 'blood sugar']):
            diabetes_content.append({
                'title': doc['title'],
                'section': doc['section'],
                'content': doc['content'][:200] + '...',
                'url': doc['url']
            })
    
    return diabetes_content

diabetes_articles = find_diabetes_content()

print(f"Found {len(diabetes_articles)} diabetes-related chunks")
print("\nDiabetes Treatment Content:")

treatment_content = [doc for doc in diabetes_articles if 'treatment' in doc['section']]
print(f"Treatment sections: {len(treatment_content)}")

for doc in treatment_content[:5]:
    print(f"\n• {doc['title']} ({doc['section']})")
    print(f"  Content: {doc['content']}")

Found 682 diabetes-related chunks

Diabetes Treatment Content:
Treatment sections: 140

• Abdominal aortic aneurysm repair - open - discharge (treatment)
  Content: Treatment options include:

• Eat a heart-healthy diet.
• Get regular exercise.
• Stop smoking (if you smoke).
• Take the medicines your health care provider has prescribed as directed. These may incl...

• Acanthosis nigricans (treatment)
  Content: Treatment options include:

• Blood tests to check blood sugar level or insulin level
• Endoscopy
• X-rays...

• Acromegaly (treatment)
  Content: Treatment options include:

• Blood glucose
• Growth hormone
• Insulin-like growth factor 1 (IGF-1)
• Prolactin
• Spine x-ray
• MRI of the brain, including the pituitary gland
• Echocardiogram
• Colon...

• Adrenal glands (treatment)
  Content: Treatment options include:

• Addison disease, also called adrenal insufficiency -- an autoimmune disorder that causes the adrenal glands to not produce enough hormones
• Congenital adrenal hy

## Finding specific diabetes articles

In [23]:
def find_primary_diabetes_articles():
    """Find articles where diabetes is the main topic"""
    
    primary_diabetes = []
    
    for doc in medical_documents_clean:
        title = doc['title'].lower()
        
        # Look for articles where diabetes is the main subject
        if any(term in title for term in ['diabetes', 'diabetic']):
            primary_diabetes.append({
                'title': doc['title'],
                'section': doc['section'],
                'content': doc['content'][:300] + '...',
                'url': doc['url']
            })
    
    return primary_diabetes

primary_diabetes_articles = find_primary_diabetes_articles()

print(f"Found {len(primary_diabetes_articles)} articles specifically about diabetes")

from collections import defaultdict
grouped = defaultdict(list)

for doc in primary_diabetes_articles:
    grouped[doc['title']].append(doc['section'])

print(f"\nPrimary Diabetes Articles:")
for title, sections in list(grouped.items())[:10]:
    print(f"\n• {title}")
    print(f"  Sections: {', '.join(sections)}")

print(f"\nDiabetes Treatment Sections:")
treatment_sections = [doc for doc in primary_diabetes_articles if 'treatment' in doc['section']]

for doc in treatment_sections[:5]:
    print(f"\n• {doc['title']} - {doc['section']}")
    print(f"  {doc['content']}")

Found 176 articles specifically about diabetes

Primary Diabetes Articles:

• Central diabetes insipidus
  Sections: overview, symptoms, diagnosis, treatment

• Cranial mononeuropathy III - diabetic type
  Sections: overview, symptoms, causes, treatment, complications

• Diabetes
  Sections: overview, symptoms, causes, diagnosis, treatment, complications

• Diabetes â resources
  Sections: overview, treatment, prevention, complications

• Diabetes - foot ulcers
  Sections: overview, symptoms, causes, diagnosis, treatment

• Diabetes - insulin therapy
  Sections: causes, treatment

• Diabetes - keeping active
  Sections: overview, symptoms, causes, complications

• Diabetes - preventing heart attack and stroke
  Sections: causes, treatment, complications

• Diabetes - taking care of your feet
  Sections: causes, treatment, complications

• Diabetes - when you are sick
  Sections: overview, symptoms, causes, treatment

Diabetes Treatment Sections:

• Central diabetes insipidus - treatm

## Checking DynamoDB content

In [24]:
def check_dynamodb_diabetes_content():
    """Check what diabetes content is actually in DynamoDB"""
    
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('MedicalEmbeddings')

    response = table.scan(
        FilterExpression='contains(title, :diabetes)',
        ExpressionAttributeValues={':diabetes': 'Diabetes'}
    )
    
    items = response['Items']
    
    print(f"Found {len(items)} diabetes items in DynamoDB")

    treatment_items = [item for item in items if item.get('section') == 'treatment']
    
    print(f"Diabetes treatment items: {len(treatment_items)}")
    
    for item in treatment_items[:5]:
        print(f"\n• {item.get('title')} - {item.get('section')}")
        print(f"  Content: {item.get('content', '')[:200]}...")
        print(f"  Chunk ID: {item.get('chunk_id')}")

check_dynamodb_diabetes_content()

Found 2 diabetes items in DynamoDB
Diabetes treatment items: 0


## Uploading missing dynamodb content

In [25]:
def upload_missing_diabetes_content():
    """Upload diabetes-specific content that was missed"""
    
    print("Finding diabetes content to upload...")
    
    diabetes_content = []
    for doc in medical_documents_clean:
        title = doc['title'].lower()
        if 'diabetes' in title:
            diabetes_content.append(doc)
    
    print(f"Found {len(diabetes_content)} diabetes articles to upload")
    
    diabetes_embeddings = []
    for doc in diabetes_content:
        for emb in embeddings_data:
            if emb['chunk_id'] == doc['chunk_id']:
                diabetes_embeddings.append(emb)
                break
    
    print(f"Found {len(diabetes_embeddings)} diabetes embeddings to upload")

    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('MedicalEmbeddings')
    
    uploaded = 0
    
    for emb in diabetes_embeddings:
        try:
            embedding_list = [Decimal(str(float(x))) for x in emb['embedding']]
            
            item = {
                'chunk_id': emb['chunk_id'],
                'doc_id': emb['doc_id'],
                'section': emb['section'],
                'title': emb['title'],
                'content': emb['content'][:4000],
                'url': emb['url'],
                'embedding': embedding_list
            }
            
            table.put_item(Item=item)
            uploaded += 1
            
            if uploaded % 10 == 0:
                print(f" Uploaded {uploaded} diabetes items...")
                
        except Exception as e:
            print(f" Error uploading {emb['chunk_id']}: {str(e)}")
    
    print(f" Successfully uploaded {uploaded} diabetes items!")

upload_missing_diabetes_content()

Finding diabetes content to upload...
Found 149 diabetes articles to upload
Found 149 diabetes embeddings to upload
 Uploaded 10 diabetes items...
 Uploaded 20 diabetes items...
 Uploaded 30 diabetes items...
 Uploaded 40 diabetes items...
 Uploaded 50 diabetes items...
 Uploaded 60 diabetes items...
 Uploaded 70 diabetes items...
 Uploaded 80 diabetes items...
 Uploaded 90 diabetes items...
 Uploaded 100 diabetes items...
 Uploaded 110 diabetes items...
 Uploaded 120 diabetes items...
 Uploaded 130 diabetes items...
 Uploaded 140 diabetes items...
 Successfully uploaded 149 diabetes items!


## Verifying the diabetes upload

In [26]:
def verify_diabetes_upload():
    """Check if diabetes treatment content is now in DynamoDB"""
    
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('MedicalEmbeddings')
    
    response = table.scan(
        FilterExpression='contains(title, :diabetes) AND #section = :treatment',
        ExpressionAttributeNames={'#section': 'section'},
        ExpressionAttributeValues={
            ':diabetes': 'Diabetes',
            ':treatment': 'treatment'
        }
    )
    
    items = response['Items']
    
    print(f"Found {len(items)} diabetes treatment items in DynamoDB")
    
    for item in items:
        print(f"\n• Title: {item.get('title')}")
        print(f"  Section: {item.get('section')}")
        print(f"  Chunk ID: {item.get('chunk_id')}")
        print(f"  Content preview: {item.get('content', '')[:150]}...")

    response2 = table.scan(
        FilterExpression='contains(title, :diabetes)',
        ExpressionAttributeValues={':diabetes': 'Diabetes'}
    )
    
    print(f"\nTotal diabetes items in DynamoDB: {len(response2['Items'])}")
    
    # Show titles
    titles = set()
    for item in response2['Items']:
        titles.add(item.get('title'))
    
    print(f"\nDiabetes article titles:")
    for title in sorted(titles):
        print(f"  • {title}")

verify_diabetes_upload()

Found 0 diabetes treatment items in DynamoDB

Total diabetes items in DynamoDB: 2

Diabetes article titles:
  • Diabetes and nerve damage
  • Diabetes eye care


## Debugging upload issue

In [27]:
def debug_upload_issue():
    """Debugging why diabetes content isn't uploading properly"""
    
    print("Debugging upload issue...")
    
    diabetes_treatment = None
    
    for doc in medical_documents_clean:
        if doc['title'].lower() == 'diabetes' and doc['section'] == 'treatment':
            diabetes_treatment = doc
            break
    
    if diabetes_treatment:
        print(f"Found 'Diabetes - treatment' in local data:")
        print(f"   Title: {diabetes_treatment['title']}")
        print(f"   Section: {diabetes_treatment['section']}")
        print(f"   Chunk ID: {diabetes_treatment['chunk_id']}")
        print(f"   Content: {diabetes_treatment['content'][:200]}...")

        try:
            embedding_item = None
            for emb in embeddings_data:
                if emb['chunk_id'] == diabetes_treatment['chunk_id']:
                    embedding_item = emb
                    break
            
            if embedding_item:
                print(f"Found corresponding embedding")
                
                # Try manual upload
                dynamodb = boto3.resource('dynamodb')
                table = dynamodb.Table('MedicalEmbeddings')
                
                embedding_list = [Decimal(str(float(x))) for x in embedding_item['embedding']]
                
                item = {
                    'chunk_id': embedding_item['chunk_id'],
                    'doc_id': embedding_item['doc_id'],
                    'section': embedding_item['section'],
                    'title': embedding_item['title'],
                    'content': embedding_item['content'],
                    'url': embedding_item['url'],
                    'embedding': embedding_list
                }
                
                table.put_item(Item=item)
                print(f"Successfully uploaded diabetes treatment item!")
                
            else:
                print(f"No corresponding embedding found")
        
        except Exception as e:
            print(f"Upload error: {str(e)}")
    
    else:
        print(f"'Diabetes - treatment' not found in local data")
        
        diabetes_titles = set()
        for doc in medical_documents_clean:
            if 'diabetes' in doc['title'].lower():
                diabetes_titles.add(f"{doc['title']} - {doc['section']}")
        
        print(f"\nAvailable diabetes content:")
        for title in sorted(diabetes_titles):
            print(f"   • {title}")

debug_upload_issue()

Debugging upload issue...
Found 'Diabetes - treatment' in local data:
   Title: Diabetes
   Section: treatment
   Chunk ID: diabetes_treatment
   Content: Treatment options include:

• Fasting blood glucose level. Diabetes is diagnosed if the fasting glucose level is 126 mg/dL (7.0 mmol/L) or higher on two different tests, when the person is in their us...
Found corresponding embedding
Successfully uploaded diabetes treatment item!


## Debug issue

In [28]:
def comprehensive_diabetes_debug():
    """
    Comprehensive debugging to find and upload all diabetes-related content
    """
    print("=== COMPREHENSIVE DIABETES DEBUG ===\n")
    
    diabetes_content = []
    
    for doc in medical_documents_clean:
        if 'diabetes' in doc['title'].lower():
            diabetes_content.append({
                'title': doc['title'],
                'section': doc['section'], 
                'chunk_id': doc['chunk_id'],
                'content': doc['content'][:150] + "...",
                'url': doc.get('url', 'N/A')
            })
    
    print(f"Found {len(diabetes_content)} diabetes-related chunks in local data:")
    for i, content in enumerate(diabetes_content, 1):
        print(f"{i:2d}. {content['title']} - {content['section']}")
        print(f"     Chunk ID: {content['chunk_id']}")
        print(f"     Content: {content['content']}")
        print(f"     URL: {content['url']}\n")
    
    print("=== CHECKING DYNAMODB CONTENT ===")
    
    try:
        dynamodb = boto3.resource('dynamodb')
        table = dynamodb.Table('MedicalEmbeddings')

        response = table.scan(
            FilterExpression=Attr('title').contains('diabetes') | Attr('title').contains('Diabetes'),
            ProjectionExpression='chunk_id, title, #section',
            ExpressionAttributeNames={'#section': 'section'}
        )
        
        db_diabetes = response['Items']
        print(f"Found {len(db_diabetes)} diabetes items in DynamoDB:")
        
        for item in db_diabetes:
            print(f"   • {item['title']} - {item['section']} (ID: {item['chunk_id']})")
            
    except Exception as e:
        print(f"DynamoDB check failed: {str(e)}")
    
    print("\n=== UPLOADING MISSING DIABETES CONTENT ===")
    
    uploaded_count = 0
    
    for doc in medical_documents_clean:
        if 'diabetes' in doc['title'].lower():
            try:
                embedding_item = None
                for emb in embeddings_data:
                    if emb['chunk_id'] == doc['chunk_id']:
                        embedding_item = emb
                        break
                
                if embedding_item:
                    embedding_list = [Decimal(str(float(x))) for x in embedding_item['embedding']]
                    
                    item = {
                        'chunk_id': embedding_item['chunk_id'],
                        'doc_id': embedding_item['doc_id'],
                        'section': embedding_item['section'],
                        'title': embedding_item['title'],
                        'content': embedding_item['content'],
                        'url': embedding_item['url'],
                        'embedding': embedding_list
                    }

                    table.put_item(Item=item)
                    print(f"Uploaded: {item['title']} - {item['section']}")
                    uploaded_count += 1
                    
                else:
                    print(f"No embedding found for: {doc['title']} - {doc['section']}")
                    
            except Exception as e:
                print(f"Upload failed for {doc['title']} - {doc['section']}: {str(e)}")
    
    print(f"\nSuccessfully uploaded {uploaded_count} diabetes items!")
    
    return diabetes_content

def test_lambda_search_locally():
    """
    Testing the same search logic locally to debug
    """
    print("\n=== TESTING SEARCH LOGIC LOCALLY ===")
    
    query = "diabetes treatment"
    query_lower = query.lower()
    
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('MedicalEmbeddings')
    
    response = table.scan(
        ProjectionExpression='chunk_id, title, #section, content, #url',
        ExpressionAttributeNames={
            '#url': 'url',
            '#section': 'section'
        },
        Limit=3000
    )
    items = response['Items']
    
    print(f"Scanned {len(items)} total items from DynamoDB")
    
    scored_items = []
    
    for item in items:
        content = item.get('content', '').lower()
        title = item.get('title', '').lower()
        section = item.get('section', '').lower()

        score = 0

        if 'diabetes' in title and 'treatment' in section:
            score += 100
        elif 'diabetes' in title:
            if 'treatment' in query.lower():
                if 'treatment' in section:
                    score += 80
                elif 'treatment' in content:
                    score += 60
            else:
                score += 40
        elif 'diabetes' in content and 'treatment' in section:
            diabetes_count = content.count('diabetes')
            score += 30 + (diabetes_count * 5)
        elif 'diabetes' in content and 'treatment' in query.lower():
            score += 10

        if score >= 10:
            scored_items.append({
                'score': score,
                'title': item.get('title', ''),
                'section': item.get('section', ''),
                'content': item.get('content', '')[:200] + "...",
                'url': item.get('url', ''),
                'exact_match': score >= 80
            })
    
    scored_items.sort(key=lambda x: x['score'], reverse=True)
    
    print(f"\nFound {len(scored_items)} relevant items:")
    for i, item in enumerate(scored_items[:10], 1):
        print(f"{i:2d}. Score: {item['score']:3d} | {item['title']} - {item['section']}")
        print(f"     Content: {item['content']}")
        print()
    
    return scored_items

diabetes_debug_results = comprehensive_diabetes_debug()

local_search_results = test_lambda_search_locally()

=== COMPREHENSIVE DIABETES DEBUG ===

Found 149 diabetes-related chunks in local data:
 1. Central diabetes insipidus - overview
     Chunk ID: central_diabetes_insipidus_overview
     Content: Central diabetes insipidus is a rare condition that involves extreme thirst and excessive urination....
     URL: https://medlineplus.gov/ency/article/000460.htm

 2. Central diabetes insipidus - symptoms
     Chunk ID: central_diabetes_insipidus_symptoms
     Content: Symptoms of this condition include:

• Increased urine production
• Excessive thirst
• Confusion and changes in alertness due to dehydration and highe...
     URL: https://medlineplus.gov/ency/article/000460.htm

 3. Central diabetes insipidus - diagnosis
     Chunk ID: central_diabetes_insipidus_diagnosis
     Content: Diagnosis may involve:

• Blood sodiumandosmolarity
• Desmopressin (DDAVP) challenge
• MRI of the head
• Urinalysis
• Urine concentrationand osmolarit...
     URL: https://medlineplus.gov/ency/article/000460.htm

 

## Lambda clear cache

In [None]:
import boto3
import json

def clear_lambda_cache():
    """
    Clear the QueryCache table to force fresh searches
    """
    
    print("=== CLEARING LAMBDA CACHE ===")
    
    try:
        dynamodb = boto3.resource('dynamodb')
        cache_table = dynamodb.Table('QueryCache')
        
        response = cache_table.scan()
        items = response['Items']
        
        print(f"Found {len(items)} cached queries:")
        
        for item in items:
            print(f"  • {item.get('query', 'Unknown query')}")
        
        deleted_count = 0
        
        for item in items:
            try:
                cache_table.delete_item(
                    Key={'query_hash': item['query_hash']}
                )
                deleted_count += 1
            except Exception as e:
                print(f"Failed to delete item: {str(e)}")
        
        print(f"\nSuccessfully cleared {deleted_count} cached items")
        print("Next Lambda request will use the new universal search algorithm!")
        
    except Exception as e:
        print(f"Failed to clear cache: {str(e)}")

def test_cache_clearing():
    """
    Test a query to see if cache was cleared
    """
    print("\n=== TESTING CACHE CLEARING ===")
    
    print("After clearing cache, your next Lambda test should:")
    print("1. Show 'cached': false")
    print("2. Show 'method': 'universal_medical_search'") 
    print("3. Return actual diabetes treatment content")
    print("4. Have much higher relevance scores (100+ for exact matches)")

clear_lambda_cache()
test_cache_clearing()

=== CLEARING LAMBDA CACHE ===
Found 3 cached queries:
  • what are the causes of headache
  • headache causes
  • migraine treatment

✅ Successfully cleared 3 cached items
Next Lambda request will use the new universal search algorithm!

=== TESTING CACHE CLEARING ===
After clearing cache, your next Lambda test should:
1. Show 'cached': false
2. Show 'method': 'universal_medical_search'
3. Return actual diabetes treatment content
4. Have much higher relevance scores (100+ for exact matches)
