In [1]:
import google.generativeai as genai
genai.configure(api_key="AIzaSyCS08G9sKCceHkCHZdpBVV9fS_RoQSjPyM")

print("🔍 Checking available Gemini models...")

try:
    models = genai.list_models()
    for model in models:
        if 'gemini' in model.name.lower():
            print(f"✅ Found: {model.name}")
            print(f"   Supported methods: {model.supported_generation_methods}")
except Exception as e:
    print(f"❌ Error listing models: {e}")

🔍 Checking available Gemini models...
✅ Found: models/gemini-2.5-flash
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
✅ Found: models/gemini-2.5-pro
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
✅ Found: models/gemini-2.0-flash-exp
   Supported methods: ['generateContent', 'countTokens', 'bidiGenerateContent']
✅ Found: models/gemini-2.0-flash
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
✅ Found: models/gemini-2.0-flash-001
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
✅ Found: models/gemini-2.0-flash-exp-image-generation
   Supported methods: ['generateContent', 'countTokens', 'bidiGenerateContent']
✅ Found: models/gemini-2.0-flash-lite-001
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
✅ Found: models

In [2]:
# Step 1: Test with the actual available models
import google.generativeai as genai
genai.configure(api_key="AIzaSyCS08G9sKCceHkCHZdpBVV9fS_RoQSjPyM")

print("Testing with available models...")

# Use the models that are actually available
available_models = [
    "models/gemini-2.0-flash",  # Fast and efficient
    "models/gemini-2.0-flash-001",
    "models/gemini-pro-latest",  # Latest stable version
    "models/gemini-2.5-flash",   # Newest model
]

for model_name in available_models:
    try:
        print(f"Testing: {model_name}")
        model = genai.GenerativeModel(model_name)
        response = model.generate_content("Say 'Medical RAG System Active' in one sentence.")
        print(f"SUCCESS with {model_name}!")
        print(f"Response: {response.text}")
        WORKING_MODEL = model_name
        break
    except Exception as e:
        print(f"Failed with {model_name}: {str(e)[:100]}...")

print(f"\n Working model: {WORKING_MODEL}")

Testing with available models...
Testing: models/gemini-2.0-flash
SUCCESS with models/gemini-2.0-flash!
Response: Medical RAG System Active.


 Working model: models/gemini-2.0-flash


In [4]:
import google.generativeai as genai

genai.configure(api_key="AIzaSyCS08G9sKCceHkCHZdpBVV9fS_RoQSjPyM")

# Use the working model (gemini-2.0-flash is usually reliable and fast)
MODEL_NAME = "models/gemini-2.0-flash"  # or use WORKING_MODEL from above

def query_gemini(prompt, context=""):
    """Function to query Gemini with optional context"""
    try:
        model = genai.GenerativeModel(MODEL_NAME)

        if context:
            full_prompt = f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer based on the context above:"
        else:
            full_prompt = prompt

        response = model.generate_content(full_prompt)
        return response.text
    except Exception as e:
        return f"Error: {str(e)}"

# Test the function
print("Testing our Gemini query function...")
test_response = query_gemini("What is the purpose of a RAG system?")
print(f"Test successful!")
print(f"Response: {test_response}")
print(f"Model: {MODEL_NAME}")

Testing our Gemini query function...
Test successful!
Response: The purpose of a Retrieval-Augmented Generation (RAG) system is to enhance the performance of large language models (LLMs) by providing them with access to external knowledge sources during the generation process.  Instead of relying solely on the information they were trained on, RAG systems can dynamically retrieve relevant information from a knowledge base and incorporate it into their responses.

Here's a breakdown of the key purposes:

*   **Improved Accuracy and Factual Grounding:**  LLMs are trained on vast datasets but can still hallucinate or generate incorrect information. RAG reduces this issue by grounding the LLM's responses in verifiable evidence from the retrieved knowledge base. This leads to more accurate and reliable answers.

*   **Access to Up-to-Date Information:**  LLMs have a knowledge cutoff date, meaning they are unaware of events or information that occurred after their training. RAG allows LLMs t

In [9]:
from google.colab import files
import zipfile, io, os, pandas as pd

uploaded = files.upload()  # Click the Choose Files button and upload mtsamples.csv.zip

# find the uploaded zip filename
zip_name = None
for fn in uploaded.keys():
    if fn.lower().endswith(".zip"):
        zip_name = fn
        break

if not zip_name:
    raise FileNotFoundError("No ZIP file uploaded. Upload the mtsamples .zip via the file chooser.")

extract_path = "medical_rag/data/"
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_name, 'r') as z:
    z.extractall(extract_path)

print("Extracted files:", os.listdir(extract_path))
csvs = [f for f in os.listdir(extract_path) if f.endswith(".csv")]
if csvs:
    df = pd.read_csv(os.path.join(extract_path, csvs[0]))
    print("Loaded:", csvs[0], "shape:", df.shape)
else:
    print("No CSV found after extraction.")


Saving mtsamples.csv.zip to mtsamples.csv (1).zip
Extracted files: ['mtsamples.csv']
Loaded: mtsamples.csv shape: (4999, 6)


In [15]:
import pandas as pd

csv_path = "/content/medical_rag/data/mtsamples.csv"
medical_df = pd.read_csv(csv_path)

print("CSV Loaded Successfully!")
print("Shape:", medical_df.shape)


CSV Loaded Successfully!
Shape: (4999, 6)


In [16]:
print(" EXPLORING MEDICAL DATASET")
print("=" * 50)

# Shape
print(f" Dataset shape: {medical_df.shape}")

# Columns
print(f" Columns: {list(medical_df.columns)}")

# First 3 rows
print("\n First 3 rows:")
print(medical_df.head(3))

# Dataset info
print("\n Dataset info:")
medical_df.info()   # <-- FIXED

# Basic statistics
print("\n Basic statistics:")
print(medical_df.describe(include='all'))

# Missing values
print("\n Missing values:")
print(medical_df.isnull().sum())

# Unique values in categorical columns
print("\n Unique values in categorical columns:")
for col in medical_df.columns:
    if medical_df[col].dtype == 'object':
        print(f"{col}: {medical_df[col].nunique()} unique values")
        if medical_df[col].nunique() < 10:
            print(f"   Values: {medical_df[col].unique()}")


 EXPLORING MEDICAL DATASET
 Dataset shape: (4999, 6)
 Columns: ['Unnamed: 0', 'description', 'medical_specialty', 'sample_name', 'transcription', 'keywords']

 First 3 rows:
   Unnamed: 0                                        description  \
0           0   A 23-year-old white female presents with comp...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   

       medical_specialty                                sample_name  \
0   Allergy / Immunology                         Allergic Rhinitis    
1             Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2             Bariatrics   Laparoscopic Gastric Bypass Consult - 1    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL HISTORY:, He has difficulty climb...   
2  HISTORY OF PRESENT ILLNESS: , I have seen ABC ...   

                                           

In [14]:
!ls -R /content


/content:
 medical_rag		 'mtsamples.csv (2).zip'   sample_data
'mtsamples.csv (1).zip'   mtsamples.csv.zip

/content/medical_rag:
data

/content/medical_rag/data:
mtsamples.csv

/content/sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md


In [22]:
print(" PREPROCESSING MEDICAL DATA")
print("=" * 50)

# Clean the data
def preprocess_medical_data(df):
    # Create a copy
    processed_df = df.copy()

    # 1. Handle missing values
    print("1. Handling missing values...")
    initial_rows = len(processed_df)
    processed_df = processed_df.dropna()
    print(f"   Removed {initial_rows - len(processed_df)} rows with missing values")

    # 2. Basic text cleaning
    print("2. Cleaning text data...")
    text_columns = []
    for col in processed_df.columns:
        if processed_df[col].dtype == 'object':
            text_columns.append(col)
            processed_df[col] = processed_df[col].astype(str).str.strip()
            processed_df[col] = processed_df[col].fillna('')

    print(f"   Text columns identified: {text_columns}")

    # 3. Add metadata
    print("3. Adding metadata...")
    main_text_col = text_columns[0]       # Use the first text column (usually 'transcription')
    processed_df['text_length'] = processed_df[main_text_col].str.len()
    processed_df['word_count'] = processed_df[main_text_col].str.split().str.len()

    return processed_df, text_columns

# Apply preprocessing
medical_clean, text_cols = preprocess_medical_data(medical_df)

print("\n Preprocessing complete!")
print(" Clean dataset shape:", medical_clean.shape)
print(" Text columns:", text_cols)

# Show sample of cleaned data
print(f"\n Sample o)f cleaned data:")
print(medical_clean.head())


 PREPROCESSING MEDICAL DATA
1. Handling missing values...
   Removed 1101 rows with missing values
2. Cleaning text data...
   Text columns identified: ['description', 'medical_specialty', 'sample_name', 'transcription', 'keywords']
3. Adding metadata...

 Preprocessing complete!
 Clean dataset shape: (3898, 8)
 Text columns: ['description', 'medical_specialty', 'sample_name', 'transcription', 'keywords']

 Sample o)f cleaned data:
   Unnamed: 0                                        description  \
0           0  A 23-year-old white female presents with compl...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   
3           3                               2-D M-Mode. Doppler.   
4           4                                 2-D Echocardiogram   

            medical_specialty                              sample_name  \
0        Allergy / Immunology                        Allergic Rhinitis   
1      

In [23]:
print(" IDENTIFYING MEDICAL CONTENT COLUMN")
print("=" * 50)

# Examine each text column
for col in text_cols:
    if len(medical_clean) == 0:
        sample_text = ""
        avg_len = 0
        avg_words = 0
    else:
        sample_text = medical_clean[col].iloc[0]
        avg_len = medical_clean[col].str.len().mean()
        avg_words = medical_clean[col].str.split().str.len().mean()

    print(f"\n Column: {col}")
    print(f"   Sample text: {str(sample_text)[:200]}...")
    print(f"   Average length: {avg_len:.0f} characters")
    print(f"   Average words: {avg_words:.0f} words")

# Let the user select the column for RAG
# Usually: 'transcription', 'text', 'description', etc.
preferred_column =_


 IDENTIFYING MEDICAL CONTENT COLUMN

 Column: description
   Sample text: A 23-year-old white female presents with complaint of allergies....
   Average length: 129 characters
   Average words: 18 words

 Column: medical_specialty
   Sample text: Allergy / Immunology...
   Average length: 14 characters
   Average words: 2 words

 Column: sample_name
   Sample text: Allergic Rhinitis...
   Average length: 25 characters
   Average words: 4 words

 Column: transcription
   Sample text: SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried ...
   Average length: 2673 characters
   Average words: 409 words

 Column: keywords
   Sample text: allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic,...
   Average length: 224 characters
   Average words: 26 words


In [24]:
print("💾 SAVING PROCESSED DATA")
print("=" * 50)

# Save cleaned dataset
processed_file = "medical_rag/data/medical_data_processed.csv"
medical_clean.to_csv(processed_file, index=False)
print(f" Saved processed data to: {processed_file}")

# Determine main content column safely
if 'content_column' not in globals():
    # Fallback if Step 7 wasn't run
    content_column = 'transcription' if 'transcription' in medical_clean.columns else text_cols[0]

# Summary
print("\n Final dataset info:")
print(f"   Rows: {len(medical_clean)}")
print(f"   Columns: {len(medical_clean.columns)}")
print(f"   Main content column: '{content_column}'")
if content_column in medical_clean.columns:
    print(f"   Average text length: {medical_clean[content_column].str.len().mean():.0f} chars")
else:
    print("   Average text length: N/A")

if 'medical_specialty' in medical_clean.columns:
    print(f"   Unique medical specialties: {medical_clean['medical_specialty'].nunique()}")
else:
    print("   Medical specialties column not found (N/A)")


💾 SAVING PROCESSED DATA
 Saved processed data to: medical_rag/data/medical_data_processed.csv

 Final dataset info:
   Rows: 3898
   Columns: 8
   Main content column: 'transcription'
   Average text length: 2673 chars
   Unique medical specialties: 39


In [25]:
# Install the specific LangChain components we need
print(" Installing required LangChain modules...")

!pip install -q "langchain>=0.1.0" "langchain-community>=0.0.10" "sentence-transformers" "faiss-cpu"

print("**Installation complete!**")

 Installing required LangChain modules...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0m**Installation complete!**


In [27]:
print(" OPTIMIZED PROCESSING - USING BATCHES")
print("=" * 50)

import pandas as pd
import os
import gc

# Load processed dataset
processed_file = "medical_rag/data/medical_data_processed.csv"

# Check available columns first
temp_df = pd.read_csv(processed_file, nrows=1)
available_columns = list(temp_df.columns)
print(f"Available columns in CSV: {available_columns}")

# Select only columns that exist
essential_columns = ['transcription', 'medical_specialty', 'description', 'sample_name', 'keywords']
essential_columns = [col for col in essential_columns if col in available_columns]

medical_clean = pd.read_csv(processed_file, usecols=essential_columns)
print(f"Reloaded medical data (essential columns only): {medical_clean.shape}")

# Drop rows with missing transcription
if 'transcription' not in medical_clean.columns:
    raise ValueError(" 'transcription' column not found in the dataset!")

initial_count = len(medical_clean)
medical_clean = medical_clean.dropna(subset=['transcription'])
print(f"Removed {initial_count - len(medical_clean)} rows with missing transcriptions")
print(f"Processing {len(medical_clean)} records")

# --- Simple text splitter ---
class SimpleTextSplitter:
    def __init__(self, chunk_size=800, chunk_overlap=150):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text):
        if len(text) <= self.chunk_size:
            return [text]

        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            if end >= len(text):
                chunks.append(text[start:])
                break

            # Look for sentence / newline / space breaks
            break_point = text.rfind('. ', start, end)
            if break_point == -1:
                break_point = text.rfind('\n', start, end)
            if break_point == -1:
                break_point = text.rfind(' ', start, end)
            if break_point != -1:
                end = break_point + 1

            chunks.append(text[start:end])
            start = max(start + 1, end - self.chunk_overlap)
        return chunks

# --- Batch processing ---
def process_batches(dataframe, batch_size=500):
    all_chunks = []
    chunk_metadata = []

    total_batches = (len(dataframe) + batch_size - 1) // batch_size
    text_splitter = SimpleTextSplitter(chunk_size=800, chunk_overlap=150)

    for batch_num in range(total_batches):
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(dataframe))
        batch = dataframe.iloc[start_idx:end_idx]

        print(f"Processing batch {batch_num + 1}/{total_batches} (rows {start_idx}-{end_idx})")

        for idx, row in batch.iterrows():
            transcription = str(row['transcription']).strip()
            if len(transcription) > 100:  # Only substantial text
                chunks = text_splitter.split_text(transcription)
                for chunk_idx, chunk in enumerate(chunks):
                    metadata = {
                        'chunk_id': f"{idx}_{chunk_idx}",
                        'original_index': idx,
                        'chunk_index': chunk_idx,
                        'chunk_length': len(chunk),
                        'word_count': len(chunk.split())
                    }
                    # Add optional metadata if columns exist
                    for col in ['medical_specialty', 'description', 'sample_name', 'keywords']:
                        if col in row:
                            metadata[col] = row[col]
                    all_chunks.append(chunk)
                    chunk_metadata.append(metadata)

        # Memory cleanup
        del batch
        if batch_num % 2 == 0:
            gc.collect()

    return all_chunks, chunk_metadata

# --- Run batch processing ---
print("\nSPLITTING MEDICAL TEXT INTO CHUNKS (BATCH PROCESSING)")
print("=" * 50)

all_chunks, chunk_metadata = process_batches(medical_clean, batch_size=500)
print(f"\nCreated {len(all_chunks)} chunks from {len(medical_clean)} records")
print(f"Average chunks per record: {len(all_chunks)/len(medical_clean):.1f}")

# --- Chunk statistics ---
chunk_df = pd.DataFrame(chunk_metadata)
print("\nChunk Statistics:")
print(f"Average chunk length: {chunk_df['chunk_length'].mean():.0f} characters")
print(f"Average word count: {chunk_df['word_count'].mean():.0f} words")
if 'medical_specialty' in chunk_df.columns:
    print(f"Medical specialties covered: {chunk_df['medical_specialty'].nunique()}")

# --- Sample chunks ---
print("\nSample chunks:")
for i in range(min(2, len(all_chunks))):
    print(f"\n--- Chunk {i+1} ---")
    if 'medical_specialty' in chunk_metadata[i]:
        print(f"Specialty: {chunk_metadata[i]['medical_specialty']}")
    print(f"Length: {chunk_metadata[i]['chunk_length']} chars")
    print(f"Text: {all_chunks[i][:150]}...")

# Free memory
del medical_clean
gc.collect()

print("\n Memory optimized - ready for next steps!")


 OPTIMIZED PROCESSING - USING BATCHES
Available columns in CSV: ['Unnamed: 0', 'description', 'medical_specialty', 'sample_name', 'transcription', 'keywords', 'text_length', 'word_count']
Reloaded medical data (essential columns only): (3898, 5)
Removed 0 rows with missing transcriptions
Processing 3898 records

SPLITTING MEDICAL TEXT INTO CHUNKS (BATCH PROCESSING)
Processing batch 1/8 (rows 0-500)
Processing batch 2/8 (rows 500-1000)
Processing batch 3/8 (rows 1000-1500)
Processing batch 4/8 (rows 1500-2000)
Processing batch 5/8 (rows 2000-2500)
Processing batch 6/8 (rows 2500-3000)
Processing batch 7/8 (rows 3000-3500)
Processing batch 8/8 (rows 3500-3898)

Created 29713 chunks from 3898 records
Average chunks per record: 7.6

Chunk Statistics:
Average chunk length: 455 characters
Average word count: 70 words
Medical specialties covered: 39

Sample chunks:

--- Chunk 1 ---
Specialty: Allergy / Immunology
Length: 504 chars
Text: SUBJECTIVE:,  This 23-year-old white female presents wit

In [28]:
# Install FAISS and other required dependencies
print(" Installing FAISS and dependencies...")

!pip install -q faiss-cpu sentence-transformers

print(" Installation complete!")

 Installing FAISS and dependencies...
 Installation complete!


In [29]:
print("CREATING EMBEDDINGS AND VECTOR STORE")
print("=" * 50)

import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import gc

print("Loading embedding model...")

# Lightweight embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Test embedding model
print("Testing embedding model...")
test_embedding = embedding_model.encode(["Test medical embedding"], convert_to_numpy=True)
print(f"Embedding model working - vector dimension: {test_embedding.shape[1]}")

print(f"\n Creating embeddings for {len(all_chunks)} chunks...")

# --- Create embeddings in batches ---
def create_embeddings_batch(texts, model, batch_size=500):
    all_embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size

    for i in range(total_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(texts))
        batch_texts = texts[start_idx:end_idx]

        print(f"   Processing batch {i+1}/{total_batches} ({start_idx}-{end_idx})")
        batch_embeddings = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
        all_embeddings.append(batch_embeddings.astype('float32'))  # Ensure float32 for FAISS

        del batch_texts, batch_embeddings
        if i % 2 == 0:
            gc.collect()

    return np.vstack(all_embeddings)

# Create embeddings
chunk_embeddings = create_embeddings_batch(all_chunks, embedding_model, batch_size=500)
print(f"Created embeddings: {chunk_embeddings.shape}")

# --- Create FAISS index ---
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product (cosine similarity after normalization)

# Normalize vectors for cosine similarity
faiss.normalize_L2(chunk_embeddings)
index.add(chunk_embeddings)

print("FAISS index created successfully!")
print(f"Index contains {index.ntotal} vectors of dimension {index.d}")


🔤 CREATING EMBEDDINGS AND VECTOR STORE
Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Testing embedding model...
Embedding model working - vector dimension: 384

 Creating embeddings for 29713 chunks...
   Processing batch 1/60 (0-500)
   Processing batch 2/60 (500-1000)
   Processing batch 3/60 (1000-1500)
   Processing batch 4/60 (1500-2000)
   Processing batch 5/60 (2000-2500)
   Processing batch 6/60 (2500-3000)
   Processing batch 7/60 (3000-3500)
   Processing batch 8/60 (3500-4000)
   Processing batch 9/60 (4000-4500)
   Processing batch 10/60 (4500-5000)
   Processing batch 11/60 (5000-5500)
   Processing batch 12/60 (5500-6000)
   Processing batch 13/60 (6000-6500)
   Processing batch 14/60 (6500-7000)
   Processing batch 15/60 (7000-7500)
   Processing batch 16/60 (7500-8000)
   Processing batch 17/60 (8000-8500)
   Processing batch 18/60 (8500-9000)
   Processing batch 19/60 (9000-9500)
   Processing batch 20/60 (9500-10000)
   Processing batch 21/60 (10000-10500)
   Processing batch 22/60 (10500-11000)
   Processing batch 23/60 (11000-11500)
   Processing ba

In [30]:
print("SAVING VECTOR STORE")
print("=" * 50)

import os
import pickle
import pandas as pd
import gc

# Create directory if not exists
os.makedirs("medical_rag/vector_store", exist_ok=True)

# Save FAISS index
faiss_index_path = "medical_rag/vector_store/medical_faiss.index"
faiss.write_index(index, faiss_index_path)
print(f"FAISS index saved to: {faiss_index_path}")

# Save metadata and chunks (essential info)
medical_specialties = list(set(m.get('medical_specialty', 'N/A') for m in chunk_metadata))

vector_store_data = {
    'chunks': all_chunks,
    'metadata': chunk_metadata,
    'embedding_model_name': 'sentence-transformers/all-MiniLM-L6-v2',
    'total_chunks': len(all_chunks),
    'medical_specialties': medical_specialties
}

vector_metadata_path = "medical_rag/vector_store/vector_metadata.pkl"
with open(vector_metadata_path, "wb") as f:
    pickle.dump(vector_store_data, f)
print(f"Vector store metadata saved to: {vector_metadata_path}")

# Save chunk metadata CSV for reference
chunk_info_df = pd.DataFrame(chunk_metadata)
chunk_info_path = "medical_rag/data/chunk_metadata.csv"
chunk_info_df.to_csv(chunk_info_path, index=False)
print(f"Chunk metadata saved to: {chunk_info_path}")

# Free memory
del chunk_embeddings, all_chunks, chunk_metadata, chunk_info_df
gc.collect()

# Summary
print(f"\n VECTOR STORE SUMMARY:")
print(f"   Total chunks: {vector_store_data['total_chunks']}")
print(f"   Vector dimension: {dimension}")
print(f"   Medical specialties: {len(vector_store_data['medical_specialties'])}")
print(f"   Saved location: medical_rag/vector_store/")


SAVING VECTOR STORE
FAISS index saved to: medical_rag/vector_store/medical_faiss.index
Vector store metadata saved to: medical_rag/vector_store/vector_metadata.pkl
Chunk metadata saved to: medical_rag/data/chunk_metadata.csv

 VECTOR STORE SUMMARY:
   Total chunks: 29713
   Vector dimension: 384
   Medical specialties: 39
   Saved location: medical_rag/vector_store/


In [31]:
print("CREATING MEDICAL RAG RETRIEVAL SYSTEM")
print("=" * 50)

import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pickle

class MedicalRAGSystem:
    def __init__(self, vector_store_path="medical_rag/vector_store"):
        self.vector_store_path = vector_store_path
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.load_vector_store()

    def load_vector_store(self):
        """Load the FAISS index and metadata"""
        try:
            self.index = faiss.read_index(f"{self.vector_store_path}/medical_faiss.index")
            with open(f"{self.vector_store_path}/vector_metadata.pkl", "rb") as f:
                data = pickle.load(f)
            self.chunks = data.get('chunks', [])
            self.metadata = data.get('metadata', [])
            specialties = set(m.get('medical_specialty', 'N/A') for m in self.metadata)
            print("Vector store loaded successfully!")
            print(f"   Available chunks: {len(self.chunks)}")
            print(f"   Medical specialties: {len(specialties)}")
        except Exception as e:
            print(f"Error loading vector store: {e}")
            raise

    def retrieve_similar_chunks(self, query, k=5, specialty_filter=None):
        """Retrieve similar medical chunks for a query"""
        # Encode query
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True).astype('float32')
        faiss.normalize_L2(query_embedding)

        # Search (get extra for potential filtering)
        scores, indices = self.index.search(query_embedding, k*3)

        results = []
        seen_chunks = set()

        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.chunks) and idx not in seen_chunks:
                metadata = self.metadata[idx] if idx < len(self.metadata) else {}
                chunk_data = {
                    'content': self.chunks[idx],
                    'metadata': metadata,
                    'similarity_score': float(score)
                }

                # Apply specialty filter if provided
                specialty_name = metadata.get('medical_specialty', '')
                if specialty_filter:
                    if specialty_filter.lower() in specialty_name.lower():
                        results.append(chunk_data)
                        seen_chunks.add(idx)
                else:
                    results.append(chunk_data)
                    seen_chunks.add(idx)

                if len(results) >= k:
                    break

        return results[:k]

# --- Initialize RAG system ---
print(" Initializing Medical RAG System...")
medical_rag = MedicalRAGSystem()

# --- Test the retrieval ---
print("\n TESTING RAG RETRIEVAL")
print("=" * 30)

test_queries = [
    "What are the symptoms of allergic rhinitis?",
    "How is asthma treated?",
    "What are common allergy medications?"
]

for query in test_queries:
    print(f"\n Query: '{query}'")
    results = medical_rag.retrieve_similar_chunks(query, k=2)
    print(f"Found {len(results)} relevant chunks:")

    for i, result in enumerate(results):
        specialty = result['metadata'].get('medical_specialty', 'N/A')
        score = result['similarity_score']
        preview = result['content'][:80].replace('\n', ' ')
        print(f"   {i+1}. {specialty} (score: {score:.3f})")
        print(f"      {preview}...")


CREATING MEDICAL RAG RETRIEVAL SYSTEM
 Initializing Medical RAG System...
Vector store loaded successfully!
   Available chunks: 29713
   Medical specialties: 39

 TESTING RAG RETRIEVAL

 Query: 'What are the symptoms of allergic rhinitis?'
Found 2 relevant chunks:
   1. General Medicine (score: 0.561)
      mately 5 to 5:30 p.m.  He is involved in training purpose to how to sell managed...
   2. Sleep Medicine (score: 0.561)
      mately 5 to 5:30 p.m.  He is involved in training purpose to how to sell managed...

 Query: 'How is asthma treated?'
Found 2 relevant chunks:
   1. Pediatrics - Neonatal (score: 0.496)
      e is no smoke exposure there is a significant family history with both Abc's fat...
   2. Letters (score: 0.496)
      e is no smoke exposure there is a significant family history with both Abc's fat...

 Query: 'What are common allergy medications?'
Found 2 relevant chunks:
   1. SOAP / Chart / Progress Notes (score: 0.637)
      SUBJECTIVE:,  This 23-year-old white fe

In [35]:
import os
genai.configure(api_key=os.environ.get("AIzaSyCS08G9sKCceHkCHZdpBVV9fS_RoQSjPyM"))



In [36]:
print("INTEGRATING GEMINI FOR ANSWER GENERATION")
print("=" * 50)

import google.generativeai as genai
import os

# Configure Gemini (use environment variable for API key)
genai.configure(api_key=os.environ.get("GENIE_API_KEY"))
MODEL_NAME = "models/gemini-2.0-flash"

def generate_medical_answer(query, context_chunks):
    """Generate answer using Gemini with retrieved context"""
    if not context_chunks:
        return "I couldn't find relevant medical information to answer this question."

    # Prepare context from retrieved chunks
    context_text = "\n\n".join([
        f"--- SOURCE {i+1} (Medical Specialty: {chunk['metadata'].get('medical_specialty', 'N/A')}) ---\n{chunk['content']}"
        for i, chunk in enumerate(context_chunks)
    ])

    prompt = f"""You are a helpful medical assistant. Based ONLY on the following medical context from clinical notes, provide a accurate and helpful answer to the user's question.

MEDICAL CONTEXT:
{context_text}

USER QUESTION: {query}

IMPORTANT INSTRUCTIONS:
- Answer based ONLY on the provided medical context
- If the context doesn't contain relevant information, say "I cannot find specific information about this in the available medical records"
- Be precise and medically accurate
- Do not make up or hallucinate information
- Mention which medical specialty the information comes from when relevant

ANSWER:"""

    try:
        model = genai.GenerativeModel(MODEL_NAME)
        response = model.generate_content(prompt)
        # Safely get text output
        return getattr(response, 'text', getattr(response, 'output_text', 'No text returned'))
    except Exception as e:
        return f"Error generating answer: {str(e)}"

# --- Test complete RAG pipeline ---
print(" TESTING COMPLETE RAG PIPELINE")
print("=" * 30)

test_queries = [
    "What treatments are available for allergies?",
    "What are the symptoms of asthma?",
    "How is allergic rhinitis diagnosed?"
]

for query in test_queries:
    print(f"\nQUERY: {query}")
    print("-" * 50)

    # Step 1: Retrieve relevant chunks
    retrieved_chunks = medical_rag.retrieve_similar_chunks(query, k=3)
    print(f"Retrieved {len(retrieved_chunks)} medical chunks")

    # Step 2: Generate answer
    answer = generate_medical_answer(query, retrieved_chunks)

    print(f" ANSWER:")
    print(answer)
    print("-" * 50)

print(f"\n MEDICAL RAG SYSTEM COMPLETE!")
print("✅ Text chunking → ✅ Vector store → ✅ Retrieval → ✅ Answer generation")
print(f"System contains {len(medical_rag.chunks)} medical chunks across {len(set(m.get('medical_specialty', 'N/A') for m in medical_rag.metadata))} specialties")


INTEGRATING GEMINI FOR ANSWER GENERATION
 TESTING COMPLETE RAG PIPELINE

QUERY: What treatments are available for allergies?
--------------------------------------------------
Retrieved 3 medical chunks
 ANSWER:
Error generating answer: 
  No API_KEY or ADC found. Please either:
    - Set the `GOOGLE_API_KEY` environment variable.
    - Manually pass the key with `genai.configure(api_key=my_api_key)`.
    - Or set up Application Default Credentials, see https://ai.google.dev/gemini-api/docs/oauth for more information.
--------------------------------------------------

QUERY: What are the symptoms of asthma?
--------------------------------------------------
Retrieved 3 medical chunks
 ANSWER:
Error generating answer: 
  No API_KEY or ADC found. Please either:
    - Set the `GOOGLE_API_KEY` environment variable.
    - Manually pass the key with `genai.configure(api_key=my_api_key)`.
    - Or set up Application Default Credentials, see https://ai.google.dev/gemini-api/docs/oauth for mor

In [37]:
# Step 1: Create the basic file structure
print("CREATING CORRECT FILE STRUCTURE")
print("=" * 50)

import os

# Create directories
os.makedirs(".streamlit", exist_ok=True)
os.makedirs("medical_rag/vector_store", exist_ok=True)

print("✅ Directories created")

CREATING CORRECT FILE STRUCTURE
✅ Directories created


In [38]:
import os

# Ensure .streamlit folder exists
os.makedirs(".streamlit", exist_ok=True)

# Config content
config_content = """[server]
headless = true
address = "0.0.0.0"
port = 8501

[browser]
gatherUsageStats = false

[theme]
primaryColor = "#1f77b4"
backgroundColor = "#ffffff"
secondaryBackgroundColor = "#f0f2f6"
textColor = "#262730"
font = "sans serif"
"""

# Write to config.toml
with open(".streamlit/config.toml", "w") as f:
    f.write(config_content)

print("✅ Created: .streamlit/config.toml")


✅ Created: .streamlit/config.toml


In [39]:
# Step 3: Create requirements.txt
requirements_content = """streamlit==1.28.0
google-generativeai==0.3.2
sentence-transformers==2.2.2
faiss-cpu==1.7.4
pandas==2.0.3
numpy==1.24.3
python-dotenv==1.0.0
"""

with open("requirements.txt", "w") as f:
    f.write(requirements_content)
print("✅ Created: requirements.txt")

✅ Created: requirements.txt


In [40]:
# Step 4: Create medical_rag_system.py
rag_system_code = """import faiss
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
import os

class MedicalRAGSystem:
    def __init__(self, vector_store_path="medical_rag/vector_store"):
        self.vector_store_path = vector_store_path
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.load_vector_store()

    def load_vector_store(self):
        try:
            possible_paths = [
                self.vector_store_path,
                "medical_rag/vector_store",
                "./medical_rag/vector_store"
            ]

            loaded = False
            for path in possible_paths:
                try:
                    index_path = f"{path}/medical_faiss.index"
                    metadata_path = f"{path}/vector_metadata.pkl"

                    if os.path.exists(index_path) and os.path.exists(metadata_path):
                        self.index = faiss.read_index(index_path)
                        with open(metadata_path, "rb") as f:
                            data = pickle.load(f)
                        self.chunks = data['chunks']
                        self.metadata = data['metadata']
                        print(f"✅ Vector store loaded from: {path}")
                        loaded = True
                        break
                except Exception as e:
                    continue

            if not loaded:
                raise Exception("Could not load vector store from any path")

        except Exception as e:
            raise Exception(f"Error loading vector store: {str(e)}")

    def retrieve_similar_chunks(self, query, k=5):
        try:
            query_embedding = self.embedding_model.encode([query])
            faiss.normalize_L2(query_embedding)

            scores, indices = self.index.search(query_embedding, k*3)

            results = []
            seen_chunks = set()

            for score, idx in zip(scores[0], indices[0]):
                if idx < len(self.chunks) and idx not in seen_chunks:
                    chunk_data = {
                        'content': self.chunks[idx],
                        'metadata': self.metadata[idx],
                        'similarity_score': float(score)
                    }
                    results.append(chunk_data)
                    seen_chunks.add(idx)

                    if len(results) >= k:
                        break

            return results[:k]

        except Exception as e:
            print(f"Error in retrieval: {e}")
            return []
"""

with open("medical_rag_system.py", "w") as f:
    f.write(rag_system_code)
print("✅ Created: medical_rag_system.py")

✅ Created: medical_rag_system.py


In [41]:
# Step 5: Create app.py (main Streamlit app)
app_code = '''import streamlit as st
import google.generativeai as genai
import faiss
import pickle
from sentence_transformers import SentenceTransformer
import pandas as pd
import os
import sys

st.set_page_config(
    page_title="Medical RAG Assistant",
    page_icon="🏥",
    layout="wide"
)

st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .info-box {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
    .source-box {
        background-color: #e8f4fd;
        padding: 0.5rem;
        border-radius: 0.3rem;
        margin: 0.5rem 0;
        border-left: 4px solid #1f77b4;
    }
</style>
""", unsafe_allow_html=True)

def initialize_rag_system():
    try:
        from medical_rag_system import MedicalRAGSystem
        rag_system = MedicalRAGSystem()
        return rag_system, None
    except Exception as e:
        return None, f"Error initializing RAG system: {str(e)}"

def generate_medical_answer(query, context_chunks, api_key):
    if not context_chunks:
        return "I couldn\'t find relevant medical information to answer this question in the available records."

    context_text = "\\n\\n".join([
        f"--- MEDICAL NOTE {i+1} (Specialty: {chunk[\'metadata\'][\'medical_specialty\']}) ---\\n{chunk[\'content\']}"
        for i, chunk in enumerate(context_chunks)
    ])

    prompt = f"""You are a medical research assistant. Answer the question based ONLY on the provided medical context from clinical notes.

MEDICAL CONTEXT:
{context_text}

QUESTION: {query}

IMPORTANT INSTRUCTIONS:
- Answer using ONLY the information from the medical context above
- If the context doesn\'t contain relevant information, say "I cannot find specific information about this in the available medical records"
- Be precise and medically accurate
- Do not make up or hallucinate information
- Mention which medical specialty the information comes from when relevant
- Keep answers concise but informative

ANSWER:"""

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("models/gemini-2.0-flash")
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating answer: {str(e)}"

st.markdown(\'<div class="main-header">🏥 Medical RAG Assistant</div>\', unsafe_allow_html=True)
st.markdown("**Ask medical questions based on 3,898 clinical transcriptions across 39 medical specialties**")

with st.sidebar:
    st.header("⚙️ Configuration")

    api_key = st.text_input(
        "Google AI Studio API Key",
        type="password",
        help="Get free API key from https://aistudio.google.com/"
    )

    st.markdown(\'<div class="info-box">\', unsafe_allow_html=True)
    st.write("**How to get API Key:**")
    st.write("1. Go to [Google AI Studio](https://aistudio.google.com/)")
    st.write("2. Sign in with Google account")
    st.write("3. Click \'Get API Key\' and create new key")
    st.write("4. Paste the key here")
    st.markdown(\'</div>\', unsafe_allow_html=True)

    if st.button("🚀 Initialize Medical RAG System", use_container_width=True):
        if not api_key:
            st.error("Please enter your Google AI Studio API key first")
        else:
            with st.spinner("Loading medical database..."):
                rag_system, error = initialize_rag_system()
                if rag_system:
                    st.session_state.rag_system = rag_system
                    st.session_state.api_key = api_key
                    st.success("✅ Medical RAG System Ready!")
                    st.write(f"• Medical chunks: {len(rag_system.chunks):,}")
                    st.write(f"• Specialties: {len(set(m[\'medical_specialty\'] for m in rag_system.metadata))}")
                    st.write(f"• Vector dimension: {rag_system.index.d}")
                else:
                    st.error(f"❌ {error}")

if \'rag_system\' not in st.session_state:
    st.session_state.rag_system = None
if \'history\' not in st.session_state:
    st.session_state.history = []

if st.session_state.rag_system:
    st.header("💬 Medical Question & Answer")

    query = st.text_input(
        "Ask your medical question:",
        placeholder="e.g., What are common treatments for allergies? What symptoms indicate asthma?",
        key="query_input"
    )

    col1, col2 = st.columns([1, 4])
    with col1:
        num_chunks = st.slider("Sources to retrieve", 1, 5, 3)

    if query and st.session_state.get(\'api_key\'):
        with st.spinner("🔍 Searching medical database..."):
            chunks = st.session_state.rag_system.retrieve_similar_chunks(query, k=num_chunks)
            answer = generate_medical_answer(query, chunks, st.session_state.api_key)

            st.session_state.history.append({
                \'query\': query,
                \'answer\': answer,
                \'chunks_used\': len(chunks),
                \'timestamp\': pd.Timestamp.now()
            })

        st.subheader("💡 Answer:")
        st.write(answer)

        with st.expander(f"📚 View Source Documents ({len(chunks)} found)"):
            for i, chunk in enumerate(chunks):
                st.markdown(\'<div class="source-box">\', unsafe_allow_html=True)
                st.write(f"**Source {i+1}** | **Specialty:** {chunk[\'metadata\'][\'medical_specialty\']} | **Similarity Score:** {chunk[\'similarity_score\']:.3f}")
                st.write(f"**Content:** {chunk[\'content\'][:400]}...")
                st.markdown(\'</div>\', unsafe_allow_html=True)

    if st.session_state.history:
        st.subheader("📜 Recent Questions")
        for i, item in enumerate(reversed(st.session_state.history[-3:])):
            st.write(f"**Q:** {item[\'query\']}")
            st.write(f"**A:** {item[\'answer\'][:200]}...")
            st.write(f"*Sources used: {item[\'chunks_used\']}*")
            st.divider()

else:
    st.info("👋 Welcome! Please enter your Google AI Studio API key and initialize the system in the sidebar to start asking medical questions.")

with st.expander("ℹ️ System Information"):
    st.write("""
    **Medical RAG System Overview:**

    - **Data Source:** 3,898 clinical medical transcription records
    - **Medical Content:** 29,713 processed text chunks
    - **Specialties Covered:** 39 different medical specialties
    - **Search Technology:** FAISS vector similarity search
    - **AI Model:** Google Gemini for answer generation
    - **Key Feature:** Provides source citations for transparency

    **How it works:**
    1. Your question is converted to a vector embedding
    2. System finds the most similar medical text chunks
    3. Gemini generates an answer using only the retrieved context
    4. Sources are provided for verification

    **Note:** This system provides information from medical records but is not a substitute for professional medical advice.
    """)

st.markdown("---")
st.markdown("*Built with Streamlit, FAISS, and Google Gemini • Medical RAG System*")
'''

with open("app.py", "w") as f:
    f.write(app_code)
print("✅ Created: app.py")

✅ Created: app.py


In [43]:
# Step 6: Create README.md (Fixed)
print("CREATING README.MD")
print("=" * 50)

readme_content = "# Medical RAG Assistant\n\n"
readme_content += "A Retrieval-Augmented Generation (RAG) system for medical question answering, built with Streamlit, FAISS, and Google Gemini.\n\n"
readme_content += "## Features\n\n"
readme_content += "- **29,713 medical text chunks** from 3,898 clinical transcriptions\n"
readme_content += "- **39 medical specialties** covered\n"
readme_content += "- **Semantic search** using FAISS vector database\n"
readme_content += "- **AI-powered answers** using Google Gemini\n"
readme_content += "- **Source citation** for transparency\n"
readme_content += "- **Web interface** with Streamlit\n\n"
readme_content += "##  Quick Start\n\n"
readme_content += "1. **Get API Key**: Free from [Google AI Studio](https://aistudio.google.com/)\n"
readme_content += "2. **Enter API Key**: In the app sidebar\n"
readme_content += "3. **Initialize System**: Click \"Initialize Medical RAG System\"\n"
readme_content += "4. **Ask Questions**: Type your medical questions\n\n"
readme_content += "##  Project Structure\n\n"
readme_content += "```\n"
readme_content += "medical-rag-assistant/\n"
readme_content += "├── app.py                          # Main Streamlit app\n"
readme_content += "├── medical_rag_system.py           # RAG system module\n"
readme_content += "├── requirements.txt                # Dependencies\n"
readme_content += "├── .streamlit/\n"
readme_content += "│   └── config.toml                 # Streamlit configuration\n"
readme_content += "└── medical_rag/\n"
readme_content += "    └── vector_store/               # Vector database\n"
readme_content += "        ├── medical_faiss.index\n"
readme_content += "        └── vector_metadata.pkl\n"
readme_content += "```\n\n"
readme_content += "##Medical Disclaimer\n\n"
readme_content += "This system provides information from medical records for educational purposes only. It is not a substitute for professional medical advice.\n"

with open("README.md", "w") as f:
    f.write(readme_content)

print("Created: README.md")
print("ALL FILES CREATED SUCCESSFULLY!")

CREATING README.MD
Created: README.md
ALL FILES CREATED SUCCESSFULLY!


In [44]:
# Step 7: Check created files
print("CHECKING CREATED FILES")
print("=" * 50)

import os

files_to_check = [
    "app.py",
    "medical_rag_system.py",
    "requirements.txt",
    "README.md",
    ".streamlit/config.toml"
]

print("Checking deployment files:")
for file in files_to_check:
    if os.path.exists(file):
        size = os.path.getsize(file) / 1024  # Size in KB
        print(f"{file} ({size:.1f} KB)")
    else:
        print(f"{file} - MISSING")

print("\nChecking vector store files:")
vector_files = [
    "medical_rag/vector_store/medical_faiss.index",
    "medical_rag/vector_store/vector_metadata.pkl"
]

for file in vector_files:
    if os.path.exists(file):
        size_mb = os.path.getsize(file) / (1024 * 1024)
        print(f"{file} ({size_mb:.1f} MB)")
    else:
        print(f"{file} - MISSING (Critical for app to work)")

print("\nFINAL FILE STRUCTURE:")
for root, dirs, files in os.walk("."):
    if '.git' in root or '__pycache__' in root:
        continue
    level = root.replace(".", "").count(os.sep)
    indent = " " * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    sub_indent = " " * 2 * (level + 1)
    for file in files:
        if any(file.endswith(ext) for ext in ['.py', '.txt', '.toml', '.md', '.index', '.pkl']):
            size_kb = os.path.getsize(os.path.join(root, file)) / 1024
            print(f"{sub_indent}{file} ({size_kb:.1f} KB)")

CHECKING CREATED FILES
Checking deployment files:
app.py (7.0 KB)
medical_rag_system.py (2.5 KB)
requirements.txt (0.1 KB)
README.md (1.3 KB)
.streamlit/config.toml (0.2 KB)

Checking vector store files:
medical_rag/vector_store/medical_faiss.index (43.5 MB)
medical_rag/vector_store/vector_metadata.pkl (16.6 MB)

FINAL FILE STRUCTURE:
./
  medical_rag_system.py (2.5 KB)
  app.py (7.0 KB)
  requirements.txt (0.1 KB)
  README.md (1.3 KB)
  .config/
    logs/
      2025.11.20/
    configurations/
  medical_rag/
    data/
    vector_store/
      medical_faiss.index (44569.5 KB)
      vector_metadata.pkl (16957.9 KB)
  .streamlit/
    config.toml (0.2 KB)
  sample_data/
    README.md (0.9 KB)


In [46]:
# Step 1: Create a zip file of all deployment files
print("CREATING DEPLOYMENT PACKAGE")
print("=" * 50)

import shutil

# Create a zip file with all necessary files
shutil.make_archive("medical_rag_assistant", 'zip', '.')

print("Created: medical_rag_assistant.zip")
print("Download this zip file to your computer:")
print("   Files → medical_rag_assistant.zip → Download")

CREATING DEPLOYMENT PACKAGE


OSError: [Errno 28] No space left on device

In [None]:
print(" VERIFYING FILE SYSTEM STATE BEFORE ZIPPING")
print("=" * 50)

import os

# Files expected at root
expected_root_files = [
    "app.py",
    "medical_rag_system.py",
    "requirements.txt",
    "README.md"
]

# Directory with config.toml
expected_streamlit_dir = ".streamlit"
expected_streamlit_config = ".streamlit/config.toml"

# Vector store directory
expected_vector_store_dir = "medical_rag/vector_store"

all_found = True

print("\n--- Checking root files ---")
for f in expected_root_files:
    if os.path.exists(f):
        print(f"✅ Found: {f}")
    else:
        print(f"❌ NOT Found: {f}")
        all_found = False

print("\n--- Checking .streamlit directory ---")
if os.path.isdir(expected_streamlit_dir):
    print(f"✅ Found directory: {expected_streamlit_dir}/")
    if os.path.exists(expected_streamlit_config):
        print(f"  ✅ Found config: {expected_streamlit_config}")
    else:
        print(f"  ❌ NOT Found config: {expected_streamlit_config}")
        all_found = False
else:
    print(f"❌ NOT Found directory: {expected_streamlit_dir}/")
    all_found = False

print("\n--- Checking medical_rag/vector_store directory ---")
if os.path.isdir(expected_vector_store_dir):
    print(f"✅ Found directory: {expected_vector_store_dir}/")
    vector_store_contents = os.listdir(expected_vector_store_dir)
    if vector_store_contents:
        for item in vector_store_contents:
            print(f"  ✅ Found item: {expected_vector_store_dir}/{item}")
    else:
        print(f"  ❌ Directory is empty: {expected_vector_store_dir}/")
        all_found = False
else:
    print(f"❌ NOT Found directory: {expected_vector_store_dir}/")
    all_found = False

if all_found:
    print("\n All essential files and directories appear to be present. Proceeding to re-zip.")
else:
    print("\n Some essential files/directories are missing. Please re-run previous steps to ensure they are created.")



In [None]:
import zipfile
import os

print("CREATING ESSENTIAL DEPLOYMENT PACKAGE (FIXED)")
print("=" * 50)

output_zip_filename = "medical_rag_assistant_essential.zip"

# List of individual files to include (these are expected at the root of the repo)
files_to_include = [
    "app.py",
    "medical_rag_system.py",
    "requirements.txt",
    "README.md",
]

# Specific handling for .streamlit/config.toml
streamlit_config_path = ".streamlit/config.toml"

# Directory to include recursively, preserving its full path relative to the root
directories_to_include = [
    "medical_rag/vector_store"
]

with zipfile.ZipFile(output_zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add individual files at the root level of the zip
    for file_path in files_to_include:
        if os.path.exists(file_path):
            zipf.write(file_path, file_path) # arcname is the same as file_path to keep it at root
            print(f"   Added file: {file_path}")
        else:
            print(f"   Warning: File not found and skipped: {file_path}. Please ensure it exists.")

    # Add .streamlit/config.toml, preserving its subdirectory structure
    if os.path.exists(streamlit_config_path):
        zipf.write(streamlit_config_path, streamlit_config_path)
        print(f"   Added file: {streamlit_config_path}")
    else:
        print(f"   Warning: File not found and skipped: {streamlit_config_path}. Please ensure it exists.")

    # Add contents of directories recursively, preserving their full paths
    for dir_path in directories_to_include:
        if os.path.isdir(dir_path):
            for root, _, files in os.walk(dir_path):
                for file in files:
                    full_file_path = os.path.join(root, file)
                    # The arcname should be the same as full_file_path to preserve the directory structure inside the zip
                    zipf.write(full_file_path, full_file_path)
                    print(f"   Added directory content: {full_file_path}")
        else:
            print(f"   Warning: Directory not found and skipped: {dir_path}. Please ensure it exists.")

print(f"✅ Created essential package: {output_zip_filename}")
print("Download this zip file to your computer:")
print(f"   Files → {output_zip_filename} → Download")


📦 CREATING ESSENTIAL DEPLOYMENT PACKAGE (FIXED)
✅ Created essential package: medical_rag_assistant_essential.zip
📁 Download this zip file to your computer:
   Files → medical_rag_assistant_essential.zip → Download


In [None]:
# Step 4: Streamlit Deployment Instructions
print(" STREAMLIT CLOUD DEPLOYMENT")
print("=" * 50)

deployment_steps = """
1. **GO TO STREAMLIT CLOUD:**
   - Visit: https://share.streamlit.io/
   - Sign in with your GitHub account

2. **CREATE NEW APP:**
   - Click "New app"
   - Repository: your-username/medical-rag-assistant
   - Branch: main
   - Main file path: app.py
   - Click "Deploy"

3. **WAIT FOR DEPLOYMENT:**
   - Initial deployment takes 2-5 minutes
   - Watch the logs for any errors
   - If successful, you'll get a URL like:
     https://medical-rag-assistant.streamlit.app/

4. **TEST YOUR APP:**
   - Open your app URL
   - In sidebar, enter Google AI Studio API key
   - Click "Initialize Medical RAG System"
   - Start asking medical questions!

 YOUR MEDICAL RAG ASSISTANT WILL BE LIVE!
"""

print(deployment_steps)

🌐 STREAMLIT CLOUD DEPLOYMENT

1. **GO TO STREAMLIT CLOUD:**
   - Visit: https://share.streamlit.io/
   - Sign in with your GitHub account

2. **CREATE NEW APP:**
   - Click "New app"
   - Repository: your-username/medical-rag-assistant
   - Branch: main
   - Main file path: app.py
   - Click "Deploy"

3. **WAIT FOR DEPLOYMENT:**
   - Initial deployment takes 2-5 minutes
   - Watch the logs for any errors
   - If successful, you'll get a URL like: 
     https://medical-rag-assistant.streamlit.app/

4. **TEST YOUR APP:**
   - Open your app URL
   - In sidebar, enter Google AI Studio API key
   - Click "Initialize Medical RAG System"
   - Start asking medical questions!

🎉 YOUR MEDICAL RAG ASSISTANT WILL BE LIVE!



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
