# Medical Misinformation Detection - Google Colab

## Setup Instructions

1. Upload this notebook to Google Colab
2. Upload your project files (see Step 2)
3. Enable GPU: Runtime → Change runtime type → GPU
4. Run all cells in order

## Workflow

1. Download data
2. Process and label data
3. Train all models (ML, DL, Transformer)


## Step 1: Setup Environment


In [None]:
# Install required packages
!pip install -q pandas numpy requests beautifulsoup4 scikit-learn matplotlib seaborn torch transformers datasets tqdm kaggle

# Verify GPU availability
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("GPU not detected. Enable GPU in Runtime → Change runtime type")
    print("Training will be slower on CPU.")


## Step 2: Upload Project Files

Upload a ZIP or RAR file containing:
- `data_downloader.py`
- `process_and_label_data.py`
- `train_all_models.py`
- `utils/` folder
- `api_keys.py` (optional, for Kaggle API)
- `disease_symptoms.csv` (optional)


In [None]:
from google.colab import files
import zipfile
import os
import subprocess

print("=" * 60)
print("Upload Project File (ZIP or RAR)")
print("=" * 60)

# Opens the Colab upload widget; the result is iterated below by file name.
uploaded = files.upload()

# Becomes True once at least one archive has been unpacked successfully.
extracted = False
for filename in uploaded.keys():
    print(f"\nProcessing {filename}...")

    # Compare on a lower-cased copy so "Project.ZIP" / "Project.RAR" are
    # recognised as archives instead of being reported as an unknown type.
    lowered_name = filename.lower()

    if lowered_name.endswith('.zip'):
        print("Extracting ZIP file...")
        try:
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall('.')
            print("ZIP extraction complete")
            extracted = True
        except Exception as e:
            # Best effort: report the problem and keep going with other uploads.
            print(f"Error extracting ZIP: {e}")

    elif lowered_name.endswith('.rar'):
        print("Extracting RAR file...")
        try:
            # Install unrar on demand; failures here surface when unrar is run.
            subprocess.run(['apt-get', 'update'], check=False, capture_output=True)
            subprocess.run(['apt-get', 'install', '-y', 'unrar'], check=False, capture_output=True)

            result = subprocess.run(['unrar', 'x', filename, '-y'], capture_output=True, text=True)

            if result.returncode == 0:
                print("RAR extraction complete")
                extracted = True
            else:
                print(f"Error extracting RAR: {result.stderr[:200]}")
        except Exception as e:
            print(f"Error: {e}")

    else:
        print(f"Unknown file type: {filename}")

if not extracted:
    print("\nNo files were extracted. Check your file format.")

print("\n" + "=" * 60)
print("Looking for project files...")
print("=" * 60)

# Count the entries in the working directory; use an empty listing on failure.
try:
    current_files = os.listdir('.')
except Exception as e:
    print(f"Error listing directory: {e}")
    current_files = []
else:
    print(f"Files in current directory: {len(current_files)} items")

# Scripts that must sit together in the project directory.
required_files = ['data_downloader.py', 'process_and_label_data.py', 'train_all_models.py']
found_files = [name for name in required_files if os.path.exists(name)]
missing_files = [name for name in required_files if name not in found_files]

if not missing_files:
    print("All required files found in current directory")
    print(f"Current directory: {os.getcwd()}")
else:
    print("Required files not in current directory")
    print(f"Found: {found_files}")
    print(f"Missing: {missing_files}")
    print("\nSearching for project directory...")

    possible_dirs = [
        'Medical Misinformation Detection',
        'Medical_Misinformation_Detection',
        'medical-misinformation-detection'
    ]

    # First choice: the first walked directory that directly contains every script.
    found_dir = next(
        (
            folder
            for folder, _subdirs, names in os.walk('.')
            if set(required_files) <= set(names)
        ),
        None,
    )
    if found_dir:
        print(f"Found project directory: {found_dir}")
    else:
        # Fallback: probe a few conventional folder names by path.
        for dir_name in possible_dirs:
            if not os.path.exists(dir_name):
                continue
            if all(os.path.exists(os.path.join(dir_name, name)) for name in required_files):
                found_dir = dir_name
                print(f"Found project directory: {dir_name}")
                break

    if not found_dir:
        print("\nProject directory not found automatically.")
        print("Check the file browser on the left")
    else:
        if found_dir != '.':
            os.chdir(found_dir)
        print(f"\nChanged to project directory: {found_dir}")
        print(f"Current directory: {os.getcwd()}")
        # Re-check relative to the new working directory.
        found_files = [name for name in required_files if os.path.exists(name)]
        if len(found_files) == len(required_files):
            print("All required files now found")

print("\n" + "=" * 60)
print("Final File Check")
print("=" * 60)

# One status line per required script; for a missing one, list any copies
# found elsewhere under the working directory.
for script in required_files:
    if not os.path.exists(script):
        print(f"{script} - MISSING")
        for folder, _subdirs, names in os.walk('.'):
            if script in names:
                print(f"  Found at: {os.path.join(folder, script)}")
        continue
    print(f"{script} ({os.path.getsize(script):,} bytes)")

# Same idea for the utils/ package directory.
if not os.path.exists('utils'):
    print("utils/ folder - MISSING")
    for folder, subdirs, _names in os.walk('.'):
        if 'utils' in subdirs:
            print(f"  Found utils at: {os.path.join(folder, 'utils')}")
else:
    print(f"utils/ folder found ({len(os.listdir('utils'))} files)")

# api_keys.py is optional, so its absence is only reported, not flagged.
api_keys_status = "found" if os.path.exists('api_keys.py') else "not found (optional)"
print(f"\napi_keys.py {api_keys_status}")

print(f"\nCurrent working directory: {os.getcwd()}")
print("=" * 60)


## Step 3: Setup Kaggle API

Reads Kaggle credentials from `api_keys.py` if it was included in the Step 2 upload; otherwise it prompts you to upload `kaggle.json` directly. Skip this cell if you are not using Kaggle.


In [None]:
from google.colab import files
import os
import json

import shutil

print("=" * 60)
print("Kaggle API Setup")
print("=" * 60)

# Destination of the Kaggle credential file written by this cell.
KAGGLE_DIR = '/root/.kaggle'
KAGGLE_JSON_PATH = '/root/.kaggle/kaggle.json'


def _restrict_kaggle_permissions():
    """Make the credential file and its directory accessible to the owner only.

    os.chmod takes the mode as an integer, so permission bits must be written
    in octal (0o600 / 0o700). The decimal literals 600 and 700 set unrelated bits.
    """
    os.chmod(KAGGLE_JSON_PATH, 0o600)
    os.chmod(KAGGLE_DIR, 0o700)


kaggle_configured = False

if os.path.exists('api_keys.py'):
    print("\nFound api_keys.py - checking for Kaggle credentials...")
    try:
        with open('api_keys.py', 'r') as f:
            content = f.read()

        username = None
        key = None

        for line in content.split('\n'):
            stripped = line.strip()
            # Skip commented-out assignments and lines that assign nothing.
            if stripped.startswith('#') or '=' not in stripped:
                continue
            # Split on the first '=' only, so a value containing '=' stays intact.
            target, _, raw_value = stripped.partition('=')
            value = raw_value.strip().strip('"').strip("'").strip()
            if 'KAGGLE_USERNAME' in target:
                username = value
            elif 'KAGGLE_KEY' in target:
                key = value

        if username and key:
            print("Found Kaggle credentials")
            print(f"Username: {username}")

            os.makedirs(KAGGLE_DIR, exist_ok=True)
            kaggle_config = {
                "username": username,
                "key": key
            }

            with open(KAGGLE_JSON_PATH, 'w') as f:
                json.dump(kaggle_config, f)

            _restrict_kaggle_permissions()

            print("Kaggle API configured from api_keys.py")
            kaggle_configured = True
        else:
            print("api_keys.py found but Kaggle credentials not found")
    except Exception as e:
        # Best effort: fall through to the manual upload path below.
        print(f"Error reading api_keys.py: {e}")
else:
    print("api_keys.py not found")

if not kaggle_configured:
    print("\nAlternative: Upload kaggle.json file directly")
    print("Get kaggle.json from: https://www.kaggle.com/account\n")

    try:
        uploaded = files.upload()

        for filename in uploaded.keys():
            if filename == 'kaggle.json':
                os.makedirs(KAGGLE_DIR, exist_ok=True)
                # shutil.move also works when source and destination are on
                # different filesystems, where os.rename raises OSError.
                shutil.move('kaggle.json', KAGGLE_JSON_PATH)
                _restrict_kaggle_permissions()

                print("Kaggle API configured from uploaded file")
                kaggle_configured = True
                break
    except Exception as e:
        print(f"Upload skipped: {e}")

print("\n" + "=" * 60)
if kaggle_configured:
    print("Kaggle API Configured")
    print("Kaggle datasets will be downloaded in Step 4")
else:
    print("Kaggle API Not Configured")
    print("Kaggle datasets will be skipped")
print("=" * 60)


## Step 4: Download Data


In [None]:
import os
print(f"Current directory: {os.getcwd()}")
print(f"data_downloader.py exists: {os.path.exists('data_downloader.py')}\n")

kaggle_config = os.path.expanduser('~/.kaggle/kaggle.json')
if os.path.exists(kaggle_config):
    print("Kaggle API is configured")
else:
    print("Kaggle API not configured - Kaggle datasets will be skipped")

print("\n" + "=" * 60)
print("Starting Data Download")
print("=" * 60)
!python data_downloader.py


## Step 5: Process and Label Data


In [None]:
import os

print("=" * 60)
print("Checking Available Data")
print("=" * 60)

raw_data_path = 'data/processed/raw_downloaded_data.csv'
if os.path.exists(raw_data_path):
    size = os.path.getsize(raw_data_path)
    print(f"Raw data found: {size:,} bytes")
else:
    print("Raw data file not found")

data_dir = 'general_medical_misinformation_data'
if os.path.exists(data_dir):
    files = []
    for root, dirs, file_list in os.walk(data_dir):
        files.extend([os.path.join(root, f) for f in file_list])
    print(f"Found {len(files)} files in download directory")
else:
    print("Download directory not found")

processed_path = 'data/processed/medical_dataset.csv'
if os.path.exists(processed_path):
    size = os.path.getsize(processed_path)
    print(f"Existing processed dataset found: {size:,} bytes")
else:
    print("No existing processed dataset")

print("\n" + "=" * 60)
print("Starting Data Processing and Labeling")
print("=" * 60)
!python process_and_label_data.py


## Step 6: Train All Models

Trains ML models (Logistic Regression, Random Forest), DL models (CNN, LSTM), and Transformer (BioBERT).

May take 1-3 hours depending on dataset size and GPU.


In [None]:
import os
import pandas as pd

dataset_path = 'data/processed/medical_dataset.csv'
if os.path.exists(dataset_path):
    print(f"Found dataset: {dataset_path}")
    try:
        df_sample = pd.read_csv(dataset_path, nrows=5)
        print(f"Dataset columns: {list(df_sample.columns)}")
        full_df = pd.read_csv(dataset_path)
        print(f"Total rows: {len(full_df):,}")
        if len(full_df) == 0:
            print("ERROR: Dataset is empty")
            raise ValueError("Dataset is empty")
        print("Label distribution:")
        label_counts = full_df['label'].value_counts()
        for label, count in label_counts.items():
            print(f"  {label}: {count:,}")
    except Exception as e:
        print(f"Error reading dataset: {e}")
        raise
else:
    print(f"ERROR: Dataset not found: {dataset_path}")
    raise FileNotFoundError(f"Dataset not found: {dataset_path}")

print("\n" + "=" * 60)
print("Starting Model Training")
print("=" * 60)
!python train_all_models.py


## Step 7: Download Results


In [None]:
import zipfile
import os
from datetime import datetime
from google.colab import files

print("=" * 60)
print("Verifying All Models and Results")
print("=" * 60)

# Model artifact paths to verify, grouped by the category heading that is
# printed for them. Paths are relative to the current working directory.
required_models = {
    'ML Models (2)': [
        'models/ml/logistic_regression.pkl',
        'models/ml/random_forest.pkl',
        'models/ml/tfidf_vectorizer.pkl',
        'models/ml/label_encoder.pkl'
    ],
    'DL Models (2)': [
        'models/dl/cnn_best.pt',
        'models/dl/lstm_best.pt'
    ],
    # Only config.json is listed here; the weights file is looked up
    # separately by extension in the verification loop.
    'Transformer Model (1)': [
        'models/transformer/biobert_final/config.json'
    ]
}

# Metric and plot files to verify, grouped the same way as the models above.
required_results = {
    'ML Results': [
        'results/ml/logistic_regression_metrics.json',
        'results/ml/logistic_regression_confusion_matrix.png',
        'results/ml/random_forest_metrics.json',
        'results/ml/random_forest_confusion_matrix.png'
    ],
    'DL Results': [
        'results/dl/cnn_metrics.json',
        'results/dl/cnn_confusion_matrix.png',
        'results/dl/cnn_curves.png',
        'results/dl/lstm_metrics.json',
        'results/dl/lstm_confusion_matrix.png',
        'results/dl/lstm_curves.png'
    ],
    'Transformer Results': [
        'results/transformer/biobert_metrics.json',
        'results/transformer/biobert_confusion_matrix.png'
    ]
}

missing_items = []
found_items = []


def _report_paths(paths):
    """Print a ✓/✗ line per path and record it in found_items or missing_items."""
    for path in paths:
        label = os.path.basename(path)
        if not os.path.exists(path):
            print(f"  ✗ {label} - MISSING")
            missing_items.append(path)
            continue
        print(f"  ✓ {label} ({os.path.getsize(path):,} bytes)")
        found_items.append(path)


def _find_transformer_weights(model_dir):
    """Return the first ``.safetensors``/``.bin`` entry of *model_dir*, or None."""
    for entry in os.listdir(model_dir):
        if entry.endswith(('.safetensors', '.bin')):
            return os.path.join(model_dir, entry)
    return None


print("\nChecking Models:")
for category, files_list in required_models.items():
    print(f"\n{category}:")
    _report_paths(files_list)

    if category != 'Transformer Model (1)':
        continue

    # Besides config.json, look for a weights file in the model folder.
    transformer_dir = 'models/transformer/biobert_final'
    if not os.path.exists(transformer_dir):
        continue

    model_file = _find_transformer_weights(transformer_dir)
    if model_file and os.path.exists(model_file):
        print(f"  ✓ {os.path.basename(model_file)} ({os.path.getsize(model_file):,} bytes)")
        found_items.append(model_file)
    else:
        print("  ✗ Transformer model weights file - MISSING")
        if not model_file:
            missing_items.append('models/transformer/biobert_final/model.safetensors or pytorch_model.bin')

print("\nChecking Results:")
for category, files_list in required_results.items():
    print(f"\n{category}:")
    _report_paths(files_list)

# Summarise: either the full success list or the list of what is missing.
if not missing_items:
    print("\n" + "=" * 60)
    print("All Required Models and Results Found!")
    print("=" * 60)
    print("✓ 2 ML Models (Logistic Regression, Random Forest)")
    print("✓ 2 DL Models (CNN, LSTM)")
    print("✓ 1 Transformer Model (BioBERT)")
    print("✓ All Results Files")
else:
    print("\n" + "=" * 60)
    print("WARNING: Some files are missing!")
    print("=" * 60)
    print(f"Missing {len(missing_items)} file(s):")
    for item in missing_items:
        print(f"  - {item}")
    print("\nThe ZIP will be created with available files only.")

print("\n" + "=" * 60)
print("Creating ZIP Archive")
print("=" * 60)

# A timestamp in the name keeps archives from repeated runs apart.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
zip_filename = f'medical_misinfo_results_{timestamp}.zip'

files_added = 0
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add every file under models/ and then results/, keeping relative paths.
    for top_dir in ('models', 'results'):
        if not os.path.exists(top_dir):
            continue
        for folder, _subdirs, names in os.walk(top_dir):
            for name in names:
                zipf.write(os.path.join(folder, name))
                files_added += 1

    # The processed dataset is included when it exists.
    dataset_csv = 'data/processed/medical_dataset.csv'
    if os.path.exists(dataset_csv):
        zipf.write(dataset_csv)
        files_added += 1

zip_size = os.path.getsize(zip_filename) / (1024 * 1024)
print(f"\nCreated {zip_filename}")
print(f"Total files: {files_added}")
print(f"Archive size: {zip_size:.2f} MB")

print("\n" + "=" * 60)
print("Starting Download")
print("=" * 60)
files.download(zip_filename)
print("\nDownload initiated. Check your browser's download folder.")
print("=" * 60)


In [None]:
# Step 7: Save Outputs to Google Drive
# Copies models/, results/ and selected CSVs from data/processed/ into a
# Google Drive folder.
# NOTE(review): the "Download Results" section is also labelled Step 7 — confirm the intended numbering.
print("\n" + "=" * 60)
print("Step 7: Save Outputs to Google Drive")
print("=" * 60)

from google.colab import drive
import os

# Mount Drive and make sure the destination folder exists.
drive.mount('/content/drive')
output_dir = '/content/drive/MyDrive/Medical_Misinformation_Project'
os.makedirs(output_dir, exist_ok=True)

# Copy folders and key files.
# The cp commands repeat the destination path literally instead of using
# output_dir, so the two must be kept in sync by hand.
!cp -r models "/content/drive/MyDrive/Medical_Misinformation_Project/"
!cp -r results "/content/drive/MyDrive/Medical_Misinformation_Project/"
!cp data/processed/medical_dataset.csv "/content/drive/MyDrive/Medical_Misinformation_Project/"
# The remaining files are treated as optional: `|| true` keeps the shell exit
# status at zero when a file is absent. The rag_vs_nonrag_* CSVs are written
# by the RAG vs Non-RAG evaluation cell further down the notebook.
!cp data/processed/top_myths.csv "/content/drive/MyDrive/Medical_Misinformation_Project/" || true
!cp data/processed/top_facts.csv "/content/drive/MyDrive/Medical_Misinformation_Project/" || true
!cp data/processed/qa_pairs_100.csv "/content/drive/MyDrive/Medical_Misinformation_Project/" || true
!cp data/processed/rag_vs_nonrag_comparison.csv "/content/drive/MyDrive/Medical_Misinformation_Project/" || true
!cp data/processed/rag_vs_nonrag_detailed.csv "/content/drive/MyDrive/Medical_Misinformation_Project/" || true

print("\nFiles saved to Google Drive -> Medical_Misinformation_Project")


In [None]:
# Step 8: RAG vs Non-RAG Evaluation
print("\n" + "=" * 60)
print("Step 8: RAG vs Non-RAG Evaluation")
print("=" * 60)

import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

qa_path = 'data/processed/qa_pairs_100.csv'
kb_path = 'data/processed/medical_dataset.csv'

if not os.path.exists(qa_path) or not os.path.exists(kb_path):
    raise FileNotFoundError("QA pairs or medical_dataset.csv not found. Run Steps 4-5 first.")

qa_df = pd.read_csv(qa_path)
kb_df = pd.read_csv(kb_path)

# The retrieval knowledge base is the text of rows labelled 'credible',
# limited to the first 20,000 entries.
credible_texts = kb_df[kb_df['label'] == 'credible']['text'].dropna().tolist()
credible_texts = credible_texts[:20000]

# Fail with a clear message instead of letting the vectorizer raise its
# "empty vocabulary" error when there is nothing to index.
if not credible_texts:
    raise ValueError(
        f"No rows labelled 'credible' with text found in {kb_path}; "
        "cannot build the retrieval index."
    )

# TF-IDF index over the knowledge base; the retrieval and scoring functions
# in this cell reuse this fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
kb_matrix = vectorizer.fit_transform(credible_texts)

# Substrings whose presence in an answer sets its safety score to 0.
harmful_terms = ['drink bleach', 'poison', 'dangerous advice', 'no treatment needed']


def rag_answer(question, top_k=3):
    """Answer *question* by concatenating the most similar knowledge-base texts.

    Args:
        question: Query text, vectorised with the module-level ``vectorizer``.
        top_k: Maximum number of texts to retrieve. Values <= 0 retrieve nothing.

    Returns:
        ``(answer, retrieved)`` where ``retrieved`` lists the selected texts
        from most to least similar and ``answer`` is those texts joined by
        single spaces.
    """
    if top_k <= 0:
        # Without this guard the slice ``[-0:]`` selects the whole ranking,
        # so asking for zero documents would return the entire knowledge base.
        return "", []

    q_vec = vectorizer.transform([question])
    sims = cosine_similarity(q_vec, kb_matrix)[0]
    # argsort is ascending: take the last top_k indices, then reverse them.
    top_idx = sims.argsort()[-top_k:][::-1]
    retrieved = [credible_texts[i] for i in top_idx]
    answer = " ".join(retrieved)
    return answer, retrieved


def baseline_answer(question):
    """Return the fixed non-retrieval reply; *question* is not inspected.

    Returns:
        ``(answer, [])`` — the canned advisory text and an empty retrieval list.
    """
    sentences = (
        "Further medical evaluation is required.",
        "Consult verified health sources such as WHO/CDC for precise guidance.",
    )
    return " ".join(sentences), []


def evaluate_answer(pred, reference, retrieved):
    """Score a predicted answer against a reference answer.

    Args:
        pred: Predicted answer text.
        reference: Reference answer; converted with ``str()`` before use.
        retrieved: Texts the answer was built from (empty for the baseline).

    Returns:
        Dict with four scores:
        ``factuality`` — F1 of the lower-cased whitespace-token overlap;
        ``completeness`` — recall of that overlap;
        ``faithfulness`` — cosine similarity between the ``vectorizer``
        representations of the joined ``retrieved`` texts and ``pred``,
        capped at 1.0, or 0.0 when nothing was retrieved;
        ``safety`` — 0 if any entry of ``harmful_terms`` occurs in the
        lower-cased prediction, else 1.
    """
    pred_lower = pred.lower()
    pred_tokens = set(pred_lower.split())
    ref_tokens = set(str(reference).lower().split())
    shared = len(pred_tokens.intersection(ref_tokens))

    # The 1e-6 terms keep the divisions defined when a token set is empty.
    precision = shared / (len(pred_tokens) + 1e-6)
    recall = shared / (len(ref_tokens) + 1e-6)
    f1 = (2 * precision * recall) / (precision + recall + 1e-6)

    faithfulness = 0.0
    if retrieved:
        context_vec = vectorizer.transform([' '.join(retrieved)])
        answer_vec = vectorizer.transform([pred])
        faithfulness = float(cosine_similarity(context_vec, answer_vec)[0][0])

    flagged = any(term in pred_lower for term in harmful_terms)

    return {
        'factuality': f1,
        'completeness': recall,
        'faithfulness': min(1.0, faithfulness),
        'safety': 0 if flagged else 1
    }

# Evaluate both answerers on the first 50 QA pairs. Each question is answered
# and scored once; the same scores feed the averages and the per-question
# table, so rag_scores / baseline_scores stay lists throughout.
subset = qa_df.head(50)
if subset.empty:
    # Otherwise the averaging below would fail with an opaque IndexError.
    raise ValueError(f"No QA pairs found in {qa_path}; nothing to evaluate.")

rag_scores = []
baseline_scores = []
detailed_results = []

for _, row in subset.iterrows():
    question = row['question']
    reference = row.get('answer', '')

    rag_pred, retrieved = rag_answer(question)
    baseline_pred, _ = baseline_answer(question)

    rag_eval = evaluate_answer(rag_pred, reference, retrieved)
    baseline_eval = evaluate_answer(baseline_pred, reference, [])
    rag_scores.append(rag_eval)
    baseline_scores.append(baseline_eval)

    detailed_results.append({
        'question': question,
        'reference_answer': reference,
        'rag_answer': rag_pred,
        'rag_factuality': rag_eval['factuality'],
        'rag_completeness': rag_eval['completeness'],
        'rag_faithfulness': rag_eval['faithfulness'],
        'rag_safety': rag_eval['safety'],
        'nonrag_answer': baseline_pred,
        'nonrag_factuality': baseline_eval['factuality'],
        'nonrag_completeness': baseline_eval['completeness'],
        'nonrag_faithfulness': baseline_eval['faithfulness'],
        'nonrag_safety': baseline_eval['safety']
    })

# Mean of every metric across the evaluated questions.
rag_avg = {k: np.mean([score[k] for score in rag_scores]) for k in rag_scores[0]}
baseline_avg = {k: np.mean([score[k] for score in baseline_scores]) for k in baseline_scores[0]}

print("RAG Metrics (averaged over sample):")
for metric, value in rag_avg.items():
    print(f"  {metric.title()}: {value:.4f}")

print("\nNon-RAG Baseline Metrics (averaged over sample):")
for metric, value in baseline_avg.items():
    print(f"  {metric.title()}: {value:.4f}")

print("\n" + "=" * 60)
print("Saving Comparison Results")
print("=" * 60)

# Save summary comparison
comparison_data = {
    'Model': ['RAG', 'Non-RAG (Baseline)'],
    'Factuality': [rag_avg['factuality'], baseline_avg['factuality']],
    'Completeness': [rag_avg['completeness'], baseline_avg['completeness']],
    'Faithfulness': [rag_avg['faithfulness'], baseline_avg['faithfulness']],
    'Safety': [rag_avg['safety'], baseline_avg['safety']]
}

comparison_df = pd.DataFrame(comparison_data)
comparison_path = 'data/processed/rag_vs_nonrag_comparison.csv'
comparison_df.to_csv(comparison_path, index=False)
print(f"✓ Saved summary comparison to {comparison_path}")

# Save detailed results for each QA pair
detailed_df = pd.DataFrame(detailed_results)
detailed_path = 'data/processed/rag_vs_nonrag_detailed.csv'
detailed_df.to_csv(detailed_path, index=False)
print(f"✓ Saved detailed results for {len(detailed_df)} QA pairs to {detailed_path}")

# Mirror the two evaluation CSVs to Google Drive (best effort).
from google.colab import drive
import shutil

try:
    drive.mount('/content/drive', force_remount=True)
    output_dir = '/content/drive/MyDrive/Medical_Misinformation_Project'
    os.makedirs(output_dir, exist_ok=True)

    for report_name in ('rag_vs_nonrag_comparison.csv', 'rag_vs_nonrag_detailed.csv'):
        src = f'data/processed/{report_name}'
        if not os.path.exists(src):
            continue
        shutil.copy2(src, os.path.join(output_dir, report_name))
        print(f"✓ Copied {report_name} to Google Drive")

    print("\nAll RAG vs Non-RAG results saved to Google Drive -> Medical_Misinformation_Project")
except Exception as e:
    # A Drive failure is reported but does not abort the cell.
    print(f"Note: Could not copy to Google Drive ({e})")
    print("Files are saved locally in data/processed/")

closing_banner = "=" * 60
print("\n" + closing_banner)
print("RAG vs Non-RAG Evaluation Complete")
print(closing_banner)
print("Files created:")
print("  - rag_vs_nonrag_comparison.csv (summary metrics)")
print("  - rag_vs_nonrag_detailed.csv (per-QA pair results)")
