## Step 1: Setup Environment & Verify GPU

In [None]:
# Install required packages
!pip install -q pandas numpy scikit-learn matplotlib seaborn torch transformers datasets tqdm joblib

# Verify GPU availability
import torch
print("=" * 60)
print("GPU CHECK")
print("=" * 60)
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print("\n‚úì GPU is ready for training!")
else:
    print("\n‚ö† WARNING: GPU not detected!")
    print("Enable GPU: Settings ‚Üí Accelerator ‚Üí GPU T4 x2")
    print("Training will be MUCH slower on CPU.")
print("=" * 60)

## Step 2: Access Project Files

**Option 1: Add Dataset (Recommended)**
- Click "+ Add Data" in the right panel
- Upload your dataset or search for existing ones
- Your files will be in `/kaggle/input/your-dataset-name/`

**Option 2: Direct File Access**
- If you've already added your dataset, this cell will find it automatically


In [None]:
import os
import shutil
import zipfile

print("=" * 60)
print("ACCESSING PROJECT FILES")
print("=" * 60)

# Look for Kaggle input directory
kaggle_input = '/kaggle/input'
source_dir = None


def _contains_training_script(dataset_dir):
    """Return True if train_all_models.py exists anywhere under dataset_dir."""
    return any(
        'train_all_models.py' in files
        for _root, _dirs, files in os.walk(dataset_dir)
    )


if not os.path.exists(kaggle_input):
    print("‚úó Not running in Kaggle environment")
    print("\nüí° This notebook is designed for Kaggle.")
    print("If you're running locally, you'll need to modify the file paths.")
    raise EnvironmentError("This notebook is designed for Kaggle. /kaggle/input/ not found.")

# Sort so the choice does not depend on the arbitrary order of os.listdir().
datasets = sorted(
    d for d in os.listdir(kaggle_input)
    if os.path.isdir(os.path.join(kaggle_input, d))
)

if not datasets:
    print("‚úó No datasets found in /kaggle/input/")
    print("\n" + "=" * 60)
    print("üí° HOW TO ADD YOUR DATA:")
    print("=" * 60)
    print("\nüìÅ STEP-BY-STEP INSTRUCTIONS:")
    print("\n1. Look for the '+ Add Data' button on the RIGHT side of this page")
    print("2. Click it and select 'Upload' tab")
    print("3. Upload your ZIP file containing:")
    print("   - train_all_models.py")
    print("   - data/processed/medical_dataset.csv (or just medical_dataset.csv)")
    print("   - utils/ folder")
    print("4. Click 'Add Dataset' button")
    print("5. Wait for upload to complete (may take a few minutes)")
    print("6. Re-run this cell")
    print("\n" + "=" * 60)
    print("\n‚è∏Ô∏è  PAUSED: Waiting for you to add data...")
    print("This is NOT an error - just follow the steps above!")
    print("=" * 60)
    raise SystemExit("Please add your dataset and re-run this cell.")

# With one dataset there is nothing to choose. With several, prefer the one
# that actually holds the training script and fall back to the first
# (alphabetically) when none does, e.g. when the project is still zipped.
chosen_dataset = datasets[0]
if len(datasets) > 1:
    chosen_dataset = next(
        (d for d in datasets
         if _contains_training_script(os.path.join(kaggle_input, d))),
        datasets[0],
    )
    print(f"  Note: {len(datasets)} datasets attached: {', '.join(datasets)}")

source_dir = os.path.join(kaggle_input, chosen_dataset)
print(f"‚úì Found dataset: {chosen_dataset}")
print(f"  Location: {source_dir}")

# Check if it's a ZIP file that needs extraction.
# Sorted so the archive picked is deterministic; the suffix test ignores case.
zip_files = sorted(f for f in os.listdir(source_dir) if f.lower().endswith('.zip'))
if zip_files and not os.path.exists(os.path.join(source_dir, 'train_all_models.py')):
    print(f"\nüì¶ Found ZIP file: {zip_files[0]}")
    print("Extracting to working directory...")
    # Remove any extraction left over from an earlier run so stale files are
    # not mistaken for the contents of the current archive.
    if os.path.exists('extracted'):
        shutil.rmtree('extracted')
    with zipfile.ZipFile(os.path.join(source_dir, zip_files[0]), 'r') as zip_ref:
        zip_ref.extractall('extracted')
    # From here on the extracted tree is treated as the project source.
    source_dir = 'extracted'
    print("‚úì Extraction complete")

# Find project root if files are in subdirectory
if not os.path.exists(os.path.join(source_dir, 'train_all_models.py')):
    print("\nSearching for project files in subdirectories...")
    # First directory (in os.walk order) that contains the training script.
    project_root = next(
        (root for root, _dirs, files in os.walk(source_dir)
         if 'train_all_models.py' in files),
        None,
    )
    if project_root is None:
        print("‚úó train_all_models.py not found")
        print("\nüìÇ Available files in dataset:")
        # Show at most 10 files in total (not 10 per directory) as a hint.
        all_paths = (
            os.path.join(root, name)
            for root, _dirs, names in os.walk(source_dir)
            for name in names
        )
        for shown, path in enumerate(all_paths):
            if shown == 10:
                print("  ... (more files not shown)")
                break
            print(f"  - {path}")
        raise FileNotFoundError("train_all_models.py not found in dataset")
    source_dir = project_root
    print(f"‚úì Found project files in: {source_dir}")

# Reaching this point means train_all_models.py exists inside source_dir, so
# no separate existence check on source_dir itself is needed.
print(f"\n‚úì Source directory: {source_dir}")

# Copy project files to working directory
print("\nCopying files to working directory...")

# Training script: copied into the working directory under the same name.
script_src = os.path.join(source_dir, 'train_all_models.py')
if not os.path.exists(script_src):
    print("‚úó train_all_models.py not found in source")
else:
    shutil.copy2(script_src, 'train_all_models.py')
    print("‚úì Copied train_all_models.py")

# utils folder: any existing local copy is removed first, then replaced.
utils_src = os.path.join(source_dir, 'utils')
if not os.path.exists(utils_src):
    print("‚úó utils/ folder not found in source")
else:
    if os.path.exists('utils'):
        shutil.rmtree('utils')
    shutil.copytree(utils_src, 'utils')
    # Counts the entries directly inside utils/ (not recursive).
    utils_files = len(os.listdir('utils'))
    print(f"‚úì Copied utils/ folder ({utils_files} files)")

# Handle medical_dataset.csv - check multiple possible locations
# The two conventional locations are tried first, in this order.
possible_csv_paths = [
    os.path.join(source_dir, 'data', 'processed', 'medical_dataset.csv'),
    os.path.join(source_dir, 'medical_dataset.csv')
]
csv_path = next((p for p in possible_csv_paths if os.path.exists(p)), None)
success_message = "‚úì Copied medical_dataset.csv"

if csv_path is None:
    # Fall back to the first match anywhere below the source directory.
    print("Searching for medical_dataset.csv...")
    csv_path = next(
        (os.path.join(root, 'medical_dataset.csv')
         for root, _dirs, files in os.walk(source_dir)
         if 'medical_dataset.csv' in files),
        None,
    )
    success_message = "‚úì Found and copied medical_dataset.csv"

csv_found = csv_path is not None
if csv_found:
    # The training script's expected layout is recreated in the working dir.
    os.makedirs('data/processed', exist_ok=True)
    shutil.copy2(csv_path, 'data/processed/medical_dataset.csv')
    print(success_message)
else:
    print("‚úó medical_dataset.csv not found in source")

# Verify copied files
divider = "=" * 60
print("\n" + divider)
print("FILE VERIFICATION")
print(divider)

# Training script, reported with its size in bytes.
script_path = 'train_all_models.py'
if not os.path.exists(script_path):
    print("‚úó train_all_models.py - MISSING")
else:
    size = os.path.getsize(script_path)
    print(f"‚úì train_all_models.py ({size:,} bytes)")

# Dataset CSV, reported with its size in bytes.
csv_local_path = 'data/processed/medical_dataset.csv'
if not os.path.exists(csv_local_path):
    print("‚úó medical_dataset.csv - MISSING")
else:
    size = os.path.getsize(csv_local_path)
    print(f"‚úì medical_dataset.csv ({size:,} bytes)")

# utils folder, reported with the number of entries directly inside it.
if not os.path.exists('utils'):
    print("‚úó utils/ folder - MISSING")
else:
    utils_files = len(os.listdir('utils'))
    print(f"‚úì utils/ folder ({utils_files} files)")

print(f"\nWorking directory: {os.getcwd()}")
print(divider)

## Step 3: Verify Dataset

Check that your dataset is ready for training.

In [None]:
import pandas as pd
import os

print("=" * 60)
print("DATASET VERIFICATION")
print("=" * 60)

dataset_path = 'data/processed/medical_dataset.csv'

if not os.path.exists(dataset_path):
    print(f"‚úó ERROR: Dataset not found at {dataset_path}")
    raise FileNotFoundError(f"Dataset not found: {dataset_path}")

# Any failure below is printed once in the notebook's error style and then
# re-raised so the cell (and a "Run All") stops at the real problem.
try:
    df = pd.read_csv(dataset_path)
    print("‚úì Dataset loaded successfully")
    print(f"\nTotal samples: {len(df):,}")

    if len(df) == 0:
        raise ValueError("Dataset is empty")

    print(f"\nColumns: {list(df.columns)}")

    # Fail with an explicit message instead of a bare KeyError('label').
    if 'label' not in df.columns:
        raise ValueError(
            f"Dataset has no 'label' column (found: {list(df.columns)})"
        )

    print("\nLabel distribution:")
    label_counts = df['label'].value_counts()
    for label, count in label_counts.items():
        percentage = (count / len(df)) * 100
        # str() is required: the 's' format code rejects int/bool labels.
        print(f"  {str(label):12s}: {count:6,} ({percentage:5.1f}%)")

    # value_counts() skips missing values, so report them separately.
    missing_labels = int(df['label'].isna().sum())
    if missing_labels:
        percentage = (missing_labels / len(df)) * 100
        print(f"  {'<missing>':12s}: {missing_labels:6,} ({percentage:5.1f}%)")
        print(f"\nWARNING: {missing_labels:,} rows have no label")

    print("\n‚úì Dataset is ready for training!")

except Exception as e:
    print(f"\n‚úó ERROR: {e}")
    raise

print("=" * 60)

## Step 4: Clean Previous Results

Delete any old training results to start fresh.

In [None]:
import shutil
import os

print("=" * 60)
print("CLEANING PREVIOUS TRAINING RESULTS")
print("=" * 60)

dirs_to_clean = ['models', 'results', 'logs']
# Directories that could not be removed, so the summary below stays truthful.
failed_dirs = []

for dir_path in dirs_to_clean:
    if not os.path.exists(dir_path):
        print(f"  {dir_path}/ does not exist")
        continue
    try:
        shutil.rmtree(dir_path)
    except OSError as e:
        # Best effort: report the failure and continue with the other dirs.
        print(f"‚úó Error deleting {dir_path}: {e}")
        failed_dirs.append(dir_path)
    else:
        print(f"‚úì Deleted: {dir_path}/")

# Recreate fresh directory structure (logs/ is intentionally not recreated
# here, matching the original layout).
for parent in ('models', 'results'):
    for family in ('ml', 'dl', 'transformer'):
        os.makedirs(os.path.join(parent, family), exist_ok=True)

if failed_dirs:
    print(f"\nWARNING: Could not delete: {', '.join(failed_dirs)} (old files may remain)")
else:
    print("\n‚úì All previous results deleted")
print("‚úì Fresh directories created")
print("=" * 60)

## Step 5: Train All Models

This will train all 5 models:
- **ML Models:** Logistic Regression, Random Forest
- **DL Models:** CNN, LSTM
- **Transformer:** BioBERT

**Training improvements applied:**
- Label smoothing (0.1) to reduce overconfidence
- Stronger regularization (L2, dropout, weight decay)
- Lower learning rates for better convergence
- Early stopping to prevent overfitting

⏱ **Expected time:** 1-3 hours depending on dataset size and GPU

You can monitor progress below. The training will show:
- Current model being trained
- Epoch progress
- Training and validation metrics
- Final test results for each model

In [None]:
import os

# Disable W&B (Weights & Biases) tracking to avoid interactive prompts
os.environ['WANDB_DISABLED'] = 'true'
os.environ['WANDB_MODE'] = 'disabled'

print("=" * 60)
print("STARTING MODEL TRAINING")
print("=" * 60)
print("Training 5 models:")
print("  1. Logistic Regression (ML)")
print("  2. Random Forest (ML)")
print("  3. CNN (Deep Learning)")
print("  4. LSTM (Deep Learning)")
print("  5. BioBERT (Transformer)")
print("\nThis will take 1-3 hours. Progress shown below...")
print("=" * 60 + "\n")

!python train_all_models.py

## Step 6: Verify Training Results

Check that all models and results were created successfully.

In [None]:
import os
import json

def find_weights_file(filenames):
    """Return the name of a model-weights file among *filenames*, or None.

    A weights file is one ending in ``.safetensors`` or ``.bin``.
    ``training_args.bin`` is excluded: that is the file name the Hugging Face
    Trainer uses for its saved training arguments, so matching it would report
    weights as present when they are not. Candidates are sorted so the result
    does not depend on directory listing order.
    """
    candidates = sorted(
        name for name in filenames
        if name.endswith(('.safetensors', '.bin')) and name != 'training_args.bin'
    )
    return candidates[0] if candidates else None


print("=" * 60)
print("TRAINING RESULTS VERIFICATION")
print("=" * 60)

# Check for trained models
expected_models = {
    'ML Models': [
        'models/ml/logistic_regression.pkl',
        'models/ml/random_forest.pkl',
        'models/ml/tfidf_vectorizer.pkl',
        'models/ml/label_encoder.pkl'
    ],
    'DL Models': [
        'models/dl/cnn_best.pt',
        'models/dl/lstm_best.pt',
        'models/dl/vocab.json'
    ],
    'Transformer': [
        'models/transformer/biobert_final/config.json',
        'models/transformer/biobert_final/tokenizer_config.json'
    ]
}

expected_results = {
    'ML Results': [
        'results/ml/logistic_regression_metrics.json',
        'results/ml/random_forest_metrics.json'
    ],
    'DL Results': [
        'results/dl/cnn_metrics.json',
        'results/dl/lstm_metrics.json'
    ],
    'Transformer Results': [
        'results/transformer/biobert_metrics.json'
    ]
}

# Flipped to False as soon as any expected artifact is missing.
all_good = True

print("\nüìÅ MODELS:")
for category, files in expected_models.items():
    print(f"\n{category}:")
    for file_path in files:
        if os.path.exists(file_path):
            size = os.path.getsize(file_path) / (1024 * 1024)  # MB
            print(f"  ‚úì {os.path.basename(file_path)} ({size:.2f} MB)")
        else:
            print(f"  ‚úó {os.path.basename(file_path)} - MISSING")
            all_good = False

# Check transformer model weights. The weights file name is not fixed, so it
# is looked up by extension. A missing directory is already reported above
# through the missing config.json.
transformer_dir = 'models/transformer/biobert_final'
if os.path.exists(transformer_dir):
    model_file = find_weights_file(os.listdir(transformer_dir))
    if model_file:
        size = os.path.getsize(os.path.join(transformer_dir, model_file)) / (1024 * 1024)
        print(f"  ‚úì {model_file} ({size:.2f} MB)")
    else:
        print("  ‚úó BioBERT weights - MISSING")
        all_good = False

print("\nüìä RESULTS:")
for category, files in expected_results.items():
    print(f"\n{category}:")
    for file_path in files:
        if os.path.exists(file_path):
            print(f"  ‚úì {os.path.basename(file_path)}")
        else:
            print(f"  ‚úó {os.path.basename(file_path)} - MISSING")
            all_good = False

# Show summary metrics if available
summary_file = 'results/comprehensive_training_summary.json'
if os.path.exists(summary_file):
    print("\n" + "=" * 60)
    print("MODEL PERFORMANCE SUMMARY")
    print("=" * 60)
    # The summary is informational only: a malformed or unexpected file must
    # not stop the final verdict below from being printed.
    try:
        with open(summary_file, 'r', encoding='utf-8') as f:
            summary = json.load(f)

        print(f"\nTotal models trained: {summary['project_info']['total_models']}")
        print("\nTest Set Performance:")
        for model_key, model_data in summary['models'].items():
            metrics = model_data['test_set_metrics']
            print(f"\n{model_data['model_name'].upper()}:")
            print(f"  Accuracy:  {metrics['accuracy']:.4f}")
            print(f"  F1-Macro:  {metrics['f1_macro']:.4f}")
            print(f"  Precision: {metrics['precision_macro']:.4f}")
            print(f"  Recall:    {metrics['recall_macro']:.4f}")
    except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
        print(f"\nWARNING: Could not display summary from {summary_file}: {e!r}")

print("\n" + "=" * 60)
if all_good:
    print("‚úì ALL MODELS TRAINED SUCCESSFULLY!")
else:
    print("‚ö† WARNING: Some files are missing")
print("=" * 60)

## Step 7: Download Results

Package all trained models and results into a ZIP file and download directly. Progress updates keep the session alive during packaging to prevent timeout.

In [None]:
import os
import shutil
from datetime import datetime
import zipfile
import time
from IPython.display import FileLink, display

def print_progress(message):
    """Print progress to keep session alive"""
    print(f"  {message}")


def copy_tree(src_root, dst_root, progress_every=0):
    """Copy every file under src_root into dst_root, keeping relative paths.

    Args:
        src_root: Directory to walk.
        dst_root: Directory that receives the files (created as needed).
        progress_every: When > 0, print a progress line each time that many
            files have been copied by this call.

    Returns:
        The number of files copied.
    """
    copied = 0
    for root, _dirs, files in os.walk(src_root):
        for name in files:
            src = os.path.join(root, name)
            dst = os.path.join(dst_root, os.path.relpath(src, src_root))
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy2(src, dst)
            copied += 1
            if progress_every and copied % progress_every == 0:
                print_progress(f"Copied {copied} files...")
    return copied


print("=" * 60)
print("PREPARING DOWNLOAD PACKAGE")
print("=" * 60)
print("\n‚è≥ Packaging files... (this keeps session alive)")

# Create temporary directory for packaging
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_dir = f'Medical_Misinformation_Training_{timestamp}'
os.makedirs(output_dir, exist_ok=True)

# Per-category counts, so the README and the messages can report each
# section accurately.
model_count = 0
result_count = 0
dataset_count = 0

# try/finally: the staging directory holds a full copy of the models, so it
# is removed even if copying or compression fails part-way.
try:
    # Copy models folder with progress
    if os.path.exists('models'):
        print("\nüìÅ Copying models...")
        model_count = copy_tree('models', os.path.join(output_dir, 'models'),
                                progress_every=5)
        print(f"  ‚úì Copied models/ ({model_count} files)")

    # Copy results folder
    if os.path.exists('results'):
        print("\nüìä Copying results...")
        result_count = copy_tree('results', os.path.join(output_dir, 'results'))
        print(f"  ‚úì Copied results/ ({result_count} files)")

    # Copy dataset
    if os.path.exists('data/processed/medical_dataset.csv'):
        print("\nüìÑ Copying dataset...")
        os.makedirs(os.path.join(output_dir, 'data'), exist_ok=True)
        shutil.copy2('data/processed/medical_dataset.csv',
                     os.path.join(output_dir, 'data', 'medical_dataset.csv'))
        dataset_count = 1
        print("  ‚úì Copied dataset")

    files_copied = model_count + result_count + dataset_count

    # Create README
    readme_content = f"""# Medical Misinformation Detection - Training Results
Training completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Contents:

### Models ({model_count} files)
- models/ml/ - Logistic Regression & Random Forest
- models/dl/ - CNN & LSTM (PyTorch)
- models/transformer/ - BioBERT

### Results ({result_count} files)
- results/ml/ - ML model metrics and confusion matrices
- results/dl/ - DL model metrics, confusion matrices, and training curves
- results/transformer/ - BioBERT metrics and confusion matrix
- results/comprehensive_training_summary.json - Complete summary

### Data
- data/medical_dataset.csv - Dataset used for training

## How to Use:
1. Extract this ZIP file on your local machine
2. Review metrics in results/ folder
3. Load models in your application:
   - ML models: joblib.load()
   - DL models: torch.load()
   - Transformer: AutoModel.from_pretrained()
"""

    readme_path = os.path.join(output_dir, 'README.txt')
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(readme_content)

    # Create ZIP file with progress updates
    print("\nüì¶ Creating ZIP file...")
    print("  (Printing progress to keep session alive)")
    zip_filename = f'{output_dir}.zip'

    files_to_zip = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(output_dir)
        for name in names
    ]

    total_files = len(files_to_zip)
    print(f"  Compressing {total_files} files...")

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for idx, file_path in enumerate(files_to_zip, 1):
            # Archive names keep the top-level package folder as a prefix.
            arcname = os.path.relpath(file_path, os.path.dirname(output_dir))
            zipf.write(file_path, arcname)
            # Print progress every 10 files to keep session alive
            if idx % 10 == 0 or idx == total_files:
                print(f"  Progress: {idx}/{total_files} files compressed...")
finally:
    # Clean up temporary directory
    shutil.rmtree(output_dir, ignore_errors=True)

zip_size = os.path.getsize(zip_filename) / (1024 * 1024)  # MB
print(f"\n‚úì ZIP file created: {zip_filename}")
print(f"  Size: {zip_size:.2f} MB")
# total_files includes the generated README, matching the archive contents.
print(f"  Total files: {total_files}")

print("\n" + "=" * 60)
print("‚úì PACKAGE READY FOR DOWNLOAD!")
print("=" * 60)

print("\nüì• DOWNLOADING...")
print("  Click the link below to download:")
print("  (If download doesn't start, right-click ‚Üí Save Link As)")
print()

# Display download link
display(FileLink(zip_filename))

print("\n" + "=" * 60)
print("‚úì Download link displayed above")
print("  The file will also appear in the Output tab")
print("=" * 60)

## 🎉 Training Complete!

Your models have been trained and packaged for download.

**ZIP file:** `Medical_Misinformation_Training_[timestamp].zip`

### Contents:

**Models:**
- `models/ml/` - Logistic Regression & Random Forest
- `models/dl/` - CNN & LSTM
- `models/transformer/` - BioBERT

**Results:**
- `results/ml/` - ML model metrics and confusion matrices
- `results/dl/` - DL model metrics, confusion matrices, and training curves
- `results/transformer/` - BioBERT metrics and confusion matrix
- `results/comprehensive_training_summary.json` - Complete summary

**Data:**
- `data/medical_dataset.csv` - Dataset used for training

### Download Instructions:

1. **Click the download link in the output above** (Step 7 cell output)
2. If the link doesn't work, **right-click → Save Link As...**
3. **Alternative:** Check the **Output** tab (top-right) for the ZIP file

### Next Steps:
1. Extract the ZIP file on your local machine
2. Review metrics in the `results/` folder
3. Use the trained models for predictions in your application

---

**Need to retrain?** Re-run from Step 5 (training cell).