# In [None]:
# notebooks/02_Baseline_Models.ipynb

import pandas as pd
from sklearn.model_selection import train_test_split
from src.preprocessing import run_full_preprocessing_pipeline, create_tfidf_features
from src.traditional_models import train_and_evaluate_baselines
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---
# Column names used throughout the notebook.
# Placeholder: Adjust to the actual column name in your CSV
TEXT_COL = 'Report_Text'
# Placeholder: Adjust to the actual column name in your CSV
LABEL_COL = 'Cancer_Type'

# Raw input files, given relative to the notebook's working directory.
TCGA_REPORTS_PATH = '../data/raw/TCGA_Reports.csv'
TCGA_LABELS_PATH = '../data/raw/tcga_patient_to_cancer_type.csv'

# --- 1. Data Loading and Merging (TCGA) ---
# Reads the reports and labels CSVs and combines them into a two-column
# DataFrame (text + label) with incomplete rows removed.
print("1. Loading TCGA Data...")
reports_df = pd.read_csv(TCGA_REPORTS_PATH)
labels_df = pd.read_csv(TCGA_LABELS_PATH)

# Assume your reports_df has a patient identifier, e.g., 'Patient_ID', which links to the labels_df
# The report mentioned handling a UUID pattern, which implies a merge is needed
# TODO: switch to the key-based merge below once the identifier column is confirmed:
# TCGA_Dataset = reports_df.merge(labels_df, on='Patient_ID', how='inner').dropna(subset=[TEXT_COL, LABEL_COL]).reset_index(drop=True)

# Placeholder: Use a mock merge for structure until actual files are loaded.
# NOTE: this pairs row i of the reports file with row i of the labels file
# (both Series are aligned on their index). It is only correct if the two
# files list the same patients in the same order.
if len(reports_df) != len(labels_df):
    # Extra rows on either side end up with a missing value and are dropped
    # by dropna() below, so make the mismatch visible instead of silent.
    print(
        f"WARNING: reports ({len(reports_df)} rows) and labels "
        f"({len(labels_df)} rows) differ in length; positional pairing "
        "may misalign texts and labels."
    )

# Key the columns by TEXT_COL / LABEL_COL (not hard-coded names) so the later
# lookups by those constants keep working when the placeholders are adjusted.
TCGA_Dataset = pd.DataFrame({
    TEXT_COL: reports_df[TEXT_COL],
    LABEL_COL: labels_df[LABEL_COL],
}).dropna().reset_index(drop=True)

print(f"Final TCGA dataset size: {len(TCGA_Dataset)}")

# --- 2. Preprocessing and Feature Extraction ---
print("\n2. Running Preprocessing Pipeline...")
TCGA_processed = run_full_preprocessing_pipeline(
    TCGA_Dataset,
    text_column=TEXT_COL,
)

# Model inputs: the text column produced by the pipeline and the class label.
X, y = TCGA_processed['processed_text'], TCGA_processed[LABEL_COL]

# Plot how many samples each class has, to check for class imbalance.
_, _class_ax = plt.subplots(figsize=(12, 5))
sns.countplot(y=y, ax=_class_ax)
_class_ax.set_title('TCGA Class Distribution (Imbalanced)')
plt.show()

# 80/20 hold-out split, stratified on the label so both partitions keep the
# class proportions; the fixed seed makes the split reproducible.
_split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_kwargs)

# TF-IDF features. The vectorizer settings live inside create_tfidf_features
# (reportedly max_features=3000, ngram_range=(1, 2) -- confirm there).
X_train_vec, X_test_vec, tfidf_vectorizer = create_tfidf_features(
    X_train,
    X_test,
)
print(f"TF-IDF Feature Count: {X_train_vec.shape[1]}")

# --- 3. Model Training and Evaluation ---
# Train and score the baseline models on the TF-IDF features. The result is
# used below as a table with (at least) 'Model', 'Accuracy' and 'F1' columns.
results_df = train_and_evaluate_baselines(
    X_train_vec,
    X_test_vec,
    y_train,
    y_test,
)

print("\n--- Summary of TCGA Baseline Results (Target Match: SVM F1 ~0.83) ---")
# NOTE: DataFrame.to_markdown needs the optional 'tabulate' package.
_summary_table = results_df.loc[:, ['Model', 'Accuracy', 'F1']]
print(_summary_table.to_markdown(index=False))

# --- 4. Visualization (for Final Report) ---
# Draws the confusion matrix of the SVM baseline as an annotated heatmap.

# Select the best model (SVM, as reported).
# Reuse the results already computed in section 3 rather than training every
# baseline a second time: this halves the run time and guarantees the plotted
# matrix belongs to the same run as the printed summary table.
# NOTE(review): row 1 is assumed to be the SVM -- confirm against the row
# order returned by train_and_evaluate_baselines.
svm_model = results_df.iloc[1]
cm = svm_model['Confusion Matrix']
# NOTE(review): tick labels assume the matrix rows/columns follow the sorted
# class labels present in y_test -- confirm how the matrix is built.
class_labels = sorted(y_test.unique())

plt.figure(figsize=(15, 12))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix - SVM on TCGA (Baseline Benchmark)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()