# 02. Baseline Models Implementation

## Objective
Implement baseline models to establish a performance floor and demonstrate that ML models add value over simple heuristics.

## Baseline Strategies
1. **Random Guess**: 33.3% accuracy (3 classes)
2. **Majority Class**: Predict most common class
3. **Persistence Model**: Predict that future trajectory = current trajectory
4. **Simple Rule-Based**: Use simple thresholds on key features


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

# Set plot style: apply the seaborn "whitegrid" theme and a wide default
# figure size so every chart drawn later in the notebook shares one look.
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Confirmation that the import/setup cell ran to completion.
# (The check-mark was previously stored as mis-decoded bytes and printed as garbage.)
print("✅ Libraries imported")


## 1. Load Data


In [None]:
# Read the ML-ready trajectory table from disk
df = pd.read_csv('../today/trajectory_ml_ready_advanced.csv')

# Columns removed from the feature matrix: identifiers, the year, the state
# and both target columns (keeping a target column would leak the answer).
drop_cols = ['UNITID', 'Institution_Name', 'Year', 'Target_Trajectory', 'Target_Label', 'State']
X = pd.get_dummies(
    df.drop(columns=drop_cols),
    columns=['Division'],   # the only column that is one-hot encoded here
    drop_first=True,        # drop one dummy level to avoid a redundant column
)
y = df['Target_Label'].astype(int)

# Stratified 80/20 hold-out split. random_state=42 is kept fixed so the test
# rows line up with the split used for the original model (per the notebook's
# stated intent) and the comparison is like-for-like.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report split sizes followed by the per-class counts of each split
print(f"Training Set: {len(X_train)} samples")
print(f"Test Set: {len(X_test)} samples")
for _split_name, _split_labels in (("Train", y_train), ("Test", y_test)):
    print(f"\nClass Distribution ({_split_name}):")
    print(_split_labels.value_counts().sort_index())


## 2. Baseline 1: Random Guess


In [None]:
# Uniform random baseline: every test sample gets one of the three labels
# with equal probability, ignoring the features entirely.
# NumPy's global RNG is seeded so the draw is reproducible between runs.
np.random.seed(42)
y_pred_random = np.random.choice([0, 1, 2], size=len(y_test))

# Fraction of test samples where the random draw happens to match the truth
accuracy_random = accuracy_score(y_test, y_pred_random)
print(f"Random Guess Accuracy: {accuracy_random:.4f} ({accuracy_random*100:.2f}%)")
print("Expected: ~33.33% for 3 classes")

# Per-class precision / recall / F1; labels 0, 1, 2 map to the names below
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred_random,
        target_names=['Declining', 'Stable', 'Improving'],
    )
)


## 3. Baseline 2: Majority Class


In [None]:
# Majority-class baseline: always answer with the label that occurs most
# often in the training split (the test labels are never consulted).
majority_class = y_train.mode()[0]
y_pred_majority = np.full(shape=len(y_test), fill_value=majority_class)

accuracy_majority = accuracy_score(y_test, y_pred_majority)

# Human-readable name of the chosen label: 0 -> Declining, 1 -> Stable,
# anything else -> Improving.
majority_name = {0: 'Declining', 1: 'Stable'}.get(majority_class, 'Improving')
print(f"Majority Class: {majority_class} ({majority_name})")
print(f"Majority Class Accuracy: {accuracy_majority:.4f} ({accuracy_majority*100:.2f}%)")

# Per-class metrics; classes that are never predicted score zero here
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred_majority,
        target_names=['Declining', 'Stable', 'Improving'],
    )
)


## 4. Baseline 3: Persistence Model


In [None]:
# Persistence baseline: predict, for each test sample, the label the same
# institution (UNITID) had in the previous year (Year - 1).
# NOTE(review): the printed description says "future = current trajectory",
# but what is actually used is the prior year's Target_Label -- confirm this
# matches the intended definition of "current".

# Re-load the full dataset because UNITID and Year were dropped from X
df_full = pd.read_csv('../today/trajectory_ml_ready_advanced.csv')
df_full['Target_Label'] = df_full['Target_Label'].astype(int)

# Build a (UNITID, Year) -> Target_Label lookup once instead of scanning the
# whole frame for every test sample. keep='first' mirrors taking the first
# matching row when an institution/year pair appears more than once.
_first_per_unit_year = df_full.drop_duplicates(subset=['UNITID', 'Year'], keep='first')
_label_by_unit_year = dict(
    zip(
        zip(_first_per_unit_year['UNITID'], _first_per_unit_year['Year']),
        _first_per_unit_year['Target_Label'],
    )
)

# y_test.index holds index *labels*, so rows must be fetched with .loc;
# positional .iloc is only correct while the frame has a default RangeIndex.
test_indices = y_test.index
_test_keys = df_full.loc[test_indices, ['UNITID', 'Year']]

# Previous year's label when it exists, otherwise fall back to the majority class
y_pred_persistence = np.array([
    _label_by_unit_year.get((unitid, year - 1), majority_class)
    for unitid, year in zip(_test_keys['UNITID'], _test_keys['Year'])
])

accuracy_persistence = accuracy_score(y_test, y_pred_persistence)
print(f"Persistence Model Accuracy: {accuracy_persistence:.4f} ({accuracy_persistence*100:.2f}%)")
print("(Predicts that future trajectory = current trajectory)")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_persistence,
                            target_names=['Declining', 'Stable', 'Improving']))


## 5. Baseline 4: Simple Rule-Based Model


In [None]:
# Simple rule: Use key features to make predictions
# Rules based on domain knowledge:
# - If revenue growth is high and efficiency is good → Improving
# - If revenue is declining or expenses growing fast → Declining
# - Otherwise → Stable

def simple_rule_predict(row):
    """Classify one sample with fixed thresholds on three features.

    Args:
        row: Any mapping-like object exposing ``.get(key, default)``
            (for example a ``pandas.Series`` or a ``dict``). Missing keys
            fall back to neutral defaults: 0 for both growth rates and
            1.0 for efficiency.

    Returns:
        2 (Improving), 0 (Declining) or 1 (Stable).
    """
    rev_growth = row.get('Revenue_Growth_1yr', 0)
    exp_growth = row.get('Expense_Growth_1yr', 0)
    eff_mean = row.get('Efficiency_Mean_2yr', 1.0)

    # Improving takes priority: revenue up more than 5% AND efficiency above 1.0
    looks_improving = rev_growth > 0.05 and eff_mean > 1.0
    if looks_improving:
        return 2

    # Declining: revenue down more than 2%, or expenses outpacing revenue
    # by more than 5 percentage points
    looks_declining = rev_growth < -0.02 or (exp_growth - rev_growth) > 0.05

    # Everything that matches neither rule is treated as Stable
    return 0 if looks_declining else 1

# Run the hand-written rules over every test row (axis=1 passes one row at a
# time) and keep the resulting labels as a plain array.
y_pred_rules = X_test.apply(simple_rule_predict, axis=1).values

# Share of test rows where the rule-based label equals the true label
accuracy_rules = accuracy_score(y_test, y_pred_rules)
print(f"Simple Rule-Based Accuracy: {accuracy_rules:.4f} ({accuracy_rules*100:.2f}%)")

# Per-class precision / recall / F1 for the rule-based predictions
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred_rules,
        target_names=['Declining', 'Stable', 'Improving'],
    )
)


## 6. Compare All Baselines


In [None]:
# Gather every baseline's name, accuracy and raw predictions in one place.
# The prediction arrays are kept here (not in the table) for later inspection.
baseline_results = {
    'Baseline': ['Random Guess', 'Majority Class', 'Persistence', 'Simple Rules'],
    'Accuracy': [accuracy_random, accuracy_majority, accuracy_persistence, accuracy_rules],
    'Predictions': [y_pred_random, y_pred_majority, y_pred_persistence, y_pred_rules],
}

# Summary table with only the scalar columns
baseline_df = pd.DataFrame({
    'Baseline': baseline_results['Baseline'],
    'Accuracy': baseline_results['Accuracy'],
})

print("=" * 60)
print("BASELINE MODELS COMPARISON")
print("=" * 60)
print(baseline_df.to_string(index=False))

# Find best baseline. idxmax() returns an index *label*, so the row is
# fetched with .loc rather than positional .iloc.
best_baseline_idx = baseline_df['Accuracy'].idxmax()
best_baseline = baseline_df.loc[best_baseline_idx]
print(f"\n✅ Best Baseline: {best_baseline['Baseline']} with {best_baseline['Accuracy']:.4f} accuracy")

# Visualize: one bar per baseline plus a dashed reference line at chance level
plt.figure(figsize=(10, 6))
bars = plt.bar(baseline_df['Baseline'], baseline_df['Accuracy'],
               color=['#e74c3c', '#3498db', '#2ecc71', '#f39c12'])
# Expected accuracy of uniform guessing over 3 classes is exactly 1/3
plt.axhline(y=1 / 3, color='r', linestyle='--', label='Random (33.3%)')
plt.ylabel('Accuracy')
plt.title('Baseline Models Performance Comparison')
plt.ylim(0, 1)
plt.legend()
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# Save before show(): some backends clear the figure once it is displayed
plt.savefig('baseline_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n📊 Chart saved as 'baseline_comparison.png'")


## 7. Compare with ML Model Performance


In [None]:
# Load the saved ML model and compare it against the four baselines on the
# same held-out test split.
import joblib

# Only the load itself is guarded: a FileNotFoundError raised later (for
# example while saving the chart) must not be reported as "model not found".
# NOTE: joblib.load unpickles arbitrary objects -- only load trusted files.
try:
    ml_model = joblib.load('../today/models/final_trajectory_model.joblib')
except FileNotFoundError:
    print("⚠️ ML model file not found. Skipping ML comparison.")
    print("   Run this after the model has been trained and saved.")
else:
    # Get predictions from the ML model.
    # NOTE(review): assumes the saved model accepts X_test exactly as built
    # above (including the one-hot Division columns) -- confirm against the
    # training notebook.
    y_pred_ml = ml_model.predict(X_test)
    y_prob_ml = ml_model.predict_proba(X_test)

    accuracy_ml = accuracy_score(y_test, y_pred_ml)

    # One-vs-rest ROC-AUC is optional: report why it is unavailable instead
    # of silently swallowing every exception.
    try:
        roc_auc_ml = roc_auc_score(y_test, y_prob_ml, multi_class='ovr')
    except ValueError as exc:
        roc_auc_ml = None
        print(f"ROC-AUC could not be computed: {exc}")

    print("=" * 60)
    print("ML MODEL vs BASELINES")
    print("=" * 60)

    comparison = pd.DataFrame({
        'Model': ['Random Guess', 'Majority Class', 'Persistence', 'Simple Rules', 'XGBoost (ML)'],
        'Accuracy': [accuracy_random, accuracy_majority, accuracy_persistence, accuracy_rules, accuracy_ml]
    })

    # Explicit None check: the score is a float, so truthiness is the wrong test
    if roc_auc_ml is not None:
        # Baselines produce hard labels only, so they have no ROC-AUC entry
        comparison['ROC-AUC'] = [None, None, None, None, roc_auc_ml]

    print(comparison.to_string(index=False))

    # Calculate improvement over the best baseline (absolute and relative)
    best_baseline_acc = max(accuracy_random, accuracy_majority, accuracy_persistence, accuracy_rules)
    improvement = accuracy_ml - best_baseline_acc
    # Guard the relative figure against a zero-accuracy best baseline
    if best_baseline_acc:
        improvement_pct = (improvement / best_baseline_acc) * 100
    else:
        improvement_pct = float('nan')

    print("\n📈 ML Model Improvement:")
    print(f"   Best Baseline: {best_baseline_acc:.4f}")
    print(f"   ML Model: {accuracy_ml:.4f}")
    # The '+' format flag prints the correct sign even when the ML model is worse
    print(f"   Improvement: {improvement:+.4f} ({improvement_pct:+.2f}%)")

    if roc_auc_ml is not None:
        print(f"   ROC-AUC: {roc_auc_ml:.4f}")

    # Visualize comparison: one bar per model plus the chance-level reference
    plt.figure(figsize=(12, 6))
    models = comparison['Model'].values
    accuracies = comparison['Accuracy'].values

    bars = plt.bar(models, accuracies, color=['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6'])
    # Expected accuracy of uniform guessing over 3 classes is exactly 1/3
    plt.axhline(y=1 / 3, color='r', linestyle='--', alpha=0.5, label='Random (33.3%)')
    plt.ylabel('Accuracy')
    plt.title('ML Model vs Baseline Models')
    plt.ylim(0, 1)
    plt.xticks(rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.savefig('ml_vs_baselines.png', dpi=150, bbox_inches='tight')
    plt.show()
