<a href="https://colab.research.google.com/github/rmkenv/newsletter/blob/main/substack2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pandas numpy scikit-learn matplotlib



In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Synthetic energy-consumption dataset.
# All randomness comes from NumPy's global RNG, so the ORDER of the
# np.random.* calls below is part of the result; do not reorder them.
np.random.seed(42)
n_samples = 1000

# Raw features: Gaussian weather readings, uniformly drawn integer time fields.
temperature = np.random.normal(loc=20, scale=8, size=n_samples)
humidity = np.random.normal(loc=50, scale=15, size=n_samples)
hour_of_day = np.random.randint(0, 24, size=n_samples)  # integers 0..23
day_of_week = np.random.randint(0, 7, size=n_samples)   # integers 0..6

# Constant load added to every sample.
base_load = 50

# Piecewise-linear temperature response: 2 units per degree below 18,
# 3 units per degree above 22, and nothing inside the 18..22 band.
temp_effect = np.select(
    [temperature < 18, temperature > 22],
    [2 * (18 - temperature), 3 * (temperature - 22)],
    default=0,
)

# Sinusoidal daily profile, switched on only for hours 7..19.
# NOTE(review): the sine half-wave is positive for hours 7..17, zero at 18 and
# negative at 19, so hour 19 gets a small negative contribution - confirm that
# the upper bound of 20 (rather than 18) is intended.
active_hours = (hour_of_day > 6) & (hour_of_day < 20)
hour_effect = 20 * np.sin((hour_of_day - 6) * np.pi / 12) * active_hours

# Days 5 and 6 are treated as the weekend and carry a flat reduction.
is_weekend = day_of_week >= 5
weekend_effect = -15 * is_weekend

# Target = deterministic components + Gaussian noise (sd 5).
# Humidity is generated as a feature but does not enter this formula.
noise = np.random.normal(0, 5, n_samples)
energy_consumption = base_load + temp_effect + hour_effect + weekend_effect + noise

# Collect features and target into a single table, one row per sample.
df = pd.DataFrame(dict(
    temperature=temperature,
    humidity=humidity,
    hour_of_day=hour_of_day,
    day_of_week=day_of_week,
    energy_consumption=energy_consumption,
))

# Split the table into predictors (the four raw columns) and the target.
feature_columns = ['temperature', 'humidity', 'hour_of_day', 'day_of_week']
X = df[feature_columns]
y = df['energy_consumption']

# Hold out 30% of the rows for evaluation; the fixed random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardise the features for the neural network. The scaler learns its
# statistics from the training rows only and is then applied unchanged to
# both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors, keyed by the display name used in tables and plots.
# Insertion order determines the order in which they are fitted and reported.
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
models['Neural Network'] = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 units, then 50
    max_iter=500,
    random_state=42,
)

# Fit every candidate on the training split and score it on the held-out rows.
results = []

for name, model in models.items():
    # The neural network is trained and evaluated on the standardised
    # features; every other model sees the raw feature values.
    wants_scaled_input = name == 'Neural Network'
    train_features = X_train_scaled if wants_scaled_input else X_train
    test_features = X_test_scaled if wants_scaled_input else X_test

    model.fit(train_features, y_train)
    predictions = model.predict(test_features)

    # One summary row per model: mean absolute error and R² on the test set.
    results.append({
        'Model': name,
        'MAE': mean_absolute_error(y_test, predictions),
        'R² Score': r2_score(y_test, predictions),
    })

# Tabulate the scores and print them under a heading.
results_df = pd.DataFrame(results)
print("Model Performance Comparison:", results_df, sep="\n")

# Four-panel summary figure: the two error metrics on the top row, Random
# Forest feature importances and the best model's fit on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
ax_mae, ax_r2, ax_importance, ax_fit = axes.flat

# Top row: one bar chart per metric, one bar per model.
metric_panels = (
    (ax_mae, 'MAE', 'Mean Absolute Error (Lower is Better)'),
    (ax_r2, 'R² Score', 'R² Score (Higher is Better)'),
)
for panel, metric, panel_title in metric_panels:
    panel.bar(results_df['Model'], results_df[metric])
    panel.set_title(panel_title)
    # Tilt the model names on the x-axis by 45 degrees.
    for tick_label in panel.get_xticklabels():
        tick_label.set_rotation(45)
    panel.set_ylabel(metric)

# Bottom-left: feature importances reported by the fitted Random Forest,
# sorted ascending so the largest bar is drawn last in the horizontal chart.
rf_model = models['Random Forest']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = feature_importance.sort_values('importance', ascending=True)

ax_importance.barh(feature_importance['feature'], feature_importance['importance'])
ax_importance.set_title('Random Forest Feature Importance')
ax_importance.set_xlabel('Importance')

# Bottom-right: actual vs. predicted for the model with the lowest MAE.
best_row = results_df['MAE'].idxmin()
best_model_name = results_df.loc[best_row, 'Model']
best_model = models[best_model_name]

# The neural network must be fed the standardised test features it was
# trained on; the other models take the raw ones.
best_inputs = X_test_scaled if best_model_name == 'Neural Network' else X_test
best_predictions = best_model.predict(best_inputs)

ax_fit.scatter(y_test, best_predictions, alpha=0.6)
# Dashed red diagonal marks perfect prediction across the observed target range.
actual_range = [y_test.min(), y_test.max()]
ax_fit.plot(actual_range, actual_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Energy Consumption')
ax_fit.set_ylabel('Predicted Energy Consumption')
ax_fit.set_title(f'Best Model: {best_model_name}')

fig.tight_layout()
plt.show()

# Print rule-of-thumb guidance derived from the R² scores in results_df.
print("\nAlgorithm Selection Guide:")
print("=" * 40)


def _r2_of(model_name):
    """Return the R² score of the first results_df row for ``model_name``."""
    is_match = results_df['Model'] == model_name
    return results_df.loc[is_match, 'R² Score'].iloc[0]


# Linear regression: an R² above 0.8 is taken as "the linear fit is adequate".
linear_fits_well = _r2_of('Linear Regression') > 0.8
if not linear_fits_well:
    print(
        "❌ Linear Regression struggles - you have non-linear relationships",
        "   → Consider more complex models",
        sep="\n",
    )
else:
    print(
        "✅ Linear Regression performs well - your relationships are mostly linear!",
        "   → Use for: Regulatory reporting, interpretable models",
        sep="\n",
    )

# A model "wins" when its R² equals the column maximum. Exact equality is
# used because both sides are read from the same column; on a tie both
# messages are printed.
rf_score = _r2_of('Random Forest')
if rf_score == results_df['R² Score'].max():
    print(
        "🌟 Random Forest is your best bet!",
        "   → Use for: Feature importance, robust predictions, mixed data types",
        sep="\n",
    )

nn_score = _r2_of('Neural Network')
if nn_score == results_df['R² Score'].max():
    print(
        "🧠 Neural Network wins - you have complex patterns!",
        "   → Use for: Complex non-linear relationships, large datasets",
        sep="\n",
    )
