## 1. Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Installs a blanket 'ignore' filter: from this point on every warning is
# suppressed, including deprecation notices raised by the libraries below.
# NOTE(review): consider narrowing the filter (by category or module) so that
# API-removal warnings stay visible.
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Notebook-wide plotting defaults: seaborn's white grid theme and a
# 12x6 inch default figure size for every matplotlib figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Setup complete")

## 2. Load Processed Data

In [None]:
# Load the artefacts written by the previous notebook.
df_raw = pd.read_csv('data/processed/behavioral_features_raw.csv')
df_standardized = pd.read_csv('data/processed/behavioral_features_standardized.csv')

# Standardized design matrices for the train / test split.
X_train = pd.read_csv('data/processed/X_train_standardized.csv')
X_test = pd.read_csv('data/processed/X_test_standardized.csv')

# The targets are needed as Series, not DataFrames.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# pandas 2.0 (it raises TypeError there). Squeezing the columns axis after
# loading is the documented replacement: a single-column frame becomes a Series.
y_train = pd.read_csv('data/processed/y_train.csv').squeeze('columns')
y_test = pd.read_csv('data/processed/y_test.csv').squeeze('columns')

# Quick sanity report on what was loaded.
print(f"Loaded training data: {X_train.shape}")
print(f"Loaded test data: {X_test.shape}")
print("\nTarget variable statistics (train):")
print(f"  Mean: {y_train.mean():.3f}")
print(f"  SD: {y_train.std():.3f}")
print(f"  Range: {y_train.min():.3f} - {y_train.max():.3f}")

## 3. Multiple Linear Regression (OLS) with Full Diagnostics

# Table 1: one row per fitted OLS term with its estimate, standard error,
# t-statistic, p-value and confidence bounds taken from `ols_model`.
def _significance_stars(p_value):
    """Return '***', '**' or '*' for p < 0.001 / 0.01 / 0.05, otherwise ''."""
    if p_value < 0.001:
        return '***'
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    # Reached for p >= 0.05 and also for NaN, since every comparison above
    # is False for NaN.
    return ''


ci_bounds = ols_model.conf_int()  # column 0 = lower bound, column 1 = upper bound

coef_summary = pd.DataFrame({
    'Feature': ols_model.params.index,
    'Coefficient': ols_model.params.values,
    'Std Error': ols_model.bse.values,
    't-statistic': ols_model.tvalues.values,
    'p-value': ols_model.pvalues.values,
    '95% CI Lower': ci_bounds[0].values,
    '95% CI Upper': ci_bounds[1].values,
})
coef_summary['Significant'] = coef_summary['p-value'].apply(_significance_stars)

print("\nTable 1: OLS Regression Coefficients")
print("=" * 120)
print(coef_summary.to_string(index=False))
print("\nSignificance codes: *** p<0.001, ** p<0.01, * p<0.05")

# Persist the table next to the other analysis outputs.
ols_coef_path = 'behavioral_analysis/results/04_ols_coefficients.csv'
coef_summary.to_csv(ols_coef_path, index=False)
print(f"\n✓ Saved: {ols_coef_path}")

# Held-out performance of the three fitted models, computed from their
# test-set predictions against `y_test`.
def _test_metrics(y_true, y_pred):
    """Return (R², RMSE, MAE) for one set of predictions."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae


# The individual result names are kept at module level so later cells can
# still refer to them.
r2_ols_test, rmse_ols, mae_ols = _test_metrics(y_test, y_pred_test)
r2_ridge, rmse_ridge, mae_ridge = _test_metrics(y_test, y_pred_ridge_test)
r2_lasso, rmse_lasso, mae_lasso = _test_metrics(y_test, y_pred_lasso_test)

# Table 3: one row per model, one column per metric.
comparison = pd.DataFrame({
    'Model': ['OLS', 'Ridge', 'Lasso'],
    'R² (Test)': [r2_ols_test, r2_ridge, r2_lasso],
    'RMSE': [rmse_ols, rmse_ridge, rmse_lasso],
    'MAE': [mae_ols, mae_ridge, mae_lasso],
})

print("\nTable 3: Model Performance Comparison")
print("=" * 80)
print(comparison.to_string(index=False))

# Persist the comparison next to the other analysis outputs.
comparison_path = 'behavioral_analysis/results/05_model_comparison.csv'
comparison.to_csv(comparison_path, index=False)
print(f"\n✓ Saved: {comparison_path}")