In [1]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
import statsmodels.api as sm  # 这就是我们要用的软件包
from sklearn.metrics import mean_squared_error, r2_score # 用来计算测试集指标

# --- 1. Load data ---
# (same as before)
# Fetch dataset id 165 (concrete compressive strength) from the UCI ML
# Repository and keep the feature and target tables as DataFrames.
concrete_compressive_strength = fetch_ucirepo(id=165)
X_df = concrete_compressive_strength.data.features
y_df = concrete_compressive_strength.data.targets

# --- 2. Split the data strictly as required ---
# (same as before)
# Rows at positions TEST_START .. TEST_STOP - 1 form the test set; every other
# row is used for training.
TEST_START, TEST_STOP = 500, 630
test_indices = list(range(TEST_START, TEST_STOP))
# A set makes each membership check constant-time, so the filter below is
# linear in the number of rows instead of rescanning the test list per row.
_test_index_set = set(test_indices)
train_indices = [i for i in range(len(X_df)) if i not in _test_index_set]

# Work with the original (raw) DataFrames.
X_train_raw_df = X_df.iloc[train_indices]
y_train_raw_df = y_df.iloc[train_indices]

X_test_raw_df = X_df.iloc[test_indices]
y_test_raw_df = y_df.iloc[test_indices]

# Sanity check: the two partitions together must account for every row.
assert len(X_train_raw_df) + len(X_test_raw_df) == len(X_df), (
    "train/test split does not cover the whole dataset"
)

print(f"训练集大小: {len(X_train_raw_df)} 样本")
print(f"测试集大小: {len(X_test_raw_df)} 样本")

# --- 3. Prepare data (Part B - Set 2: Raw/Raw) ---
# "Use the raw predictor values, and raw response values."

# **Important**: sm.OLS does not add an intercept on its own, so a 'const'
# column (the intercept b) has to be added by hand.
# has_constant="add" forces that column to be added. With the default
# ("skip"), add_constant silently omits it whenever some feature happens to be
# constant within the subset it is given, which would leave the test design
# matrix with different columns than the one the model was fitted on.
X_train_with_const = sm.add_constant(X_train_raw_df, has_constant="add")
X_test_with_const = sm.add_constant(X_test_raw_df, has_constant="add")

# Fail fast if the two design matrices are not aligned column-for-column;
# model.predict relies on the column order matching the fitted parameters.
assert list(X_train_with_const.columns) == list(X_test_with_const.columns), (
    "train and test design matrices have different columns"
)

# y is left unchanged (raw response values).
y_train = y_train_raw_df

# --- 4. Fit the multivariable model ---
# "Using a regression analysis function from a software package"
# OLS = Ordinary Least Squares.
model = sm.OLS(y_train, X_train_with_const).fit()

# --- 5. Collect results ---

# Print the full statistical summary (it shows the p-values used for Part B
# of the PDF).
print(model.summary())

# --- 5a. Training-set metrics ---
# "What Variance Explained (or R-squared) value... on the training data?"
# The training R-squared is read directly from the fitted results object.
r2_train = model.rsquared

# "What MSE value does this model achieve on the training data?"
y_train_pred = model.predict(X_train_with_const)
mse_train = mean_squared_error(y_train, y_train_pred)

# --- 5b. Test-set metrics ---
# The fitted results only carry training statistics, so both test metrics are
# computed with sklearn from the out-of-sample predictions.
y_test_pred = model.predict(X_test_with_const)
# "On the testing data?" (MSE)
mse_test = mean_squared_error(y_test_raw_df, y_test_pred)
# "On the testing data?" (R-Squared)
r2_test = r2_score(y_test_raw_df, y_test_pred)

# --- 6. Print the 4 values that need to be filled in ---
# Banner: blank line, rule, title, rule.
print("\n" + "="*50, "Q1.1 Performance (Raw/Raw Model) 结果:", "="*50, sep="\n")

# One "question / value" pair per metric, emitted in the order they are asked.
report_lines = [
    f"What MSE value does this model achieve on the training data?\n{mse_train}",
    f"On the testing data?\n{mse_test}",
    f"What Variance Explained (or R-squared) value does this model achieve on the training data?\n{r2_train}",
    f"On the testing data?\n{r2_test}",
]
print("\n".join(report_lines))

训练集大小: 900 样本
测试集大小: 130 样本
                                  OLS Regression Results                                 
Dep. Variable:     Concrete compressive strength   R-squared:                       0.624
Model:                                       OLS   Adj. R-squared:                  0.621
Method:                            Least Squares   F-statistic:                     184.9
Date:                           Fri, 17 Oct 2025   Prob (F-statistic):          1.84e-183
Time:                                   23:57:40   Log-Likelihood:                -3367.6
No. Observations:                            900   AIC:                             6753.
Df Residuals:                                891   BIC:                             6796.
Df Model:                                      8                                         
Covariance Type:                       nonrobust                                         
                         coef    std err          t      P>|t|      [0.0