In [21]:
import pandas as pd

# Load the housing data from the CSV file that sits next to this notebook.
df = pd.read_csv("course-5-boston.csv")

# Predictor columns: the target is predicted from these three columns.
features = df[["crim", "rm", "lstat"]]
# Target column. Selecting with a single label yields a 1-D Series, which is
# consistent with the later `target = df["medv"]` cell; `df[["medv"]]` would
# yield a one-column DataFrame and therefore a 2-D y downstream.
target = df["medv"]
features.describe()

Unnamed: 0,crim,rm,lstat
count,506.0,506.0,506.0
mean,3.593761,6.284634,12.653063
std,8.596783,0.702617,7.141062
min,0.00632,3.561,1.73
25%,0.082045,5.8855,6.95
50%,0.25651,6.2085,11.36
75%,3.647423,6.6235,16.955
max,88.9762,8.78,37.97


In [25]:
# Select the target column with a single label so that `target` is a Series.
target = df["medv"]  # target values
target.describe()

count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: medv, dtype: float64

In [26]:
# Row position that separates the first 70% of the rows from the remainder.
split_num = int(len(features) * 0.7)

# The split is purely positional (no shuffling is applied):
# rows before the split position form the training features / training target ...
X_train, y_train = features.iloc[:split_num], target.iloc[:split_num]
# ... and rows from the split position to the end form the test features / test target.
X_test, y_test = features.iloc[split_num:], target.iloc[split_num:]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((354, 3), (354,), (152, 3), (152,))

In [27]:
from sklearn.linear_model import LinearRegression


# Create the estimator and fit it on the training split in one expression
# (`fit` returns the estimator itself, so the result can be bound directly).
model = LinearRegression().fit(X_train, y_train)

# Show the fitted parameters: the intercept followed by the coefficient array.
model.intercept_, model.coef_

(-38.000969889690325, array([ 0.69979497, 10.13564218, -0.20532653]))

In [28]:
# Feed the test-set features to the fitted model to obtain predictions.
preds = model.predict(X_test)
preds  # display the predicted values

array([17.77439141, 21.09512448, 27.63412265, 26.78577951, 25.38313368,
       24.3286313 , 28.4257879 , 25.12834727, 16.82806601, 20.76498858,
       52.3350748 , -0.18169806, 12.01475786,  7.87878077, 15.13155699,
       32.93748235, 37.07872049, 29.50613719, 25.50800832, 12.35867972,
        9.08901644, 47.08374238, 35.31759193, 33.3738765 , 38.34913316,
       33.10414639, 91.3556125 , 35.11735022, 19.69326952, 18.49805269,
       14.03767555, 20.9235166 , 20.41406182, 21.92218226, 15.20451678,
       18.05362998, 21.26289453, 23.18192502, 15.87149504, 27.70381826,
       27.65958772, 30.17151829, 27.04987446, 21.52730227, 37.82614512,
       22.09872387, 34.71166346, 32.07959454, 29.45253042, 29.51137956,
       41.49935191, 62.4121152 , 13.64508882, 24.71242033, 18.69151684,
       37.4909413 , 54.05864658, 34.94758034, 15.01355249, 30.17849355,
       32.22191275, 33.90252834, 33.02530285, 28.4416789 , 69.60201087,
       34.7617152 , 31.65353442, 24.5644437 , 24.78130285, 24.00

In [33]:
import numpy as np

# Mean absolute error (MAE): the average of |expected - predicted| over all test samples.
def mae_solver(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the mean absolute error between targets and predictions.

    Both inputs are flattened to 1-D before they are compared, so a column
    vector of shape (n, 1) and a flat vector of shape (n,) are treated as the
    same n samples instead of being broadcast against each other into an
    (n, n) matrix.

    Args:
        y_true: Expected values (any array-like of numbers).
        y_pred: Predicted values; must hold as many elements as ``y_true``.

    Returns:
        The mean of the element-wise absolute differences, as a scalar.

    Raises:
        ValueError: If the inputs hold a different number of elements, or
            if they are empty.
    """
    true_flat = np.asarray(y_true, dtype=float).ravel()
    pred_flat = np.asarray(y_pred, dtype=float).ravel()
    if true_flat.size != pred_flat.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {true_flat.size} and {pred_flat.size}"
        )
    if true_flat.size == 0:
        raise ValueError("mae_solver needs at least one sample")
    # np.mean does the reduction in NumPy rather than a Python-level loop.
    return np.mean(np.abs(true_flat - pred_flat))

def mse_solver(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the mean squared error between targets and predictions.

    Both inputs are flattened to 1-D before they are compared, so a column
    vector of shape (n, 1) and a flat vector of shape (n,) are treated as the
    same n samples instead of being broadcast against each other into an
    (n, n) matrix.

    Args:
        y_true: Expected values (any array-like of numbers).
        y_pred: Predicted values; must hold as many elements as ``y_true``.

    Returns:
        The mean of the element-wise squared differences, as a scalar.

    Raises:
        ValueError: If the inputs hold a different number of elements, or
            if they are empty.
    """
    true_flat = np.asarray(y_true, dtype=float).ravel()
    pred_flat = np.asarray(y_pred, dtype=float).ravel()
    if true_flat.size != pred_flat.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {true_flat.size} and {pred_flat.size}"
        )
    if true_flat.size == 0:
        raise ValueError("mse_solver needs at least one sample")
    # np.mean does the reduction in NumPy rather than a Python-level loop.
    return np.mean(np.square(true_flat - pred_flat))
    
# Evaluate the hand-written metrics on the test split; the underlying array
# of the test target is extracted once and shared by both calls.
y_test_values = y_test.values
mae = mae_solver(y_test_values, preds)
mse = mse_solver(y_test_values, preds)

print("MAE: ", mae)
print("MSE: ", mse)

MAE:  13.022063072780304
MSE:  303.83312472236486


In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 库中现成的函数
# The same two metrics computed with scikit-learn's ready-made functions,
# used here to cross-check the hand-written solvers.
mae_ = mean_absolute_error(y_true=y_test, y_pred=preds)
mse_ = mean_squared_error(y_true=y_test, y_pred=preds)

print("scikit-learn MAE: ", mae_)
print("scikit-learn MSE: ", mse_)

scikit-learn MAE:  13.022063072780306
scikit-learn MSE:  303.83312472236486


In [None]:
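# The mean of all target values is about 22.5,
# while the model's mean absolute error on the test set is about 13, so the prediction error is very large.

# Likely reasons: the data was not preprocessed, and only 3 arbitrarily chosen features were used, so the other features the dataset provides were not put to good use.
# The train/test split is also positional (first 70% / last 30%, no shuffling), so the two parts may not follow the same distribution.
# Finally, linear regression describes the data through a linear relationship, but house prices cannot really be captured by a simple linear relationship.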