In [3]:
import warnings
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the listings CSV.
df = pd.read_csv("./lianjia_data.csv")

# Silence only FutureWarning (deprecation notices from library upgrades).
# Other warning categories stay visible so genuine problems are not hidden.
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Preprocessing ---
# "price": strip the "元/平" unit and the thousands separators, cast to float.
# regex=False makes the literal replacement explicit and independent of the
# pandas-version-specific default for `regex`.
df["price"] = (
    df["price"]
    .str.replace("元/平", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

# "area": strip the "平米" unit and cast to float.
df["area"] = df["area"].str.replace("平米", "", regex=False).astype(float)

# "build_time": pull out the first 4-digit run as the construction year.
# expand=False returns a Series (not a one-column DataFrame), which is the
# right shape for a single-column assignment.
df["build_year"] = (
    df["build_time"].str.extract(r"(\d{4})", expand=False).astype(float)
)

# "house_types": extract the room count and hall count, e.g. "2室1厅".
df["rooms"] = df["house_types"].str.extract(r"(\d+)室", expand=False).astype(float)
df["halls"] = df["house_types"].str.extract(r"(\d+)厅", expand=False).astype(float)

# Fill missing construction years with the median.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form operates on a temporary under pandas
# Copy-on-Write and would leave df unchanged, so those rows would be dropped
# by dropna() below instead of being imputed.
df["build_year"] = df["build_year"].fillna(df["build_year"].median())

# 1. Build the feature matrix and the target.
# Features: area, room count, hall count, total price, construction year.
# NOTE(review): "total_price" presumably equals price-per-square-metre times
# area; if so, using it to predict "price" is target leakage and the reported
# errors are optimistic. Kept here to preserve the original feature set;
# confirm and drop it if the model must work without a known total price.
features = ["area", "rooms", "halls", "total_price", "build_year"]
df_model = df[features + ["price"]].dropna()  # drop rows with any missing value

X = df_model[features]  # feature data
y = df_model["price"]   # target: price per square metre

# 2. Split into training and test sets (80% train, 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Create and train the XGBoost regressor.
model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# 4. Predict on the held-out test set.
y_pred = model.predict(X_test)

# 5. Compute error metrics.
mae = mean_absolute_error(y_test, y_pred)
# RMSE is taken as the square root of the MSE returned by scikit-learn.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"平均绝对误差 MAE: {mae:.2f}")
print(f"均方根误差 RMSE: {rmse:.2f}")

平均绝对误差 MAE: 897.33
均方根误差 RMSE: 1658.02
