In [16]:
import pandas as pd

# Load the spreadsheet that sits next to the notebook. A forward-slash relative
# path is used because ".\" is only a path separator on Windows; on other
# platforms the backslash would be treated as part of the file name.
file_path = "./公共設施權重.xlsx"
data = pd.read_excel(file_path)

# Count the missing values in each column.
missing_values = data.isnull().sum()

# Record each column's dtype to see whether any column needs categorical encoding.
data_types = data.dtypes

# A bare expression in the middle of a cell is evaluated and then discarded
# (only a cell's final expression is displayed), so print both summaries
# explicitly to make the checks visible.
print(missing_values)
print(data_types)

# Names of the spreadsheet columns used as model inputs.
FEATURE_COLUMNS = [
    "醫院總數", "學校總數", "電影院總數",
    "運動設施總數", "捷運站數", "夜市總數",
    "火車總數", "公園總數", "公車站總數",
]

# Predictors are the selected columns; the regression target is the "評論" column.
features = data[FEATURE_COLUMNS]
target = data["評論"]

# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Show the shapes of the four splits. This used to be a bare tuple expression,
# which is silently discarded when it is not the last statement of a cell, so
# it is printed explicitly instead.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create the linear regression model and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Predict on both splits so training and test performance can be compared.
y_train_pred, y_test_pred = (model.predict(split) for split in (X_train, X_test))

# Mean squared error and R-squared for each split.
train_mse, train_r2 = (
    metric(y_train, y_train_pred) for metric in (mean_squared_error, r2_score)
)
test_mse, test_r2 = (
    metric(y_test, y_test_pred) for metric in (mean_squared_error, r2_score)
)

# Report the metrics: MSE first, then R-squared.
# A smaller MSE means the predictions are closer to the observed values;
# an R-squared closer to 1 means the model predicts more accurately.
for label, value in (
    ("train_mse=", train_mse),
    ("test_mse=", test_mse),
    ("train_r2=", train_r2),
    ("test_r2=", test_r2),
):
    print(label, value)

train_mse= 18192224.142170317
test_mse= 15411401.753598796
train_r2= 0.9283725871333269
test_r2= 0.8513741955439527


In [14]:
# Pull the fitted parameters off the linear model.
intercept = model.intercept_
coefficients = model.coef_

# Print one "<feature name>: <coefficient>" line per predictor column, in
# column order, followed by the intercept; values use four decimal places.
print("模型係數:")
for column_name, weight in zip(features.columns, coefficients):
    print("{}: {:.4f}".format(column_name, weight))

print("Intercept: {:.4f}".format(intercept))

模型係數:
醫院總數: -841.0117
學校總數: 271.3277
電影院總數: 2002.2567
運動設施總數: 435.8057
捷運站數: 974.4097
夜市總數: 996.7070
火車總數: -270.0184
公園總數: -16.6129
公車站總數: 15.5534
Intercept: -278.4971


In [8]:
from sklearn.metrics import mean_squared_error, r2_score

# Calculate the R2 score and RMSE for the train and test sets.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# RMSE is the square root of MSE. The root is taken explicitly instead of
# passing `squared=False`, because scikit-learn deprecated that argument in
# version 1.4 and later removed it, so the old call fails on current releases.
# This form works on both old and new versions.
train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f"R2_score (train): {train_r2}")
print(f"R2_score (test): {test_r2}")
print(f"RMSE_score (train): {train_rmse}")
print(f"RMSE_score (test): {test_rmse}")

# Display summary of OLS regression results
import statsmodels.api as sm

R2_score (train): 0.9197039384320779
R2_score (test): 0.9468091420122076
RMSE_score (train): 4334.1669403537135
RMSE_score (test): 3252.9543363589796




In [9]:
# statsmodels' OLS does not add an intercept term on its own, so a constant
# column is added to the training predictors first.
X_train_with_const = sm.add_constant(X_train)

# Specify the regression (target = endog, predictors = exog), then fit it.
ols_spec = sm.OLS(endog=y_train, exog=X_train_with_const)
ols_model = ols_spec.fit()

# Print the full regression summary table.
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                     評論   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.887
Method:                 Least Squares   F-statistic:                     28.00
Date:                Sun, 25 Feb 2024   Prob (F-statistic):           4.94e-10
Time:                        21:58:28   Log-Likelihood:                -313.38
No. Observations:                  32   AIC:                             646.8
Df Residuals:                      22   BIC:                             661.4
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -622.4535   2066.087     -0.301      0.7