In [17]:
import numpy  as np
import pandas as pd
from sklearn.ensemble      import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model  import LinearRegression, Ridge, Lasso
from sklearn.metrics       import mean_squared_error
from xgboost               import XGBRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# D드라이브의 SEOUL 폴더에 있는 Housing.xlsx 파일 읽기
file_path = "D:/SEOUL/Housing.xlsx"

# 첫 번째 시트 데이터 읽어서 housing에 저장
housing = pd.read_excel(file_path, sheet_name = 0)

# 데이터 확인
print(housing.head())

   id  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0   1  7420         4          2        3      yes        no       no   
1   2  8960         4          4        4      yes        no       no   
2   3  9960         3          2        2      yes        no      yes   
3   4  7500         4          2        2      yes        no      yes   
4   5  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus     price  
0              no             yes        2      yes        furnished  13300000  
1              no             yes        3       no        furnished  12250000  
2              no              no        2      yes   semi-furnished  12250000  
3              no             yes        3      yes        furnished  12215000  
4              no             yes        2       no        furnished  11410000  


In [3]:
# 데이터 정보 보기
print(housing.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
 13  price             545 non-null    int64 
dtypes: int64(7), object(7)
memory usage: 59.7+ KB
None


In [4]:
# 데이터 전처리(Data Pre-Processing) : id 열을 삭제하기
housing = housing.drop(columns = ["id"])
print(housing.head())

   area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  7420         4          2        3      yes        no       no   
1  8960         4          4        4      yes        no       no   
2  9960         3          2        2      yes        no      yes   
3  7500         4          2        2      yes        no      yes   
4  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus     price  
0              no             yes        2      yes        furnished  13300000  
1              no             yes        3       no        furnished  12250000  
2              no              no        2      yes   semi-furnished  12250000  
3              no             yes        3      yes        furnished  12215000  
4              no             yes        2       no        furnished  11410000  


In [9]:
# 데이터 전처리 : 범주형 데이터 -> One-Hot Encoding, 더미변수
# mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea, furnishingstatus

# 범주형 변수 목록
categorical_cols = [
    "mainroad", 
    "guestroom", 
    "basement", 
    "hotwaterheating", 
    "airconditioning", 
    "prefarea", 
    "furnishingstatus"
]

# One-Hot Encoding 적용
housing = pd.get_dummies(housing, columns = categorical_cols, drop_first = False)

# 결과 확인
print(housing.head())
print("\n변환 후 데이터 크기:", housing.shape)

   area  bedrooms  bathrooms  stories  parking     price  mainroad_no  \
0  7420         4          2        3        2  13300000        False   
1  8960         4          4        4        3  12250000        False   
2  9960         3          2        2        2  12250000        False   
3  7500         4          2        2        3  12215000        False   
4  7420         4          1        2        2  11410000        False   

   mainroad_yes  guestroom_no  guestroom_yes  ...  basement_yes  \
0          True          True          False  ...         False   
1          True          True          False  ...         False   
2          True          True          False  ...          True   
3          True          True          False  ...          True   
4          True         False           True  ...          True   

   hotwaterheating_no  hotwaterheating_yes  airconditioning_no  \
0                True                False               False   
1                True     

In [11]:
# 데이터 분할하기 : 훈련 데이터(70%), 평가 데이터(30%)
# y : price
# X : price를 제외한 모든 열

X = housing.drop(columns = ["price"])
y = housing["price"]

# 훈련 70%, 평가 30% 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size    = 0.3, 
                                                    random_state = 1001)

# 확인
print("X_train 크기:", X_train.shape)
print("X_test 크기:",  X_test.shape)
print("y_train 크기:", y_train.shape)
print("y_test 크기:",  y_test.shape)

X_train 크기: (381, 20)
X_test 크기: (164, 20)
y_train 크기: (381,)
y_test 크기: (164,)


In [14]:
# 예측모형

# 사용할 모델들 정의
models = {
    "Linear Regression" : LinearRegression(),
    "Ridge Regression"  : Ridge(alpha = 1.0),
    "Lasso Regression"  : Lasso(alpha = 0.01),
    "Random Forest"     : RandomForestRegressor(n_estimators = 200, random_state = 42),
    "Gradient Boosting" : GradientBoostingRegressor(n_estimators = 200, random_state = 42),
    "XGBoost"           : XGBRegressor(n_estimators = 200, random_state = 42)
}

# 성능 저장용 딕셔너리
results = {}

# 각 모델 학습 및 예측
for name, model in models.items():
    model.fit(X_train, y_train)                                 # 훈련
    y_pred        = model.predict(X_test)                       # 예측
    rmse          = np.sqrt(mean_squared_error(y_test, y_pred)) # RMSE 계산
    results[name] = rmse
    print(f"{name} RMSE: {rmse:.2f}")

# 결과를 DataFrame으로 정리
results_df = pd.DataFrame(list(results.items()), 
                          columns = ["Model", "RMSE"]).sort_values(by = "RMSE")
print("\n▶ RMSE 기준 성능 비교")
print(results_df)

Linear Regression RMSE: 1114820.37
Ridge Regression RMSE: 1115174.13
Lasso Regression RMSE: 1114820.37
Random Forest RMSE: 1084090.00
Gradient Boosting RMSE: 1127694.22
XGBoost RMSE: 1149479.37

▶ RMSE 기준 성능 비교
               Model          RMSE
3      Random Forest  1.084090e+06
0  Linear Regression  1.114820e+06
2   Lasso Regression  1.114820e+06
1   Ridge Regression  1.115174e+06
4  Gradient Boosting  1.127694e+06
5            XGBoost  1.149479e+06


In [16]:
# 예측모형의 성능을 개선하고 싶음
# (1) X 변환 : Min-Max Normalization(정규화) : X가 0~1 사이로 변경
# 1. Min-Max Scaling
scaler   = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# 2. 훈련/평가 데이터 다시 분할
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size = 0.3, random_state = 42
)

# 3. 사용할 모델 정의
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, random_state=42)
}

# 4. 성능 저장용 딕셔너리
results = {}

# 5. 각 모델 학습 및 평가
for name, model in models.items():
    model.fit(X_train_scaled, y_train)                       # 학습
    y_pred = model.predict(X_test_scaled)                    # 예측
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))       # RMSE 계산
    results[name] = rmse
    print(f"{name} RMSE: {rmse:.2f}")

# 6. 결과를 DataFrame으로 정리
results_df = pd.DataFrame(list(results.items()), columns=["Model", "RMSE"]).sort_values(by="RMSE")
print("\n▶ RMSE 기준 성능 비교 (Min-Max Scaling 적용 후)")
print(results_df)

Linear Regression RMSE: 1234106.75
Ridge Regression RMSE: 1226824.36
Lasso Regression RMSE: 1234106.75
Random Forest RMSE: 1355183.56
Gradient Boosting RMSE: 1305445.02
XGBoost RMSE: 1435543.30

▶ RMSE 기준 성능 비교 (Min-Max Scaling 적용 후)
               Model          RMSE
1   Ridge Regression  1.226824e+06
0  Linear Regression  1.234107e+06
2   Lasso Regression  1.234107e+06
4  Gradient Boosting  1.305445e+06
3      Random Forest  1.355184e+06
5            XGBoost  1.435543e+06


In [18]:
# (2) 표준화(Standardization)

# 1. 표준화 (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2. 훈련/평가 데이터 다시 분할 (70% / 30%)
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# 3. 사용할 모델 정의
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, random_state=42)
}

# 4. 성능 저장용 딕셔너리
results = {}

# 5. 각 모델 학습 및 평가
for name, model in models.items():
    model.fit(X_train_scaled, y_train)                       # 학습
    y_pred = model.predict(X_test_scaled)                    # 예측
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))       # RMSE 계산
    results[name] = rmse
    print(f"{name} RMSE: {rmse:.2f}")

# 6. 결과를 DataFrame으로 정리
results_df = pd.DataFrame(list(results.items()), columns=["Model", "RMSE"]).sort_values(by="RMSE")
print("\n▶ RMSE 기준 성능 비교 (표준화 적용 후)")
print(results_df)

Linear Regression RMSE: 1234106.75
Ridge Regression RMSE: 1233913.80
Lasso Regression RMSE: 1234106.75
Random Forest RMSE: 1356507.63
Gradient Boosting RMSE: 1304525.31
XGBoost RMSE: 1435543.30

▶ RMSE 기준 성능 비교 (표준화 적용 후)
               Model          RMSE
1   Ridge Regression  1.233914e+06
0  Linear Regression  1.234107e+06
2   Lasso Regression  1.234107e+06
4  Gradient Boosting  1.304525e+06
3      Random Forest  1.356508e+06
5            XGBoost  1.435543e+06
