# 워싱턴 킹 카운티 주택 거래 가격

# 

## pandas의 read_csv()를 이용해 훈련셋 읽기

import numpy as np
import pandas as pd

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)
df.info()

## 예측에 필요 없다고 판단되는 열을 삭제, 타겟값을 가진 부분으로 분리

housing_prepared = df.drop(["price", "id","date", "long", "lat", "zipcode"], axis=1)
housing_labels = df["price"].copy()

housing_prepared

housing_labels

## k-폴드 CV의 평균을 계산하기 위한 함수를 정의

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

## 선형회귀 모델을 사용

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

housing_prepared = df.drop(["price", "id","date", "long", "lat", "zipcode"], axis=1)
housing_labels = df["price"].copy()

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# pycaret의 setup 함수를 사용하여 데이터셋 설정
reg_setup = setup(data=df, target='price', session_id=123)

# compare_models 함수를 사용하여 모든 가능한 모델을 비교
best_model = compare_models(sort='RMSE', n_select=1)

# 최적 모델 튜닝 및 결과 출력
tuned_model = tune_model(best_model, optimize='RMSE', n_iter=200)

# 모델 세부 정보 및 튜닝된 하이퍼파라미터 확인
print(tuned_model)

tuned_model_scores = cross_val_score(tuned_model, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
tuned_model_scores_rmse_scores = np.sqrt(-tuned_model_scores)
display_scores(tuned_model_scores_rmse_scores)

import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

housing_prepared = df.drop(["price", "id","date", "long", "lat", "zipcode"], axis=1)
housing_labels = df["price"].copy()

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# pycaret의 setup 함수를 사용하여 데이터셋 설정
reg_setup = setup(data=df, target='price', session_id=123)

# compare_models 함수를 사용하여 모든 가능한 모델을 비교
best_model = compare_models(sort='RMSE', n_select=15)

top15 = [rank for rank in best_model]

for top in top15:
    print("----------------------------------------------------------------")
   
    # 최적 모델 튜닝 및 결과 출력
    tuned_model = tune_model(top, optimize='RMSE', n_iter=300)

    # 모델 세부 정보 및 튜닝된 하이퍼파라미터 확인
    print(tuned_model)

    tuned_model_scores = cross_val_score(tuned_model, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    tuned_model_scores_rmse_scores = np.sqrt(-tuned_model_scores)
    display_scores(tuned_model_scores_rmse_scores)
   
    print("----------------------------------------------------------------")

import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

housing_prepared = df.drop(["price", "id","date", "long", "lat", "zipcode"], axis=1)
housing_labels = df["price"].copy()

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

lgbm = LGBMRegressor(max_depth = -1, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm_scores = cross_val_score(lgbm, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

housing_prepared = df.drop(["price", "id", "date"], axis=1)
housing_labels = df["price"].copy()

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

lgbm = LGBMRegressor(max_depth = -1, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm_scores = cross_val_score(lgbm, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor  # XGBoost 모델 추가

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

housing_prepared = df.drop(["price", "id", "date"], axis=1)
housing_labels = df["price"].copy()

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

xgboost_model = XGBRegressor(
    max_depth=3,  # 튜닝이 필요한 하이퍼파라미터는 적절히 조절해주세요.
    learning_rate=0.1,
    n_estimators=100,
    objective='reg:squarederror',  # 회귀 문제의 경우 'reg:squarederror'를 사용합니다.
    random_state=0
)

xgboost_scores = cross_val_score(xgboost_model, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
xgboost_rmse_scores = np.sqrt(-xgboost_scores)

print("=======================================================================================")
display_scores(xgboost_rmse_scores)
print("=======================================================================================")


import numpy as np
import pandas as pd    
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

housing_prepared = df.drop(["price", "id", "date"], axis=1)
housing_labels = df["price"].copy()

# XGBoost 모델 정의
xgboost_model = XGBRegressor(objective='reg:squarederror', random_state=0)

# 그리드 서치에 사용할 하이퍼파라미터 후보
param_grid = {
    'max_depth': [None, 3, 5, 7, 9, 15, ],
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2, 0.3],
    'n_estimators': [30, 50, 100, 200, 250]
}

# 그리드 서치 객체 생성
grid_search = GridSearchCV(estimator=xgboost_model, param_grid=param_grid, 
                           scoring="neg_mean_squared_error", cv=10, verbose=2, n_jobs=-1)

# 그리드 서치 수행
grid_search.fit(housing_prepared, housing_labels)

# 최적의 하이퍼파라미터 및 모델 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

# 최적 모델에 대한 성능 평가
grid_search_scores = cross_val_score(grid_search.best_estimator_, housing_prepared, housing_labels, 
                                     scoring="neg_mean_squared_error", cv=10)
grid_search_rmse_scores = np.sqrt(-grid_search_scores)

print("=======================================================================================")
display_scores(grid_search_rmse_scores)
print("=======================================================================================")


import numpy as np
import pandas as pd
from pycaret.regression import *
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

housing_prepared = df.drop(["price", "id", "date"], axis=1)
housing_labels = df["price"].copy()

# LightGBM 모델 정의
lgbm = LGBMRegressor(random_state=0)

# 탐색할 하이퍼파라미터 그리드 정의
param_grid = {
    'max_depth': [-1, 10, 20],
    'min_data_in_leaf': [20, 30, 40],
    'feature_fraction': [0.8, 1.0],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.1155, 0.2],
    'num_leaves': [31, 40, 50]
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(lgbm, param_grid, scoring="neg_mean_squared_error", cv=10, verbose=2, n_jobs=-1)

# 그리드 서치 수행
grid_search.fit(housing_prepared, housing_labels)

# 최적의 하이퍼파라미터 출력
print("Best Hyperparameters:", grid_search.best_params_)

# 최적 모델에 대한 성능 출력
best_model = grid_search.best_estimator_
best_scores = cross_val_score(best_model, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
best_rmse_scores = np.sqrt(-best_scores)

print("=======================================================================================")
display_scores(best_rmse_scores)
print("=======================================================================================")


import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

# 특성공학 함수 정의
def feature_engineering(data):
    # 1. 연도 관련 특성
    data['house_age'] = 2023 - data['yr_built']
    data['years_since_renovation'] = 2023 - data['yr_renovated']

    # 2. 면적 관련 특성
    data['indoor_to_outdoor_ratio'] = data['sqft_living'] / data['sqft_lot']
    data['basement_to_living_ratio'] = data['sqft_basement'] / data['sqft_living']

    # 3. 위치 관련 특성
    # (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
    data['location'] = data['lat'] + data['long']

    # 4. 각종 비율 관련 특성
    data['bedrooms_to_bathrooms_ratio'] = data['bedrooms'] / data['bathrooms']
    data['living_to_floors_ratio'] = data['sqft_living'] / data['floors']

    # 5. 우편번호 관련 특성
    # (이 예시에서는 단순히 우편번호를 이용하여 평균 가격을 나타냄)
    zipcode_prices = data.groupby('zipcode')['price'].mean().to_dict()
    data['average_price_by_zipcode'] = data['zipcode'].map(zipcode_prices)
    
    # 6. 태원1
    data['div_sqft_living_waterfront'] = data['sqft_living'] / data['waterfront'] / 2
    
    # 7. 태원2
    data['sum_sqft_living_waterfront'] = data['sqft_living'] + data['waterfront']

    # 8. 태원3
    data['sum_sqft_living_yr_renovated'] = data['sqft_living'] + data['yr_renovated']
    
    # 9. 태원4
    data['div_sqft_living_yr_renovated'] = data['sqft_living'] / data['yr_renovated'] / 2
    
    return data

# 특성공학 적용
df = feature_engineering(df)

# 결측치가 있는 열 삭제
df = df.dropna(axis=1)

housing_prepared = df.drop(["price", "id", "date"], axis=1)
housing_labels = df["price"].copy()

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
# LightGBM 모델 정의
lgbm = LGBMRegressor(random_state=0)

# 탐색할 하이퍼파라미터 그리드 정의
param_grid = {
    'max_depth': [-1, 10, 20],
    'min_data_in_leaf': [20, 30, 40],
    'feature_fraction': [0.8, 1.0],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.1155, 0.2],
    'num_leaves': [31, 40, 50]
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(lgbm, param_grid, scoring="neg_mean_squared_error", cv=10, verbose=2, n_jobs=-1)

# 그리드 서치 수행
grid_search.fit(housing_prepared, housing_labels)

# 최적의 하이퍼파라미터 출력
print("Best Hyperparameters:", grid_search.best_params_)

# 최적 모델에 대한 성능 출력
best_model = grid_search.best_estimator_
best_scores = cross_val_score(best_model, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
best_rmse_scores = np.sqrt(-best_scores)

print("=======================================================================================")
display_scores(best_rmse_scores)
print("=======================================================================================")


import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

# 특성공학 함수 정의
def feature_engineering(data):
    # 1. 연도 관련 특성
    data['house_age'] = 2023 - data['yr_built']
    data['years_since_renovation'] = 2023 - data['yr_renovated']

    # 2. 면적 관련 특성
    data['indoor_to_outdoor_ratio'] = data['sqft_living'] / data['sqft_lot']
    data['basement_to_living_ratio'] = data['sqft_basement'] / data['sqft_living']

    # 3. 위치 관련 특성
    # (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
    data['location'] = data['lat'] + data['long']

    # 4. 각종 비율 관련 특성
    data['bedrooms_to_bathrooms_ratio'] = data['bedrooms'] / data['bathrooms']
    data['living_to_floors_ratio'] = data['sqft_living'] / data['floors']

    # 5. 우편번호 관련 특성
    # (이 예시에서는 단순히 우편번호를 이용하여 평균 가격을 나타냄)
    zipcode_prices = data.groupby('zipcode')['price'].mean().to_dict()
    data['average_price_by_zipcode'] = data['zipcode'].map(zipcode_prices)
    
    # 6. 태원1
    data['div_sqft_living_waterfront'] = data['sqft_living'] / data['waterfront'] / 2
    
    # 7. 태원2
    data['sum_sqft_living_waterfront'] = data['sqft_living'] + data['waterfront']

    # 8. 태원3
    data['sum_sqft_living_yr_renovated'] = data['sqft_living'] + data['yr_renovated']
    
    # 9. 태원4
    data['div_sqft_living_yr_renovated'] = data['sqft_living'] / data['yr_renovated'] / 2
    
    return data

# 특성공학 적용
df = feature_engineering(df)

# Check and handle infinite or large values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)  # Drop rows with NaN values

# Check the target variable
print(df['price'].describe())

# Prepare data for modeling
housing_prepared = df.drop(["price", "id", "date"], axis=1)
housing_labels = df["price"].copy()

# PyCaret setup
reg_setup = setup(data=df, target='price', ignore_features=['id'])

# Compare models
best_model = compare_models(fold=10, sort='RMSE')

# Tune the best model
tuned_model = tune_model(best_model, fold=10, n_iter=500)

# Display summary information for the tuned model
print(tuned_model)

# Function to display cross-validation scores
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# Fit the tuned model
tuned_model.fit(housing_prepared, housing_labels)

# Evaluate the tuned model using cross-validation
best_scores = cross_val_score(tuned_model, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
best_rmse_scores = np.sqrt(-best_scores)

# Display the performance metrics
print("=======================================================================================")
display_scores(best_rmse_scores)
print("=======================================================================================")

import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

# 특성공학 함수 정의
def feature_engineering(data):
    # 1. 연도 관련 특성
    data['house_age'] = 2023 - data['yr_built']
    data['years_since_renovation'] = 2023 - data['yr_renovated']

    # 2. 면적 관련 특성
    data['indoor_to_outdoor_ratio'] = data['sqft_living'] / data['sqft_lot']
    data['basement_to_living_ratio'] = data['sqft_basement'] / data['sqft_living']

    # 3. 위치 관련 특성
    # (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
    data['location'] = data['lat'] + data['long']

    # 4. 각종 비율 관련 특성
    data['bedrooms_to_bathrooms_ratio'] = data['bedrooms'] / data['bathrooms']
    data['living_to_floors_ratio'] = data['sqft_living'] / data['floors']

    # 5. 우편번호 관련 특성
    # (이 예시에서는 단순히 우편번호를 이용하여 평균 가격을 나타냄)
    zipcode_prices = data.groupby('zipcode')['price'].mean().to_dict()
    data['average_price_by_zipcode'] = data['zipcode'].map(zipcode_prices)
    
    # 6. 태원1
    data['div_sqft_living_waterfront'] = data['sqft_living'] / data['waterfront'] / 2
    
    # 7. 태원2
    data['sum_sqft_living_waterfront'] = data['sqft_living'] + data['waterfront']

    # 8. 태원3
    data['sum_sqft_living_yr_renovated'] = data['sqft_living'] + data['yr_renovated']
    
    # 9. 태원4
    data['div_sqft_living_yr_renovated'] = data['sqft_living'] / data['yr_renovated'] / 2
    
    return data

# 특성공학 적용
df = feature_engineering(df)

# Check and handle infinite or large values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)  # Drop rows with NaN values

# Check the target variable
print(df['price'].describe())

# Prepare data for modeling
housing_prepared = df.drop(["price", "id", "date"], axis=1)
housing_labels = df["price"].copy()

# PyCaret setup
reg_setup = setup(data=df, target='price', ignore_features=['id', 'date'])

# Compare models
best_model = compare_models(fold=10, sort='RMSE')

# Tune the best model
tuned_model = tune_model(best_model, fold=10, n_iter=500)

# Display summary information for the tuned model
print(tuned_model)

# Function to display cross-validation scores
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# Fit the tuned model
tuned_model.fit(housing_prepared, housing_labels)

# Evaluate the tuned model using cross-validation
best_scores = cross_val_score(tuned_model, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
best_rmse_scores = np.sqrt(-best_scores)

# Display the performance metrics
print("=======================================================================================")
display_scores(best_rmse_scores)
print("=======================================================================================")

import numpy as np
import pandas as pd    
from pycaret.regression import *
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

input_file = "kc_house_data.csv"
df = pd.read_csv(input_file)

# 특성공학 함수 정의
def feature_engineering(data):
    # 1. 연도 관련 특성
    data['house_age'] = 2023 - data['yr_built']
    data['years_since_renovation'] = 2023 - data['yr_renovated']

    # 2. 면적 관련 특성
    data['indoor_to_outdoor_ratio'] = data['sqft_living'] / data['sqft_lot']
    data['basement_to_living_ratio'] = data['sqft_basement'] / data['sqft_living']

    # 3. 위치 관련 특성
    # (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
    data['location'] = data['lat'] + data['long']

    # 4. 각종 비율 관련 특성
    data['bedrooms_to_bathrooms_ratio'] = data['bedrooms'] / data['bathrooms']
    data['living_to_floors_ratio'] = data['sqft_living'] / data['floors']

    # 5. 우편번호 관련 특성
    # (이 예시에서는 단순히 우편번호를 이용하여 평균 가격을 나타냄)
    zipcode_prices = data.groupby('zipcode')['price'].mean().to_dict()
    data['average_price_by_zipcode'] = data['zipcode'].map(zipcode_prices)
    
    # 6. 태원1
    data['div_sqft_living_waterfront'] = data['sqft_living'] / data['waterfront'] / 2
    
    # 7. 태원2
    data['sum_sqft_living_waterfront'] = data['sqft_living'] + data['waterfront']

    # 8. 태원3
    data['sum_sqft_living_yr_renovated'] = data['sqft_living'] + data['yr_renovated']
    
    # 9. 태원4
    data['div_sqft_living_yr_renovated'] = data['sqft_living'] / data['yr_renovated'] / 2
    
    return data

# 특성공학 적용
df = feature_engineering(df)

housing_prepared = df.drop(["price", "id", "date"], axis=1)
housing_labels = df["price"].copy()

# LightGBM 모델 정의
lgbm = LGBMRegressor(random_state=0)

# Extra Trees 모델 정의
extratree = ExtraTreesRegressor(random_state=0)

# VotingRegressor 정의
voting_regressor = VotingRegressor(estimators=[('lgbm', lgbm), ('extratree', extratree)])

# VotingRegressor에 대한 탐색할 하이퍼파라미터 그리드 정의
param_grid = {
    'lgbm__max_depth': [-1, 10, 20],
    'lgbm__min_data_in_leaf': [20, 30, 40],
    'lgbm__feature_fraction': [0.8, 1.0],
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__learning_rate': [0.1, 0.1155, 0.2],
    'lgbm__num_leaves': [31, 40, 50],
    
    'extratree__n_estimators': [50, 100, 200],
    'extratree__max_depth': [None, 10, 20],
    'extratree__min_samples_split': [2, 5, 10],
    'extratree__min_samples_leaf': [1, 2, 4]
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(voting_regressor, param_grid, scoring="neg_mean_squared_error", cv=10, verbose=2, n_jobs=-1)

# 그리드 서치 수행
grid_search.fit(housing_prepared, housing_labels)

# 최적의 하이퍼파라미터 출력
print("Best Hyperparameters:", grid_search.best_params_)

# 최적 모델에 대한 성능 출력
best_model = grid_search.best_estimator_
best_scores = cross_val_score(best_model, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
best_rmse_scores = np.sqrt(-best_scores)

print("=======================================================================================")
display_scores(best_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

housing_train = housing_train.drop("id", axis=1)

# # 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train['date'] = pd.to_datetime(housing_train['date'] )
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1/(housing_train["sqft_living"]+housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living']*housing_train['grade'])/2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

housing_train = housing_train[housing_train['bedrooms'] != 8]

housing_train = housing_train.drop(housing_train[housing_train["years_since_renovation"]<500].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["view"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["Totalviews"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["condition"]<0.3].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["sqft_basement"]>470].index, axis=0)

count = 0
bath_outliers = []
mean = np.mean(housing_train['bathrooms'])
max_distance = np.std(housing_train['bathrooms']) * 3 

for idx, row in housing_train['bathrooms'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)

count = 0
bath_outliers = []
mean = np.mean(housing_train['grade'])
max_distance = np.std(housing_train['grade']) * 3

for idx, row in housing_train['grade'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms'])
max_distance = np.std(housing_train['bedrooms']) * 3

for idx, row in housing_train['bedrooms'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_living'])
max_distance = np.std(housing_train['sqft_living']) * 3

for idx, row in housing_train['sqft_living'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot'])
max_distance = np.std(housing_train['sqft_lot']) * 3

for idx, row in housing_train['sqft_lot'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['(sqft_living*grade)/2'])
max_distance = np.std(housing_train['(sqft_living*grade)/2']) * 3

for idx, row in housing_train['(sqft_living*grade)/2'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot15'])
max_distance = np.std(housing_train['sqft_lot15']) * 3

for idx, row in housing_train['sqft_lot15'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms_to_bathrooms_ratio'])
max_distance = np.std(housing_train['bedrooms_to_bathrooms_ratio']) * 3

for idx, row in housing_train['bedrooms_to_bathrooms_ratio'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['condition'])
max_distance = np.std(housing_train['condition']) * 3

for idx, row in housing_train['condition'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.dropna()
filtered_train_data = filtered_train_data.drop("date", axis=1)
filtered_train_data = filtered_train_data.drop("price", axis=1)
#filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

train_target = housing_train["price"].copy()

corr_matrix = housing_train.corr()
corr_matrix["price"].sort_values(ascending=False)

filtered_train_data = filtered_train_data.drop('waterfront',axis=1)
filtered_train_data = filtered_train_data.drop('yr_renovated',axis=1)
filtered_train_data = filtered_train_data.drop('years_since_renovation',axis=1)
filtered_train_data = filtered_train_data.drop('Totalviews',axis=1)
filtered_train_data = filtered_train_data.drop('tr_day_name',axis=1)

filtered_train_data['t1'] = filtered_train_data['zipcode']**2 * np.log(np.where(filtered_train_data['yr_built'] != 0, filtered_train_data['yr_built'], 1))
filtered_train_data['t3'] = filtered_train_data['lat']**2 * filtered_train_data['long']**3
filtered_train_data['t4'] = filtered_train_data['bathrooms']**3 * filtered_train_data['sqft_lot']
filtered_train_data['t5'] = filtered_train_data['sqft_living15']**2 / np.sqrt(filtered_train_data['sqft_lot15'])
filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = -1, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

housing_train = housing_train.drop("id", axis=1)

# # 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train['date'] = pd.to_datetime(housing_train['date'] )
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1/(housing_train["sqft_living"]+housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living']*housing_train['grade'])/2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

housing_train = housing_train[housing_train['bedrooms'] != 8]

housing_train = housing_train.drop(housing_train[housing_train["years_since_renovation"]<500].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["view"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["Totalviews"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["condition"]<0.3].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["sqft_basement"]>470].index, axis=0)

count = 0
bath_outliers = []
mean = np.mean(housing_train['bathrooms'])
max_distance = np.std(housing_train['bathrooms']) * 3 

for idx, row in housing_train['bathrooms'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)

count = 0
bath_outliers = []
mean = np.mean(housing_train['grade'])
max_distance = np.std(housing_train['grade']) * 3

for idx, row in housing_train['grade'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms'])
max_distance = np.std(housing_train['bedrooms']) * 3

for idx, row in housing_train['bedrooms'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_living'])
max_distance = np.std(housing_train['sqft_living']) * 3

for idx, row in housing_train['sqft_living'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot'])
max_distance = np.std(housing_train['sqft_lot']) * 3

for idx, row in housing_train['sqft_lot'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['(sqft_living*grade)/2'])
max_distance = np.std(housing_train['(sqft_living*grade)/2']) * 3

for idx, row in housing_train['(sqft_living*grade)/2'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot15'])
max_distance = np.std(housing_train['sqft_lot15']) * 3

for idx, row in housing_train['sqft_lot15'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms_to_bathrooms_ratio'])
max_distance = np.std(housing_train['bedrooms_to_bathrooms_ratio']) * 3

for idx, row in housing_train['bedrooms_to_bathrooms_ratio'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['condition'])
max_distance = np.std(housing_train['condition']) * 3

for idx, row in housing_train['condition'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.dropna()
filtered_train_data = filtered_train_data.drop("date", axis=1)
filtered_train_data = filtered_train_data.drop("price", axis=1)
#filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

train_target = housing_train["price"].copy()

corr_matrix = housing_train.corr()
corr_matrix["price"].sort_values(ascending=False)

filtered_train_data = filtered_train_data.drop('waterfront',axis=1)
filtered_train_data = filtered_train_data.drop('yr_renovated',axis=1)
filtered_train_data = filtered_train_data.drop('years_since_renovation',axis=1)
filtered_train_data = filtered_train_data.drop('Totalviews',axis=1)
filtered_train_data = filtered_train_data.drop('tr_day_name',axis=1)

filtered_train_data['t1'] = filtered_train_data['zipcode']**2 * np.log(filtered_train_data['yr_built'])
filtered_train_data['t3'] = filtered_train_data['lat']**2 * filtered_train_data['long']**3
filtered_train_data['t4'] = filtered_train_data['bathrooms']**3 * filtered_train_data['sqft_lot']
filtered_train_data['t5'] = filtered_train_data['sqft_living15']**2 / np.sqrt(filtered_train_data['sqft_lot15'])
filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = -1, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor, ExtraTreesRegressor

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

housing_train = housing_train.drop("id", axis=1)

print("특성공학 시작")
# 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train['date'] = pd.to_datetime(housing_train['date'] )
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1/(housing_train["sqft_living"]+housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living']*housing_train['grade'])/2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

housing_train = housing_train[housing_train['bedrooms'] != 8]

housing_train = housing_train.drop(housing_train[housing_train["years_since_renovation"]<500].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["view"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["Totalviews"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["condition"]<0.3].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["sqft_basement"]>470].index, axis=0)

count = 0
bath_outliers = []
mean = np.mean(housing_train['bathrooms'])
max_distance = np.std(housing_train['bathrooms']) * 3 

for idx, row in housing_train['bathrooms'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)

count = 0
bath_outliers = []
mean = np.mean(housing_train['grade'])
max_distance = np.std(housing_train['grade']) * 3

for idx, row in housing_train['grade'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms'])
max_distance = np.std(housing_train['bedrooms']) * 3

for idx, row in housing_train['bedrooms'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_living'])
max_distance = np.std(housing_train['sqft_living']) * 3

for idx, row in housing_train['sqft_living'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot'])
max_distance = np.std(housing_train['sqft_lot']) * 3

for idx, row in housing_train['sqft_lot'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['(sqft_living*grade)/2'])
max_distance = np.std(housing_train['(sqft_living*grade)/2']) * 3

for idx, row in housing_train['(sqft_living*grade)/2'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot15'])
max_distance = np.std(housing_train['sqft_lot15']) * 3

for idx, row in housing_train['sqft_lot15'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms_to_bathrooms_ratio'])
max_distance = np.std(housing_train['bedrooms_to_bathrooms_ratio']) * 3

for idx, row in housing_train['bedrooms_to_bathrooms_ratio'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['condition'])
max_distance = np.std(housing_train['condition']) * 3

for idx, row in housing_train['condition'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.dropna()
filtered_train_data = filtered_train_data.drop("date", axis=1)
filtered_train_data = filtered_train_data.drop("price", axis=1)
#filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

train_target = housing_train["price"].copy()

corr_matrix = housing_train.corr()
corr_matrix["price"].sort_values(ascending=False)

filtered_train_data = filtered_train_data.drop('waterfront',axis=1)
filtered_train_data = filtered_train_data.drop('yr_renovated',axis=1)
filtered_train_data = filtered_train_data.drop('years_since_renovation',axis=1)
filtered_train_data = filtered_train_data.drop('Totalviews',axis=1)
filtered_train_data = filtered_train_data.drop('tr_day_name',axis=1)

filtered_train_data['t1'] = filtered_train_data['zipcode']**2 * np.log(filtered_train_data['yr_built'])
filtered_train_data['t3'] = filtered_train_data['lat']**2 * filtered_train_data['long']**3
filtered_train_data['t4'] = filtered_train_data['bathrooms']**3 * filtered_train_data['sqft_lot']
filtered_train_data['t5'] = filtered_train_data['sqft_living15']**2 / np.sqrt(filtered_train_data['sqft_lot15'])
filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)
print("특성공학 종료")

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

from sklearn.impute import SimpleImputer

# Replace infinity with NaN
sh_t_data = np.where(np.isinf(sh_t_data), np.nan, sh_t_data)

# Calculate mean for each column (ignoring NaN values)
column_means = np.nanmean(sh_t_data, axis=0)

# Create an imputer that replaces NaN with the mean for each column
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
sh_t_data = imputer.fit_transform(sh_t_data)

scaler = StandardScaler()
sh_t_data_scaled = scaler.fit_transform(sh_t_data)

# 그리드 서치를 위한 ExtraTreesRegressor 하이퍼파라미터 그리드
et_param_grid = {
    'n_estimators': [20, 50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4]
}

print("Extratree 그리드 서치 진행")
# ExtraTreesRegressor 그리드 서치 객체 생성
et_grid_search = GridSearchCV(ExtraTreesRegressor(random_state=0), et_param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
et_grid_search.fit(sh_t_data_scaled, sh_t_target)
print("Extratree 그리드 서치 종료")

# 최적의 ExtraTreesRegressor 모델
best_et_model = et_grid_search.best_estimator_
print(f"best_et_model: {best_et_model}")

# LGBMRegressor 그리드 서치를 위한 하이퍼파라미터 그리드
lgbm_param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.03, 0.1, 0.2],
    'num_leaves': [20, 30, 40, 50],
    'max_depth': [None, 10, 20, 30],
}

print("LGBM 그리드 서치 진행")
# LGBMRegressor 그리드 서치 객체 생성
lgbm_grid_search = GridSearchCV(LGBMRegressor(random_state=0), lgbm_param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
lgbm_grid_search.fit(sh_t_data_scaled, sh_t_target)
print("LGBM 그리드 서치 종료")

# 최적의 LGBMRegressor 모델
best_lgbm_model = lgbm_grid_search.best_estimator_
print(f"best_lgbm_model: {best_lgbm_model}")

# VotingRegressor 생성
voting_regressor = VotingRegressor(
    estimators=[('et', best_et_model), ('lgbm', best_lgbm_model)]
)

# VotingRegressor 학습
voting_regressor.fit(sh_t_data_scaled, sh_t_target)

# VotingRegressor의 성능 평가
voting_scores = cross_val_score(voting_regressor, sh_t_data_scaled, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
voting_rmse_scores = np.sqrt(-voting_scores)

print("=======================================================================================")
print("Voting Regressor Scores:")
display_scores(voting_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor, ExtraTreesRegressor

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

housing_train = housing_train.drop("id", axis=1)

print("특성공학 시작")
# 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train['date'] = pd.to_datetime(housing_train['date'] )
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1/(housing_train["sqft_living"]+housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living']*housing_train['grade'])/2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

housing_train = housing_train[housing_train['bedrooms'] != 8]

housing_train = housing_train.drop(housing_train[housing_train["years_since_renovation"]<500].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["view"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["Totalviews"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["condition"]<0.3].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["sqft_basement"]>470].index, axis=0)

count = 0
bath_outliers = []
mean = np.mean(housing_train['bathrooms'])
max_distance = np.std(housing_train['bathrooms']) * 3 

for idx, row in housing_train['bathrooms'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)

count = 0
bath_outliers = []
mean = np.mean(housing_train['grade'])
max_distance = np.std(housing_train['grade']) * 3

for idx, row in housing_train['grade'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms'])
max_distance = np.std(housing_train['bedrooms']) * 3

for idx, row in housing_train['bedrooms'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_living'])
max_distance = np.std(housing_train['sqft_living']) * 3

for idx, row in housing_train['sqft_living'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot'])
max_distance = np.std(housing_train['sqft_lot']) * 3

for idx, row in housing_train['sqft_lot'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['(sqft_living*grade)/2'])
max_distance = np.std(housing_train['(sqft_living*grade)/2']) * 3

for idx, row in housing_train['(sqft_living*grade)/2'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot15'])
max_distance = np.std(housing_train['sqft_lot15']) * 3

for idx, row in housing_train['sqft_lot15'].items():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms_to_bathrooms_ratio'])
max_distance = np.std(housing_train['bedrooms_to_bathrooms_ratio']) * 3

for idx, row in housing_train['bedrooms_to_bathrooms_ratio'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['condition'])
max_distance = np.std(housing_train['condition']) * 3

for idx, row in housing_train['condition'].items():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.dropna()
filtered_train_data = filtered_train_data.drop("date", axis=1)
filtered_train_data = filtered_train_data.drop("price", axis=1)
#filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

train_target = housing_train["price"].copy()

corr_matrix = housing_train.corr()
corr_matrix["price"].sort_values(ascending=False)

filtered_train_data = filtered_train_data.drop('waterfront',axis=1)
filtered_train_data = filtered_train_data.drop('yr_renovated',axis=1)
filtered_train_data = filtered_train_data.drop('years_since_renovation',axis=1)
filtered_train_data = filtered_train_data.drop('Totalviews',axis=1)
filtered_train_data = filtered_train_data.drop('tr_day_name',axis=1)

filtered_train_data['t1'] = filtered_train_data['zipcode']**2 * np.log(filtered_train_data['yr_built'])
filtered_train_data['t3'] = filtered_train_data['lat']**2 * filtered_train_data['long']**3
filtered_train_data['t4'] = filtered_train_data['bathrooms']**3 * filtered_train_data['sqft_lot']
filtered_train_data['t5'] = filtered_train_data['sqft_living15']**2 / np.sqrt(filtered_train_data['sqft_lot15'])
filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)
print("특성공학 종료")

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

from sklearn.impute import SimpleImputer

# Replace infinity with NaN
sh_t_data = np.where(np.isinf(sh_t_data), np.nan, sh_t_data)

# Calculate mean for each column (ignoring NaN values)
column_means = np.nanmean(sh_t_data, axis=0)

# Create an imputer that replaces NaN with the mean for each column
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
sh_t_data = imputer.fit_transform(sh_t_data)

scaler = StandardScaler()
sh_t_data_scaled = scaler.fit_transform(sh_t_data)

# 그리드 서치를 위한 ExtraTreesRegressor 하이퍼파라미터 그리드
et_param_grid = {
    'n_estimators': [20, 30, 50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 4, 6, 8, 12],
    'min_samples_leaf': [1, 2, 4, 8]
}

print("Extratree 그리드 서치 진행")
# ExtraTreesRegressor 그리드 서치 객체 생성
et_grid_search = GridSearchCV(ExtraTreesRegressor(random_state=0), et_param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
et_grid_search.fit(sh_t_data_scaled, sh_t_target)
print("Extratree 그리드 서치 종료")

# 최적의 ExtraTreesRegressor 모델
best_et_model = et_grid_search.best_estimator_
print(f"best_et_model: {best_et_model}")

# LGBMRegressor 그리드 서치를 위한 하이퍼파라미터 그리드
lgbm_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2],
    'num_leaves': [20, 30, 40, 50, 80, 100],
    'max_depth': [None, 10, 20, 30],
}

print("LGBM 그리드 서치 진행")
# LGBMRegressor 그리드 서치 객체 생성
lgbm_grid_search = GridSearchCV(LGBMRegressor(random_state=0), lgbm_param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
lgbm_grid_search.fit(sh_t_data_scaled, sh_t_target)
print("LGBM 그리드 서치 종료")

# 최적의 LGBMRegressor 모델
best_lgbm_model = lgbm_grid_search.best_estimator_
print(f"best_lgbm_model: {best_lgbm_model}")

# VotingRegressor 생성
voting_regressor = VotingRegressor(
    estimators=[('et', best_et_model), ('lgbm', best_lgbm_model)]
)

# VotingRegressor 학습
voting_regressor.fit(sh_t_data_scaled, sh_t_target)

# VotingRegressor의 성능 평가
voting_scores = cross_val_score(voting_regressor, sh_t_data_scaled, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
voting_rmse_scores = np.sqrt(-voting_scores)

print("=======================================================================================")
print("Voting Regressor Scores:")
display_scores(voting_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

housing_train = housing_train.drop("id", axis=1)
# housing_train = housing_train.drop("date", axis=1)

# # 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()

housing_train['date'] = pd.to_datetime(housing_train['date'] )
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1/(housing_train["sqft_living"]+housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living']*housing_train['grade'])/2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)
housing_train = housing_train[housing_train['bedrooms'] != 8]

housing_train = housing_train.drop(housing_train[housing_train["years_since_renovation"]<500].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["view"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["Totalviews"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["condition"]<0.3].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["sqft_basement"]>470].index, axis=0)

count = 0
bath_outliers = []
mean = np.mean(housing_train['bathrooms'])
max_distance = np.std(housing_train['bathrooms']) * 3 

for idx, row in housing_train['bathrooms'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)

count = 0
bath_outliers = []
mean = np.mean(housing_train['grade'])
max_distance = np.std(housing_train['grade']) * 3

for idx, row in housing_train['grade'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms'])
max_distance = np.std(housing_train['bedrooms']) * 3

for idx, row in housing_train['bedrooms'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_living'])
max_distance = np.std(housing_train['sqft_living']) * 3

for idx, row in housing_train['sqft_living'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot'])
max_distance = np.std(housing_train['sqft_lot']) * 3

for idx, row in housing_train['sqft_lot'].T.iteritems():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['(sqft_living*grade)/2'])
max_distance = np.std(housing_train['(sqft_living*grade)/2']) * 3

for idx, row in housing_train['(sqft_living*grade)/2'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot15'])
max_distance = np.std(housing_train['sqft_lot15']) * 3

for idx, row in housing_train['sqft_lot15'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms_to_bathrooms_ratio'])
max_distance = np.std(housing_train['bedrooms_to_bathrooms_ratio']) * 3

for idx, row in housing_train['bedrooms_to_bathrooms_ratio'].T.iteritems():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['condition'])
max_distance = np.std(housing_train['condition']) * 3

for idx, row in housing_train['condition'].T.iteritems():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

housing_train=housing_train[housing_train['bedrooms']!=33]

def remove_outliers(df):
    '''removes entries with z-score above 3 for specific columns'''
    variables = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 
                 'lat', 'long', 'sqft_living15', 'sqft_lot15']
    
    for variable in variables:
        df = df[np.abs(df[variable]-df[variable].mean()) <= (3*df[variable].std())]
        
    return df

housing_train = remove_outliers(housing_train)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.dropna()
filtered_train_data = filtered_train_data.drop("date", axis=1)
filtered_train_data = filtered_train_data.drop("price", axis=1)
#filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

filtered_train_data['t5'] = filtered_train_data['sqft_living15']**2 / np.sqrt(filtered_train_data['sqft_lot15'])
filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

train_target = housing_train["price"].copy()

corr_matrix = housing_train.corr()
corr_matrix["price"].sort_values(ascending=False)

filtered_train_data = filtered_train_data.drop('waterfront',axis=1)
filtered_train_data = filtered_train_data.drop('yr_renovated',axis=1)
filtered_train_data = filtered_train_data.drop('years_since_renovation',axis=1)
filtered_train_data = filtered_train_data.drop('Totalviews',axis=1)
filtered_train_data = filtered_train_data.drop('tr_day_name',axis=1)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = -1, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm.fit(sh_t_data, sh_t_target)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=4)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

housing_train = housing_train.drop("id", axis=1)
# housing_train = housing_train.drop("date", axis=1)

# # 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()

housing_train['date'] = pd.to_datetime(housing_train['date'] )
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1/(housing_train["sqft_living"]+housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living']*housing_train['grade'])/2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)
housing_train = housing_train[housing_train['bedrooms'] != 8]

housing_train = housing_train.drop(housing_train[housing_train["years_since_renovation"]<500].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["view"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["Totalviews"]>0.5].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["condition"]<0.3].index, axis=0)
housing_train = housing_train.drop(housing_train[housing_train["sqft_basement"]>470].index, axis=0)

count = 0
bath_outliers = []
mean = np.mean(housing_train['bathrooms'])
max_distance = np.std(housing_train['bathrooms']) * 3 

for idx, row in housing_train['bathrooms'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)

count = 0
bath_outliers = []
mean = np.mean(housing_train['grade'])
max_distance = np.std(housing_train['grade']) * 3

for idx, row in housing_train['grade'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms'])
max_distance = np.std(housing_train['bedrooms']) * 3

for idx, row in housing_train['bedrooms'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_living'])
max_distance = np.std(housing_train['sqft_living']) * 3

for idx, row in housing_train['sqft_living'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot'])
max_distance = np.std(housing_train['sqft_lot']) * 3

for idx, row in housing_train['sqft_lot'].T.iteritems():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['(sqft_living*grade)/2'])
max_distance = np.std(housing_train['(sqft_living*grade)/2']) * 3

for idx, row in housing_train['(sqft_living*grade)/2'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['sqft_lot15'])
max_distance = np.std(housing_train['sqft_lot15']) * 3

for idx, row in housing_train['sqft_lot15'].T.iteritems():
    if abs(row-mean) >= max_distance:
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['bedrooms_to_bathrooms_ratio'])
max_distance = np.std(housing_train['bedrooms_to_bathrooms_ratio']) * 3

for idx, row in housing_train['bedrooms_to_bathrooms_ratio'].T.iteritems():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

count = 0
bath_outliers = []
mean = np.mean(housing_train['condition'])
max_distance = np.std(housing_train['condition']) * 3

for idx, row in housing_train['condition'].T.iteritems():
    if abs(row-mean) >= max_distance: 
        count += 1
        housing_train.drop(idx, inplace=True)
count

housing_train=housing_train[housing_train['bedrooms']!=33]

# Z-스코어를 계산하여 이상치 및 무한대 값을 제거하는 함수
def remove_outliers_zscore(df, features, threshold=3):
    z_scores = np.abs(stats.zscore(df[features]))
    outlier_mask = (z_scores > threshold).any(axis=1)
    df_cleaned = df[~outlier_mask]
    return df_cleaned

# 위에서 정의한 함수를 사용하여 이상치 및 무한대 값을 제거
features_to_check = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 
                      'lat', 'long', 'sqft_living15', 'sqft_lot15', '(sqft_living*grade)/2']

housing_train = remove_outliers_zscore(housing_train, features_to_check)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.dropna()
filtered_train_data = filtered_train_data.drop("date", axis=1)
filtered_train_data = filtered_train_data.drop("price", axis=1)
#filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

filtered_train_data['t5'] = filtered_train_data['sqft_living15']**2 / np.sqrt(filtered_train_data['sqft_lot15'])
filtered_train_data = filtered_train_data.drop("sqft_living15", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

train_target = housing_train["price"].copy()

corr_matrix = housing_train.corr()
corr_matrix["price"].sort_values(ascending=False)

filtered_train_data = filtered_train_data.drop('waterfront',axis=1)
filtered_train_data = filtered_train_data.drop('yr_renovated',axis=1)
filtered_train_data = filtered_train_data.drop('years_since_renovation',axis=1)
filtered_train_data = filtered_train_data.drop('Totalviews',axis=1)
filtered_train_data = filtered_train_data.drop('tr_day_name',axis=1)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = -1, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm.fit(sh_t_data, sh_t_target)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=4)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor

# os.chdir('C:/Users/kksp1/OneDrive/Desktop/3_2학기 해야할 일/수업/3-2/머신러닝/미니프로젝트/풀젝_3')
#print(os.getcwd())
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)


housing_train = housing_train.drop("id", axis=1)

# # 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']


ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환


housing_train['date'] = pd.to_datetime(housing_train['date'] )
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
# housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
# housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1/(housing_train["sqft_living"]+housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living']*housing_train['grade'])/2


housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.dropna()
filtered_train_data = filtered_train_data.drop("date", axis=1)
filtered_train_data = filtered_train_data.drop("price", axis=1)

train_target = housing_train["price"].copy()
#filtered_train_data = filtered_train_data.drop('waterfront',axis=1)
filtered_train_data = filtered_train_data.drop('yr_renovated',axis=1)
filtered_train_data = filtered_train_data.drop('years_since_renovation',axis=1)
# filtered_train_data = filtered_train_data.drop('tr_day_name',axis=1)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = -1, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
import pandas as np
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV


# Common imports
import numpy as np

import matplotlib.pyplot as plt

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from datetime import datetime
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train = pd.read_csv('kc_house_data.csv')

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
housing_train = housing_train.drop("id", axis=1)

# # 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 5. 우편번호 관련 특성
# (이 예시에서는 단순히 우편번호를 이용하여 평균 가격을 나타냄) ##### 삭제해야할 가능성있음
zipcode_prices = housing_train.groupby('zipcode')['price'].mean().to_dict()
housing_train['average_price_by_zipcode'] = housing_train['zipcode'].map(zipcode_prices)

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toal_sizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15'] + housing_train['sqft_lot']

housing_train['views+basement'] = housing_train['view'] + housing_train['sqft_basement']

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train['date'] = pd.to_datetime(housing_train['date'])
# housing_train["tr_year"] = housing_train["date"].dt.year
# housing_train["tr_month"] = housing_train["date"].dt.month
# housing_train["tr_day"] = housing_train["date"].dt.day

housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m-%d')
# housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
# housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
# housing_train["tr_day"] = ordinal_encoder.fit_transform(housing_train[["tr_day"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)
housing_target = housing_train['price']
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)


X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

housing_train = housing_train.drop('price', axis=1)

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(housing_train))
sh_t_data = housing_train.values[sh_in]
sh_t_target = housing_target.values[sh_in]

# LightGBM 모델 정의
lgbm = LGBMRegressor(random_state=0)

# Extra Trees 모델 정의
extratree = ExtraTreesRegressor(random_state=0)

# 그리드 서치 파라미터 설정
param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
}

param_grid_extratree = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
}

# LightGBM 그리드 서치
grid_search_lgbm = GridSearchCV(lgbm, param_grid_lgbm, scoring="neg_mean_squared_error", cv=10, n_jobs=-1, verbose=2)
grid_search_lgbm.fit(sh_t_data, sh_t_target)

# Extra Trees 그리드 서치
grid_search_extratree = GridSearchCV(extratree, param_grid_extratree, scoring="neg_mean_squared_error", cv=10, n_jobs=-1, verbose=2)
grid_search_extratree.fit(sh_t_data, sh_t_target)

# 최적의 파라미터 출력
print("Best parameters for LightGBM:", grid_search_lgbm.best_params_)
print("Best parameters for Extra Trees:", grid_search_extratree.best_params_)

# 최적의 파라미터를 사용하여 모델 재설정
best_lgbm = grid_search_lgbm.best_estimator_
best_extratree = grid_search_extratree.best_estimator_

# 보팅 리그레서 정의
voting_reg = VotingRegressor(estimators=[('lgbm', best_lgbm), ('extratree', best_extratree)])

# 보팅 리그레서 평가
voting_scores = cross_val_score(voting_reg, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=5, n_jobs=-1)
voting_rmse_scores = np.sqrt(-voting_scores)

# 결과 출력
print("=======================================================================================")
display_scores(voting_rmse_scores)
print("=======================================================================================")

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
import pandas as np
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV


# Common imports
import numpy as np

import matplotlib.pyplot as plt

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from datetime import datetime
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train = pd.read_csv('kc_house_data.csv')

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
housing_train = housing_train.drop("id", axis=1)

# # 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 5. 우편번호 관련 특성
# (이 예시에서는 단순히 우편번호를 이용하여 평균 가격을 나타냄) ##### 삭제해야할 가능성있음
zipcode_prices = housing_train.groupby('zipcode')['price'].mean().to_dict()
housing_train['average_price_by_zipcode'] = housing_train['zipcode'].map(zipcode_prices)

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toal_sizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15'] + housing_train['sqft_lot']

housing_train['views+basement'] = housing_train['view'] + housing_train['sqft_basement']

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train['date'] = pd.to_datetime(housing_train['date'])
# housing_train["tr_year"] = housing_train["date"].dt.year
# housing_train["tr_month"] = housing_train["date"].dt.month
# housing_train["tr_day"] = housing_train["date"].dt.day

housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m-%d')
# housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
# housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
# housing_train["tr_day"] = ordinal_encoder.fit_transform(housing_train[["tr_day"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)
housing_target = housing_train['price']
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)


X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

housing_train = housing_train.drop('price', axis=1)

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(housing_train))
sh_t_data = housing_train.values[sh_in]
sh_t_target = housing_target.values[sh_in]

# XGBoost 모델 import
from xgboost import XGBRegressor

# XGBoost 모델 정의
xgb = XGBRegressor(
    max_depth=6,  # max_depth 설정
    learning_rate=0.1,
    n_estimators=100,
    min_child_weight=1,  # min_child_weight 설정
    subsample=1.0,
    colsample_bytree=1.0,
    gamma=0.0,
    reg_alpha=0.0,
    reg_lambda=1.0,
    objective='reg:squarederror',  # regression용 목적 함수
    random_state=0
)

xgb_scores = cross_val_score(xgb, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_rmse_scores = np.sqrt(-xgb_scores)

print("=======================================================================================")
display_scores(xgb_rmse_scores)
print("=======================================================================================")

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
import pandas as np
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV


# Common imports
import numpy as np

import matplotlib.pyplot as plt

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from datetime import datetime
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train = pd.read_csv('kc_house_data.csv')

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
housing_train = housing_train.drop("id", axis=1)

# # 1. 연도 관련 특성
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

# 2. 면적 관련 특성
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

# 3. 위치 관련 특성
# (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
housing_train['location'] = housing_train['lat'] + housing_train['long']

# 4. 각종 비율 관련 특성
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

# 5. 우편번호 관련 특성
# (이 예시에서는 단순히 우편번호를 이용하여 평균 가격을 나타냄) ##### 삭제해야할 가능성있음
zipcode_prices = housing_train.groupby('zipcode')['price'].mean().to_dict()
housing_train['average_price_by_zipcode'] = housing_train['zipcode'].map(zipcode_prices)

# 전망 관련 특성 합
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toal_sizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15'] + housing_train['sqft_lot']

housing_train['views+basement'] = housing_train['view'] + housing_train['sqft_basement']

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

housing_train['date'] = pd.to_datetime(housing_train['date'])
# housing_train["tr_year"] = housing_train["date"].dt.year
# housing_train["tr_month"] = housing_train["date"].dt.month
# housing_train["tr_day"] = housing_train["date"].dt.day

housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m-%d')
# housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
# housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
# housing_train["tr_day"] = ordinal_encoder.fit_transform(housing_train[["tr_day"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)
housing_target = housing_train['price']
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)


X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

housing_train = housing_train.drop('price', axis=1)

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(housing_train))
sh_t_data = housing_train.values[sh_in]
sh_t_target = housing_target.values[sh_in]

# XGBoost 모델 import
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# XGBoost 모델 정의
xgb = XGBRegressor(random_state=0)

# 그리드 서치 파라미터 설정
param_grid = {
    'max_depth': [3, 6, 9],  # max_depth 설정
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 150],
    'min_child_weight': [1, 3, 5],  # min_child_weight 설정
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0.0, 0.1, 0.2],
    'reg_alpha': [0.0, 0.1, 0.2],
    'reg_lambda': [0.8, 1.0, 1.2]
}

# XGBoost 그리드 서치
grid_search_xgb = GridSearchCV(xgb, param_grid, scoring="neg_mean_squared_error", cv=5, n_jobs=-1)
grid_search_xgb.fit(sh_t_data, sh_t_target)

# 최적의 파라미터 출력
print("Best parameters for XGBoost:", grid_search_xgb.best_params_)

# 최적의 파라미터를 사용하여 모델 재설정
best_xgb = grid_search_xgb.best_estimator_

# XGBoost 모델 평가
xgb_scores = cross_val_score(best_xgb, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# 결과 출력
print("=======================================================================================")
display_scores(xgb_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 제거
for column in housing_train.columns:
    # 이상치 제거
    mean_value = housing_train[column].mean()
    std_value = housing_train[column].std()
    outlier_threshold = 3  # 이상치 판단을 위한 임계값
    housing_train = housing_train[~((housing_train[column] - mean_value).abs() > outlier_threshold * std_value)]

def remove_outliers(df):
    variables = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
    for variable in variables:
        df = df[np.abs(df[variable] - df[variable].mean()) <= (3 * df[variable].std())]
    return df

housing_train = remove_outliers(housing_train)

# 여기까지 수정된 부분
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()

# XGBoost 모델 초기화
xgb = XGBRegressor(objective ='reg:squarederror', random_state=0)

# 그리드 서치를 위한 하이퍼파라미터 그리드 정의
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
grid_search = GridSearchCV(xgb, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(filtered_train_data, train_target)

# 최적의 모델
best_xgb_model = grid_search.best_estimator_
print(f"best_xgb_model : {best_xgb_model}")

# Cross-validation 결과 출력
xgb_scores = cross_val_score(best_xgb_model, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)
housing_train

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 대체
for column in housing_train.columns:
    # 이상치 판단
    mean_value = housing_train[column].mean()
    std_value = housing_train[column].std()
    outlier_threshold = 4  # 이상치 판단을 위한 임계값
    
    # 이상치를 특성의 중간값으로 대체
    housing_train[column] = np.where(
        (housing_train[column] - mean_value).abs() > outlier_threshold * std_value,
        housing_train[column].median(),
        housing_train[column]
    )



imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()

filtered_train_data

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 대체
for column in housing_train.columns:
    # 이상치 판단
    mean_value = housing_train[column].mean()
    std_value = housing_train[column].std()
    outlier_threshold = 4  # 이상치 판단을 위한 임계값
    
    # 이상치를 특성의 중간값으로 대체
    housing_train[column] = np.where(
        (housing_train[column] - mean_value).abs() > outlier_threshold * std_value,
        housing_train[column].median(),
        housing_train[column]
    )

# 여기서 remove_outliers 함수가 호출되기 전에 이미 이상치가 처리되었으므로 주석 처리
# def replace_outliers_with_median(df):
#     variables = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
#     for variable in variables:
#         mean_value = df[variable].mean()
#         std_value = df[variable].std()
#         outlier_threshold = 3  # 이상치 판단을 위한 임계값
        
#         # 이상치를 특성의 중간값으로 대체
#         df[variable] = np.where(
#             (df[variable] - mean_value).abs() > outlier_threshold * std_value,
#             df[variable].median(),
#             df[variable]
#         )
#     return df
# housing_train = remove_outliers(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()

filtered_train_data

# XGBoost 모델 초기화
xgb = XGBRegressor(objective ='reg:squarederror', random_state=0)

# 그리드 서치를 위한 하이퍼파라미터 그리드 정의
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
grid_search = GridSearchCV(xgb, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(filtered_train_data, train_target)

# 최적의 모델
best_xgb_model = grid_search.best_estimator_
print(f"best_xgb_model : {best_xgb_model}")

# Cross-validation 결과 출력
xgb_scores = cross_val_score(best_xgb_model, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

for i in range(2, 10):
    print("===========================================================================================================")
    print(f"{i}번째 반복. 즉, i: {i}")
    # 이상치 대체
    for column in housing_train.columns:
        # 이상치 판단
        mean_value = housing_train[column].mean()
        std_value = housing_train[column].std()
        outlier_threshold = i  # 이상치 판단을 위한 임계값

        # 이상치를 특성의 중간값으로 대체
        housing_train[column] = np.where(
            (housing_train[column] - mean_value).abs() > outlier_threshold * std_value,
            housing_train[column].median(),
            housing_train[column]
        )

    # 여기서 remove_outliers 함수가 호출되기 전에 이미 이상치가 처리되었으므로 주석 처리
    # def replace_outliers_with_median(df):
    #     variables = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
    #     for variable in variables:
    #         mean_value = df[variable].mean()
    #         std_value = df[variable].std()
    #         outlier_threshold = 3  # 이상치 판단을 위한 임계값

    #         # 이상치를 특성의 중간값으로 대체
    #         df[variable] = np.where(
    #             (df[variable] - mean_value).abs() > outlier_threshold * std_value,
    #             df[variable].median(),
    #             df[variable]
    #         )
    #     return df
    # housing_train = remove_outliers(housing_train)

    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_train)
    X = imputer.transform(housing_train)
    housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

    # 스케일링
    filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
    scaler = MinMaxScaler()
    filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

    # 목표 변수
    train_target = housing_train["price"].copy()

    print(f"filtered_train_data.shpae: {filtered_train_data.shape}")

    # XGBoost 모델 초기화
    xgb = XGBRegressor(objective ='reg:squarederror', random_state=0)

    # 그리드 서치를 위한 하이퍼파라미터 그리드 정의
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5]
    }

    # GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
    grid_search = GridSearchCV(xgb, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
    grid_search.fit(filtered_train_data, train_target)

    # 최적의 모델
    best_xgb_model = grid_search.best_estimator_
    print(f"best_xgb_model : {best_xgb_model}")

    # Cross-validation 결과 출력
    xgb_scores = cross_val_score(best_xgb_model, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    xgb_rmse_scores = np.sqrt(-xgb_scores)

    # Display results
    print("=======================================================================================")
    # Cross-validation 결과 출력
    display_scores(xgb_rmse_scores)
    print("=======================================================================================")
    print("===========================================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

for i in range(2, 10):
    print("===========================================================================================================")
    print(f"{i}번째 반복. 즉, i: {i}")
    # 이상치 대체
    for column in housing_train.columns:
        # 이상치 판단
        mean_value = housing_train[column].mean()
        std_value = housing_train[column].std()
        outlier_threshold = i  # 이상치 판단을 위한 임계값

        # 이상치를 특성의 중간값으로 대체
        housing_train[column] = np.where(
            (housing_train[column] - mean_value).abs() > outlier_threshold * std_value,
            housing_train[column].median(),
            housing_train[column]
        )

    # 여기서 remove_outliers 함수가 호출되기 전에 이미 이상치가 처리되었으므로 주석 처리
    # def replace_outliers_with_median(df):
    #     variables = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
    #     for variable in variables:
    #         mean_value = df[variable].mean()
    #         std_value = df[variable].std()
    #         outlier_threshold = 3  # 이상치 판단을 위한 임계값

    #         # 이상치를 특성의 중간값으로 대체
    #         df[variable] = np.where(
    #             (df[variable] - mean_value).abs() > outlier_threshold * std_value,
    #             df[variable].median(),
    #             df[variable]
    #         )
    #     return df
    # housing_train = remove_outliers(housing_train)

    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_train)
    X = imputer.transform(housing_train)
    housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

    # 스케일링
    filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
    scaler = MinMaxScaler()
    filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

    # 목표 변수
    train_target = housing_train["price"].copy()

    print(f"filtered_train_data.shpae: {filtered_train_data.shape}")

    # XGBoost 모델 초기화
    xgb = XGBRegressor(objective ='reg:squarederror', random_state=0)

    # 그리드 서치를 위한 하이퍼파라미터 그리드 정의
    param_grid = {
        'n_estimators': [50, 100, 200, 300, 500],
        'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7],
        'max_depth': [None, 3, 4, 5, 10, 30, 50]
    }

    # GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
    grid_search = GridSearchCV(xgb, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
    grid_search.fit(filtered_train_data, train_target)

    # 최적의 모델
    best_xgb_model = grid_search.best_estimator_
    print(f"best_xgb_model : {best_xgb_model}")

    # Cross-validation 결과 출력
    xgb_scores = cross_val_score(best_xgb_model, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    xgb_rmse_scores = np.sqrt(-xgb_scores)

    # Display results
    print("=======================================================================================")
    # Cross-validation 결과 출력
    display_scores(xgb_rmse_scores)
    print("=======================================================================================")
    print("===========================================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

for i in np.arange(1, 15, 2):

    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())

    def handle_outliers_zscore(data):
        # 각 특성에 대한 z-스코어 계산
        z_scores = np.abs((data - data.mean()) / data.std())

        # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
        for column in z_scores.columns:
#             print(z_scores[column])
            outliers = z_scores[column] > i
            median_value = data[column].median()
            data.loc[outliers, column] = median_value

        return data

    # 데이터 불러오기
    input_file = "kc_house_data.csv"
    housing_train = pd.read_csv(input_file)
    
    # 목표 변수
    train_target = housing_train["price"].copy()
    
    # 불필요한 열 제거
    housing_train = housing_train.drop(["id", 'price'], axis=1)

    # 특성 공학
    housing_train['house_age'] = 2023 - housing_train['yr_built']
    housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
    housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
    housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
    housing_train['location'] = housing_train['lat'] + housing_train['long']
    housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
    housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
    housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
    housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

    ordinal_encoder = OrdinalEncoder()
    housing_train['date'] = pd.to_datetime(housing_train['date'])
    housing_train["tr_year"] = housing_train["date"].dt.year
    housing_train["tr_month"] = housing_train["date"].dt.month
    housing_train["tr_day_name"] = housing_train["date"].dt.day_name
    housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
    housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
    housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
    housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
    housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
    housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
    housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

    housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 이상치 처리
    housing_train = handle_outliers_zscore(housing_train)

    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_train)
    X = imputer.transform(housing_train)
    housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

    # 스케일링
    filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name'], axis=1)
    scaler = MinMaxScaler()
    filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

    # XGBoost 모델 초기화 (과적합을 유도하기 위한 파라미터 설정)
    xgb_overfit = XGBRegressor(
        objective='reg:squarederror',
        random_state=0,
        n_estimators=1000,          # 트리의 개수 (나무의 수)
        learning_rate=0.01,         # 학습률 (나무의 기여 정도를 조절)
        max_depth=10,               # 트리의 최대 깊이
        min_child_weight=1,         # 리프 노드에 필요한 최소 샘플 수
        gamma=0.0,                  # 나무의 가지치기를 위한 최소 손실 감소 값
        subsample=0.8,              # 각 트리에 사용될 훈련 데이터의 일부 (1.0은 전체 데이터 사용)
        colsample_bytree=0.8        # 각 트리에 사용될 특성의 일부 (1.0은 전체 특성 사용)
    )
    
    # Cross-validation 결과 출력
    xgb_overfit_scores = cross_val_score(xgb_overfit, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    xgb_overfit_rmse_scores = np.sqrt(-xgb_overfit_scores)

    # Display results
    print("=======================================================================================")
    print(f"임계값: {i}")
    # Cross-validation 결과 출력
    display_scores(xgb_overfit_rmse_scores)
    print("=======================================================================================\n")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

for i in np.arange(3, 15, 2):

    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())

    def handle_outliers_zscore(data):
        # 각 특성에 대한 z-스코어 계산
        z_scores = np.abs((data - data.mean()) / data.std())

        # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
        for column in z_scores.columns:
#             print(z_scores[column])
            outliers = z_scores[column] > i
            median_value = data[column].median()
            data.loc[outliers, column] = median_value

        return data

    # 데이터 불러오기
    input_file = "kc_house_data.csv"
    housing_train = pd.read_csv(input_file)
    
    # 목표 변수
    train_target = housing_train["price"].copy()
    
    # 불필요한 열 제거
    housing_train = housing_train.drop(["id", 'price'], axis=1)

    # 특성 공학
    housing_train['house_age'] = 2023 - housing_train['yr_built']
    housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
    housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
    housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
    housing_train['location'] = housing_train['lat'] + housing_train['long']
    housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
    housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
    housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
    housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

    ordinal_encoder = OrdinalEncoder()
    housing_train['date'] = pd.to_datetime(housing_train['date'])
    housing_train["tr_year"] = housing_train["date"].dt.year
    housing_train["tr_month"] = housing_train["date"].dt.month
    housing_train["tr_day_name"] = housing_train["date"].dt.day_name
    housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
    housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
    housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
    housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
    housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
    housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
    housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

    housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 이상치 처리
    housing_train = handle_outliers_zscore(housing_train)

    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_train)
    X = imputer.transform(housing_train)
    housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

    # 스케일링
    filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name'], axis=1)
    scaler = MinMaxScaler()
    filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

    # 특성들의 z-score 계산
    z_scores = np.abs((filtered_train_data - filtered_train_data.mean()) / filtered_train_data.std())

    # 특성들의 z-score를 시각화
    plt.figure(figsize=(12, 6))
    z_scores.boxplot(vert=False)
    plt.title(f"Z-Scores for Threshold {i}")
    plt.show()
        
    # XGBoost 모델 초기화 (과적합을 유도하기 위한 파라미터 설정)
    xgb_overfit = XGBRegressor(
        objective='reg:squarederror',
        random_state=0,
        n_estimators=1000,          # 트리의 개수 (나무의 수)
        learning_rate=0.01,         # 학습률 (나무의 기여 정도를 조절)
        max_depth=10,               # 트리의 최대 깊이
        min_child_weight=1,         # 리프 노드에 필요한 최소 샘플 수
        gamma=0.0,                  # 나무의 가지치기를 위한 최소 손실 감소 값
        subsample=0.8,              # 각 트리에 사용될 훈련 데이터의 일부 (1.0은 전체 데이터 사용)
        colsample_bytree=0.8        # 각 트리에 사용될 특성의 일부 (1.0은 전체 특성 사용)
    )
    
    # Cross-validation 결과 출력
    xgb_overfit_scores = cross_val_score(xgb_overfit, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    xgb_overfit_rmse_scores = np.sqrt(-xgb_overfit_scores)

    # Display results
    print("=======================================================================================")
    print(f"임계값: {i}")
    # Cross-validation 결과 출력
    display_scores(xgb_overfit_rmse_scores)
    print("=======================================================================================\n")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

for i in np.arange(1, 15, 1):

    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())

    def handle_outliers_zscore(data):
        # 각 특성에 대한 z-스코어 계산
        z_scores = np.abs((data - data.mean()) / data.std())

        # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
        for column in z_scores.columns:
#             print(z_scores[column])
            outliers = z_scores[column] > i
            median_value = data[column].median()
            data.loc[outliers, column] = median_value

        return data

    # 데이터 불러오기
    input_file = "kc_house_data.csv"
    housing_train = pd.read_csv(input_file)
    
    # 목표 변수
    train_target = housing_train["price"].copy()
    
    # 불필요한 열 제거
    housing_train = housing_train.drop(["id", 'price'], axis=1)

    # 1. 연도 관련 특성
    housing_train['house_age'] = 2023 - housing_train['yr_built']
    housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']

    # 2. 면적 관련 특성
    housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
    housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']

    # 3. 위치 관련 특성
    # (이 예시에서는 단순히 위도와 경도의 조합으로 나타냄)
    housing_train['location'] = housing_train['lat'] + housing_train['long']

    # 4. 각종 비율 관련 특성
    housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
    housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

    # 전만 관련 특성 합
    housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
    housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

    from sklearn.preprocessing import OrdinalEncoder

    ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환


    housing_train['date'] = pd.to_datetime(housing_train['date'] )
    housing_train["tr_year"] = housing_train["date"].dt.year
    housing_train["tr_month"] = housing_train["date"].dt.month
    housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
    housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
    housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
    housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

    # 
    housing_train["house_per_bathrooms"] = 1/housing_train["bathrooms"]
    housing_train["house_per_bedrooms"] = 1/housing_train["bedrooms"]
    housing_train["house_per_sqft_livings"] = 1/(housing_train["sqft_living"]+housing_train["sqft_living15"])

    #
    housing_train["house_per_bathrooms_sq"] = (1/housing_train["bathrooms"])**2

    housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 이상치 처리
    housing_train = handle_outliers_zscore(housing_train)

    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_train)
    X = imputer.transform(housing_train)
    housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

    # 스케일링
    filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews'], axis=1)
    scaler = MinMaxScaler()
    filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)
        
    # XGBoost 모델 초기화 (과적합을 유도하기 위한 파라미터 설정)
    xgb_overfit = XGBRegressor(
        objective='reg:squarederror',
        random_state=0,
        n_estimators=1000,          # 트리의 개수 (나무의 수)
        learning_rate=0.01,         # 학습률 (나무의 기여 정도를 조절)
        max_depth=10,               # 트리의 최대 깊이
        min_child_weight=1,         # 리프 노드에 필요한 최소 샘플 수
        gamma=0.0,                  # 나무의 가지치기를 위한 최소 손실 감소 값
        subsample=0.8,              # 각 트리에 사용될 훈련 데이터의 일부 (1.0은 전체 데이터 사용)
        colsample_bytree=0.8        # 각 트리에 사용될 특성의 일부 (1.0은 전체 특성 사용)
    )
    
    # Cross-validation 결과 출력
    xgb_overfit_scores = cross_val_score(xgb_overfit, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    xgb_overfit_rmse_scores = np.sqrt(-xgb_overfit_scores)

    # Display results
    print("=======================================================================================")
    print(f"임계값: {i}")
    # Cross-validation 결과 출력
    display_scores(xgb_overfit_rmse_scores)
    print("=======================================================================================\n")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

for i in np.arange(3, 15, 2):

    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())

    def handle_outliers_zscore(data):
        # 각 특성에 대한 z-스코어 계산
        z_scores = np.abs((data - data.mean()) / data.std())

        # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
        for column in z_scores.columns:
#             print(z_scores[column])
            outliers = z_scores[column] > i
            median_value = data[column].median()
            data.loc[outliers, column] = median_value

        return data

    # 데이터 불러오기
    input_file = "kc_house_data.csv"
    housing_train = pd.read_csv(input_file)
    
    # 목표 변수
    train_target = housing_train["price"].copy()
    
    # 불필요한 열 제거
    housing_train = housing_train.drop(["id", 'price'], axis=1)

    # 특성 공학
    housing_train['house_age'] = 2023 - housing_train['yr_built']
    housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
    housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
    housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
    housing_train['location'] = housing_train['lat'] + housing_train['long']
    housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
    housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
    housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
    housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

    ordinal_encoder = OrdinalEncoder()
    housing_train['date'] = pd.to_datetime(housing_train['date'])
    housing_train["tr_year"] = housing_train["date"].dt.year
    housing_train["tr_month"] = housing_train["date"].dt.month
    housing_train["tr_day_name"] = housing_train["date"].dt.day_name
    housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
    housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
    housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
    housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
    housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
    housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
    housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

    housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 이상치 처리
    housing_train = handle_outliers_zscore(housing_train)

    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_train)
    X = imputer.transform(housing_train)
    housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

    # 스케일링
    filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name'], axis=1)
    scaler = StandardScaler()
    filtered_train_data = scaler.fit_transform(filtered_train_data)
        
    # XGBoost 모델 초기화 (과적합을 유도하기 위한 파라미터 설정)
    xgb_overfit = XGBRegressor(
        objective='reg:squarederror',
        random_state=0,
        n_estimators=1000,          # 트리의 개수 (나무의 수)
        learning_rate=0.01,         # 학습률 (나무의 기여 정도를 조절)
        max_depth=10,               # 트리의 최대 깊이
        min_child_weight=1,         # 리프 노드에 필요한 최소 샘플 수
        gamma=0.0,                  # 나무의 가지치기를 위한 최소 손실 감소 값
        subsample=0.8,              # 각 트리에 사용될 훈련 데이터의 일부 (1.0은 전체 데이터 사용)
        colsample_bytree=0.8        # 각 트리에 사용될 특성의 일부 (1.0은 전체 특성 사용)
    )
    
    # Cross-validation 결과 출력
    xgb_overfit_scores = cross_val_score(xgb_overfit, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    xgb_overfit_rmse_scores = np.sqrt(-xgb_overfit_scores)

    # Display results
    print("=======================================================================================")
    print(f"임계값: {i}")
    # Cross-validation 결과 출력
    display_scores(xgb_overfit_rmse_scores)
    print("=======================================================================================\n")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

for i in np.arange(3, 20, 2):

    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())

    def handle_outliers_zscore(data):
        # 각 특성에 대한 z-스코어 계산
        z_scores = np.abs((data - data.mean()) / data.std())

        # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
        for column in z_scores.columns:
    #             print(z_scores[column])
            outliers = z_scores[column] > i
            median_value = data[column].median()
            data.loc[outliers, column] = median_value

        return data

    # 데이터 불러오기
    input_file = "kc_house_data.csv"
    housing_train = pd.read_csv(input_file)

    # 목표 변수
    train_target = housing_train["price"].copy()

    # 불필요한 열 제거
    housing_train = housing_train.drop(["id", 'price'], axis=1)

    # 특성 공학
    housing_train['house_age'] = 2023 - housing_train['yr_built']
    housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
    housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
    housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
    housing_train['location'] = housing_train['lat'] + housing_train['long']
    housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
    housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
    housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
    housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

    ordinal_encoder = OrdinalEncoder()
    housing_train['date'] = pd.to_datetime(housing_train['date'])
    housing_train["tr_year"] = housing_train["date"].dt.year
    housing_train["tr_month"] = housing_train["date"].dt.month
    housing_train["tr_day_name"] = housing_train["date"].dt.day_name
    housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
    housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
    housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
    housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
    housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
    housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
    housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

    housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 이상치 처리
    housing_train = handle_outliers_zscore(housing_train)

    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_train)
    X = imputer.transform(housing_train)
    housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

    # 스케일링
    filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name'], axis=1)
    scaler = StandardScaler()
    filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)  # DataFrame으로 다시 변환

    # XGBoost 모델 초기화 (과적합을 유도하기 위한 파라미터 설정)
    xgb_overfit = XGBRegressor(
        objective='reg:squarederror',
        random_state=0,
        n_estimators=1000,          # 트리의 개수 (나무의 수)
        learning_rate=0.01,         # 학습률 (나무의 기여 정도를 조절)
        max_depth=10,               # 트리의 최대 깊이
        min_child_weight=1,         # 리프 노드에 필요한 최소 샘플 수
        gamma=0.0,                  # 나무의 가지치기를 위한 최소 손실 감소 값
        subsample=0.8,              # 각 트리에 사용될 훈련 데이터의 일부 (1.0은 전체 데이터 사용)
        colsample_bytree=0.8        # 각 트리에 사용될 특성의 일부 (1.0은 전체 특성 사용)
    )

    # 모델 훈련
    xgb_overfit.fit(filtered_train_data, train_target)

    # 특성 중요도 얻기
    feature_importances = xgb_overfit.feature_importances_

    # 중요도를 기준으로 특성의 순위를 매김
    feature_importance_ranking = pd.Series(feature_importances, index=filtered_train_data.columns).sort_values(ascending=False)

    # 특성 중요도 시각화
    plt.figure(figsize=(12, 6))
    feature_importance_ranking.plot(kind='barh')
    plt.title("Feature Importance Ranking")
    plt.xlabel("Feature Importance")
    plt.ylabel("Features")
    plt.show()

    # Cross-validation 결과 출력
    xgb_overfit_scores = cross_val_score(xgb_overfit, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
    xgb_overfit_rmse_scores = np.sqrt(-xgb_overfit_scores)

    # Display results
    print("=======================================================================================")
    print(f"임계값: {i}")
    # Cross-validation 결과 출력
    display_scores(xgb_overfit_rmse_scores)
    print("=======================================================================================\n")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
#             print(z_scores[column])
        outliers = z_scores[column] > 7
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 목표 변수
train_target = housing_train["price"].copy()

# 불필요한 열 제거
housing_train = housing_train.drop(["id", 'price'], axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 
                                          'date', 'bedrooms', 'tr_year', 'bathrooms', 'floors', 'tr_month', 'sqft_lot', 'sqft_basement',
                                         'condition', 'indoor_to_outdoor_ratio', 'sqft_above', 'sqft_lot15', 'basement_to_living_ratio',
                                         'bedrooms_to_bathrooms_ratio', 'living_to_floors_ratio', 'sqft_living', 'sqft_living15', 'house_age',
                                         'yr_built', 'zipcode'], axis=1)
scaler = StandardScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)  # DataFrame으로 다시 변환

# XGBoost 모델 초기화 (과적합을 유도하기 위한 파라미터 설정)
xgb_overfit = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=1000,          # 트리의 개수 (나무의 수)
    learning_rate=0.01,         # 학습률 (나무의 기여 정도를 조절)
    max_depth=10,               # 트리의 최대 깊이
    min_child_weight=1,         # 리프 노드에 필요한 최소 샘플 수
    gamma=0.0,                  # 나무의 가지치기를 위한 최소 손실 감소 값
    subsample=0.8,              # 각 트리에 사용될 훈련 데이터의 일부 (1.0은 전체 데이터 사용)
    colsample_bytree=0.8        # 각 트리에 사용될 특성의 일부 (1.0은 전체 특성 사용)
)

# 모델 훈련
xgb_overfit.fit(filtered_train_data, train_target)

# 특성 중요도 얻기
feature_importances = xgb_overfit.feature_importances_

# 중요도를 기준으로 특성의 순위를 매김
feature_importance_ranking = pd.Series(feature_importances, index=filtered_train_data.columns).sort_values(ascending=False)

# 특성 중요도 시각화
plt.figure(figsize=(12, 6))
feature_importance_ranking.plot(kind='barh')
plt.title("Feature Importance Ranking")
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.show()

# Cross-validation 결과 출력
xgb_overfit_scores = cross_val_score(xgb_overfit, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_overfit_rmse_scores = np.sqrt(-xgb_overfit_scores)

# Display results
print("=======================================================================================")
print(f"임계값: 7")
# Cross-validation 결과 출력
display_scores(xgb_overfit_rmse_scores)
print("=======================================================================================\n")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
#             print(z_scores[column])
        outliers = z_scores[column] > 7
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 목표 변수
train_target = housing_train["price"].copy()

# 불필요한 열 제거
housing_train = housing_train.drop(["id", 'price'], axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name'], axis=1)
scaler = StandardScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)  # DataFrame으로 다시 변환

# XGBoost 모델 초기화
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=500,           # 트리의 개수 (나무의 수)
    learning_rate=0.1,          # 학습률 (나무의 기여 정도를 조절)
    max_depth=None,             # 트리의 최대 깊이
    min_child_weight=10,        # 리프 노드에 필요한 최소 샘플 수
    gamma=0,                    # 나무의 가지치기를 위한 최소 손실 감소 값
    subsample=0.8,              # 각 트리에 사용될 훈련 데이터의 일부 (1.0은 전체 데이터 사용)
    colsample_bytree=0.5        # 각 트리에 사용될 특성의 일부 (1.0은 전체 특성 사용)
)

# 모델 훈련
xgb_custom_params.fit(filtered_train_data, train_target)

# 특성 중요도 얻기
feature_importances_custom = xgb_custom_params.feature_importances_

# 중요도를 기준으로 특성의 순위를 매김
feature_importance_ranking_custom = pd.Series(feature_importances_custom, index=filtered_train_data.columns).sort_values(ascending=False)

# 특성 중요도 시각화
plt.figure(figsize=(12, 6))
feature_importance_ranking_custom.plot(kind='barh')
plt.title("Custom XGBoost Feature Importance Ranking")
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.show()

# Cross-validation 결과 출력
xgb_overfit_scores = cross_val_score(xgb_overfit, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_overfit_rmse_scores = np.sqrt(-xgb_overfit_scores)

# Display results
print("=======================================================================================")
print(f"임계값: 7")
# Cross-validation 결과 출력
display_scores(xgb_overfit_rmse_scores)
print("=======================================================================================\n")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
#             print(z_scores[column])
        outliers = z_scores[column] > 7
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 목표 변수
train_target = housing_train["price"].copy()

# 불필요한 열 제거
housing_train = housing_train.drop(["id", 'price'], axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation'], axis=1)
scaler = StandardScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)  # DataFrame으로 다시 변환

# XGBoost 모델 초기화
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=500,           # 트리의 개수 (나무의 수)
    learning_rate=0.1,          # 학습률 (나무의 기여 정도를 조절)
    max_depth=None,             # 트리의 최대 깊이
    min_child_weight=10,        # 리프 노드에 필요한 최소 샘플 수
    gamma=0,                    # 나무의 가지치기를 위한 최소 손실 감소 값
    subsample=0.8,              # 각 트리에 사용될 훈련 데이터의 일부 (1.0은 전체 데이터 사용)
    colsample_bytree=0.5        # 각 트리에 사용될 특성의 일부 (1.0은 전체 특성 사용)
)

# 모델 훈련
xgb_custom_params.fit(filtered_train_data, train_target)

# 특성 중요도 얻기
feature_importances_custom = xgb_custom_params.feature_importances_

# 중요도를 기준으로 특성의 순위를 매김
feature_importance_ranking_custom = pd.Series(feature_importances_custom, index=filtered_train_data.columns).sort_values(ascending=False)

# 특성 중요도 시각화
plt.figure(figsize=(12, 6))
feature_importance_ranking_custom.plot(kind='barh')
plt.title("Custom XGBoost Feature Importance Ranking")
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.show()

# Cross-validation 결과 출력
xgb_overfit_scores = cross_val_score(xgb_overfit, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_overfit_rmse_scores = np.sqrt(-xgb_overfit_scores)

# Display results
print("=======================================================================================")
print(f"임계값: 7")
# Cross-validation 결과 출력
display_scores(xgb_overfit_rmse_scores)
print("=======================================================================================\n")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 3
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
print(f"train_traget: {train_target}")

# XGBoost 모델 초기화 (주어진 파라미터 사용)
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=None,
    min_child_weight=10,
    gamma=0,
    colsample_bytree=0.5
)

# Cross-validation 결과 출력
xgb_scores = cross_val_score(xgb_custom_params, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 7
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
print(f"train_traget: {train_target}")

# XGBoost 모델 초기화 (주어진 파라미터 사용)
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=None,
    min_child_weight=10,
    gamma=0,
    colsample_bytree=0.5
)

# Cross-validation 결과 출력
xgb_scores = cross_val_score(xgb_custom_params, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
print(f"train_traget: {train_target}")

# XGBoost 모델 초기화
xgb = XGBRegressor(objective='reg:squarederror', random_state=0)

# 그리드 서치를 위한 하이퍼파라미터 그리드 정의
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
grid_search = GridSearchCV(xgb, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(filtered_train_data, train_target)

# 최적의 모델
best_xgb_model = grid_search.best_estimator_
print(f"best_xgb_model : {best_xgb_model}")

# Cross-validation 결과 출력
xgb_scores = cross_val_score(best_xgb_model, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 목표 변수
train_target = housing_train["price"].copy()
print(f"train_traget: {train_target}")

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# XGBoost 모델 초기화 (주어진 파라미터 사용)
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=None,
    min_child_weight=10,
    gamma=0,
    colsample_bytree=0.5
)

# Cross-validation 결과 출력
xgb_scores = cross_val_score(xgb_custom_params, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
print(f"train_traget: {train_target}")

# XGBoost 모델 초기화
xgb = XGBRegressor(objective='reg:squarederror', random_state=0)

# 그리드 서치를 위한 하이퍼파라미터 그리드 정의
param_grid = {
    'n_estimators': [500],  # 과적합을 유도하기 위해 트리 수를 늘림
    'learning_rate': [0.01],  # 낮은 학습률 사용
    'max_depth': [5],  # 트리의 최대 깊이
    'min_child_weight': [10],  # 리프 노드에 필요한 최소 샘플 수
    'gamma': [0],  # 가지치기를 위한 최소 손실 감소 값
    'colsample_bytree': [0.5]  # 특성 샘플링 비율
}

# GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
grid_search = GridSearchCV(xgb, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(filtered_train_data, train_target)

# 최적의 모델과 파라미터 출력
best_xgb_model = grid_search.best_estimator_
best_parameters = grid_search.best_params_
print(f"Best XGB Model: {best_xgb_model}")
print(f"Best Parameters: {best_parameters}")

# Cross-validation 결과 출력
xgb_scores = cross_val_score(best_xgb_model, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
print(f"train_traget: {train_target}")

# XGBoost 모델 초기화
xgb = XGBRegressor(objective='reg:squarederror', random_state=0)

# 그리드 서치를 위한 하이퍼파라미터 그리드 정의
param_grid = {
    'n_estimators': [500, 600, 700],  # 과적합을 유도하기 위해 트리 수를 늘림
    'learning_rate': [0.01, 0.05, 0.003, 0.005],  # 낮은 학습률 사용
    'max_depth': [None, 5, 10],  # 트리의 최대 깊이
    'min_child_weight': [5, 10, 20],  # 리프 노드에 필요한 최소 샘플 수
    'gamma': [0],  # 가지치기를 위한 최소 손실 감소 값
    'colsample_bytree': [0.5, 0.3, 0.7]  # 특성 샘플링 비율
}

# GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
grid_search = GridSearchCV(xgb, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(filtered_train_data, train_target)

# 최적의 모델과 파라미터 출력
best_xgb_model = grid_search.best_estimator_
best_parameters = grid_search.best_params_
print(f"Best XGB Model: {best_xgb_model}")
print(f"Best Parameters: {best_parameters}")

# Cross-validation 결과 출력
xgb_scores = cross_val_score(best_xgb_model, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
print(f"train_traget: {train_target}")

# XGBoost 모델 초기화 (주어진 파라미터 사용)
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=None,
    min_child_weight=10,
    gamma=0,
    colsample_bytree=0.5
)

xgb_custom_params.fit(filtered_train_data, train_target)

# 특성 중요도 얻기
feature_importances_custom = xgb_custom_params.feature_importances_

# 중요도를 기준으로 특성의 순위를 매김
feature_importance_ranking_custom = pd.Series(feature_importances_custom, index=filtered_train_data.columns).sort_values(ascending=False)

# 특성 중요도 시각화
plt.figure(figsize=(12, 6))
feature_importance_ranking_custom.plot(kind='barh')
plt.title("Custom XGBoost Feature Importance Ranking")
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.show()

# Cross-validation 결과 출력
xgb_scores = cross_val_score(xgb_custom_params, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
print(f"train_traget: {train_target}")

# XGBoost 모델 초기화 (주어진 파라미터 사용)
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=700,
    learning_rate=0.01,
    max_depth=10,
    min_child_weight=10,
    gamma=0,
    colsample_bytree=0.5
)

xgb_custom_params.fit(filtered_train_data, train_target)

# 특성 중요도 얻기
feature_importances_custom = xgb_custom_params.feature_importances_

# 중요도를 기준으로 특성의 순위를 매김
feature_importance_ranking_custom = pd.Series(feature_importances_custom, index=filtered_train_data.columns).sort_values(ascending=False)

# 특성 중요도 시각화
plt.figure(figsize=(12, 6))
feature_importance_ranking_custom.plot(kind='barh')
plt.title("Custom XGBoost Feature Importance Ranking")
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.show()

# Cross-validation 결과 출력
xgb_scores = cross_val_score(xgb_custom_params, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
# print(f"train_traget: {train_target}")

# XGBoost 모델 초기화 (주어진 파라미터 사용)
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=700,
    learning_rate=0.01,
    max_depth=10,
    min_child_weight=10,
    gamma=0,
    colsample_bytree=0.5
)

xgb_custom_params.fit(filtered_train_data, train_target)

# Cross-validation 결과 출력
xgb_scores = cross_val_score(xgb_custom_params, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
# print(f"train_traget: {train_target}")

# XGBoost 모델 초기화
xgb = XGBRegressor(objective='reg:squarederror', random_state=0)

# 그리드 서치를 위한 하이퍼파라미터 그리드 정의
param_grid = {
    'n_estimators': [700, 800, 100, 1500],  # 과적합을 유도하기 위해 트리 수를 늘림
    'learning_rate': [0.0001, 0.01, 0.003, 0.005],  # 낮은 학습률 사용
    'max_depth': [None, 10, 30, 50, 100],  # 트리의 최대 깊이
    'min_child_weight': [1, 5, 10, 20],  # 리프 노드에 필요한 최소 샘플 수
    'gamma': [0],  # 가지치기를 위한 최소 손실 감소 값
    'colsample_bytree': [0.5, 0.3, 0.7, 0.1]  # 특성 샘플링 비율
}

# GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
grid_search = GridSearchCV(xgb, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(filtered_train_data, train_target)

# 최적의 모델과 파라미터 출력
best_xgb_model = grid_search.best_estimator_
best_parameters = grid_search.best_params_
print(f"Best XGB Model: {best_xgb_model}")
print(f"Best Parameters: {best_parameters}")

# Cross-validation 결과 출력
xgb_scores = cross_val_score(best_xgb_model, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def handle_outliers_zscore(data):
    # 각 특성에 대한 z-스코어 계산
    z_scores = np.abs((data - data.mean()) / data.std())

    # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
    for column in z_scores.columns:
        outliers = z_scores[column] > 1.5
        median_value = data[column].median()
        data.loc[outliers, column] = median_value

    return data

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 이상치 처리
housing_train = handle_outliers_zscore(housing_train)

imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 스케일링
filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name', 'price'], axis=1)
scaler = MinMaxScaler()
filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

# 목표 변수
train_target = housing_train["price"].copy()
# print(f"train_traget: {train_target}")

# XGBoost 모델 초기화 (주어진 파라미터 사용)
xgb_custom_params = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    n_estimators=900,
    learning_rate=0.011,
    max_depth=10,
    min_child_weight=10,
    gamma=0,
    colsample_bytree=0.5
)

xgb_custom_params.fit(filtered_train_data, train_target)

# Cross-validation 결과 출력
xgb_scores = cross_val_score(xgb_custom_params, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
xgb_rmse_scores = np.sqrt(-xgb_scores)

# Display results
print("=======================================================================================")
print(f"filtered_train_data: {filtered_train_data.shape}")
# Cross-validation 결과 출력
display_scores(xgb_rmse_scores)
print("=======================================================================================\n")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor

for i in np.arange(2, 20, 1):
    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())

    def handle_outliers_zscore(data):
        # 각 특성에 대한 z-스코어 계산
        z_scores = np.abs((data - data.mean()) / data.std())

        # 각 특성에 대해 z-스코어가 i보다 큰 값을 해당 특성의 중간값으로 대체
        for column in z_scores.columns:
            outliers = z_scores[column] > i
            median_value = data[column].median()
            data.loc[outliers, column] = median_value

        return data

    # 데이터 불러오기
    input_file = "kc_house_data.csv"
    housing_train = pd.read_csv(input_file)

    # 목표 변수
    train_target = housing_train["price"]
#     print(f"train_traget: {train_target}")

    # 불필요한 열 제거
    housing_train = housing_train.drop(["id", "price"], axis=1)

    # 특성 공학
    housing_train['house_age'] = 2023 - housing_train['yr_built']
    housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
    housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
    housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
    housing_train['location'] = housing_train['lat'] + housing_train['long']
    housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
    housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
    housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
    housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

    ordinal_encoder = OrdinalEncoder()
    housing_train['date'] = pd.to_datetime(housing_train['date'])
    housing_train["tr_year"] = housing_train["date"].dt.year
    housing_train["tr_month"] = housing_train["date"].dt.month
    housing_train["tr_day_name"] = housing_train["date"].dt.day_name
    housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
    housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
    housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
    housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
    housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
    housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
    housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

    housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 이상치 처리
    housing_train = handle_outliers_zscore(housing_train)

    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_train)
    X = imputer.transform(housing_train)
    housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

    # 스케일링
    filtered_train_data = housing_train.drop(['waterfront', 'yr_renovated', 'years_since_renovation', 'Totalviews', 'tr_day_name'], axis=1)
    scaler = MinMaxScaler()
    filtered_train_data = pd.DataFrame(scaler.fit_transform(filtered_train_data), columns=filtered_train_data.columns)

    # XGBoost 모델 초기화 (주어진 파라미터 사용)
    xgb_custom_params = XGBRegressor(
        objective='reg:squarederror',
        random_state=0,
        n_estimators=700,
        learning_rate=0.01,
        max_depth=10,
        min_child_weight=10,
        gamma=0,
        colsample_bytree=0.5
    )

    xgb_custom_params.fit(filtered_train_data, train_target)

    # 특성 중요도 얻기
    feature_importances_custom = xgb_custom_params.feature_importances_

    # 중요도를 기준으로 특성의 순위를 매김
    feature_importance_ranking_custom = pd.Series(feature_importances_custom, index=filtered_train_data.columns).sort_values(ascending=False)

    # 특성 중요도 시각화
    plt.figure(figsize=(12, 6))
    feature_importance_ranking_custom.plot(kind='barh')
    plt.title("Custom XGBoost Feature Importance Ranking")
    plt.xlabel("Feature Importance")
    plt.ylabel("Features")
    plt.show()

    # Cross-validation 결과 출력
    xgb_scores = cross_val_score(xgb_custom_params, filtered_train_data, train_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4)
    xgb_rmse_scores = np.sqrt(-xgb_scores)

    # Display results
    print("=======================================================================================")
    print(f"filtered_train_data: {filtered_train_data.shape}")
    print(f"train_target: {train_target}")
    print(f"임계값 i: {i}")
    # Cross-validation 결과 출력
    display_scores(xgb_rmse_scores)
    print("=======================================================================================\n")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)
ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

train_target = housing_train["price"]
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['1'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train = housing_train.drop("date", axis=1)

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.drop("price", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = None, min_data_in_leaf = 1, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 0.1, Task = 1,
                     application = 'regression',num_boost_round = 1000,learning_rate = 0.1, 
                     num_leaves = 1000 ,random_state=0)


lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)
ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

train_target = housing_train["price"]
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['1'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train = housing_train.drop("date", axis=1)

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.drop("price", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = None, min_data_in_leaf = 1, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 0.1, Task = 1,
                     application = 'regression',num_boost_round = 1000,learning_rate = 0.01, 
                     num_leaves = 1000 ,random_state=0)


lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)
ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

train_target = housing_train["price"]
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['1'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train = housing_train.drop("date", axis=1)

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.drop("price", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

# LGBMRegressor 파라미터
lgbm = LGBMRegressor(max_depth = None, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

# ExtraTreesRegressor 파라미터
extratree_params = {
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 1.0,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': False,
    'oob_score': False,
    'n_jobs': -1,
    'random_state': 0,
    'verbose': 0,
    'warm_start': False,
    'ccp_alpha': 0.0,
    'max_samples': None
}

# VotingRegressor 설정
voting_regressor = VotingRegressor(estimators=[
    ('lgbm', lgbm),
    ('extratree', ExtraTreesRegressor(**extratree_params))
])

# VotingRegressor 평가
voting_scores = cross_val_score(voting_regressor, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=2)
voting_rmse_scores = np.sqrt(-voting_scores)

# 결과 출력
print("=======================================================================================")
display_scores(voting_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)
ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

train_target = housing_train["price"]
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['1'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train = housing_train.drop("date", axis=1)

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.drop("price", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

# LGBMRegressor 파라미터
lgbm_params = {'max_depth' : None, 'min_data_in_leaf': 20, 'feature_fraction': 1.0, 'n_estimators': 100,
             'bagging_fraction': 0.1, 'min_gain_to_split': 1, 'Task': 1,
             'application': 'regression','num_boost_round': 790,'learning_rate': 0.1155, 
             'num_leaves': 31 ,'random_state': 0}

# RandomForestRegressor 파라미터
randomforest_params = {'n_estimators': 300, 'criterion': 'squared_error', 'max_depth': None, 'min_samples_split': 2,
                       'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 1.0, 'max_leaf_nodes': None,
                       'min_impurity_decrease': 0.0, 'bootstrap': True, 'oob_score': False, 'n_jobs': -1, 'random_state': 0,
                       'verbose': 0, 'warm_start': False, 'ccp_alpha': 0.0, 'max_samples': None
}

# VotingRegressor 설정
voting_regressor_rf = VotingRegressor(estimators=[
    ('lgbm', LGBMRegressor(**lgbm_params)),
    ('randomforest', RandomForestRegressor(**randomforest_params))
])

# VotingRegressor 평가
voting_scores_rf = cross_val_score(voting_regressor_rf, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
voting_rmse_scores_rf = np.sqrt(-voting_scores_rf)

# 결과 출력
print("=======================================================================================")
display_scores(voting_rmse_scores_rf)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)
ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

train_target = housing_train["price"]
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['1'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train = housing_train.drop("date", axis=1)

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.drop("price", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

# LGBMRegressor 그리드 서치 대상 파라미터
lgbm_param_grid_overfit = {
    'max_depth': [None, 200, 500, 700],
    'num_leaves': [45, 50, 55, 200],
    'learning_rate': [0.1, 0.05, 0.03, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# ExtraTreesRegressor 그리드 서치 대상 파라미터
extratree_param_grid_overfit = {
    'n_estimators': [500, 800, 1000, 1200],
    'max_depth': [None, 50, 100, 200],
    'min_samples_split': [2, 10, 30],
    'min_samples_leaf': [1, 5, 10]
}

# LGBMRegressor 그리드 서치 수행
lgbm_grid_search_overfit = GridSearchCV(LGBMRegressor(), lgbm_param_grid_overfit, cv=10, scoring='neg_mean_squared_error', n_jobs=4, verbose=2)
lgbm_grid_search_overfit.fit(sh_t_data, sh_t_target)

# ExtraTreesRegressor 그리드 서치 수행
extratree_grid_search_overfit = GridSearchCV(ExtraTreesRegressor(), extratree_param_grid_overfit, cv=10, scoring='neg_mean_squared_error', n_jobs=4, verbose=2)
extratree_grid_search_overfit.fit(sh_t_data, sh_t_target)

# LGBMRegressor와 ExtraTreesRegressor의 최적 모델을 가져와 VotingRegressor 생성
best_lgbm = lgbm_grid_search_overfit.best_estimator_
best_extratree = extratree_grid_search_overfit.best_estimator_

voting_regressor = VotingRegressor(estimators=[('lgbm', best_lgbm), ('extratree', best_extratree)])

# VotingRegressor 평가
voting_scores = cross_val_score(voting_regressor, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=4, verbose=2)
voting_rmse_scores = np.sqrt(-voting_scores)

# 결과 출력
print("---------------------------------------------------------------------------------------------------------------")
print("VotingRegressor Best Parameters for Overfitting:")
print("LGBMRegressor:", lgbm_grid_search_overfit.best_params_)
print("ExtraTreesRegressor:", extratree_grid_search_overfit.best_params_)
print("---------------------------------------------------------------------------------------------------------------")

print("=======================================================================================")
display_scores(voting_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)
ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

train_target = housing_train["price"]
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['1'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2

housing_train = housing_train.drop("date", axis=1)

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.drop("price", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = None, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)
ordinal_encoder = OrdinalEncoder() #각 범주를 대응하는 숫자로 변환

train_target = housing_train["price"]
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['1'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
# housing_train['house_age'] = 2023 - housing_train['yr_built']
# housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
# housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']

# housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']

housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Totalviews//'] = housing_train['view'] // housing_train['waterfront']

housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

housing_train = housing_train.drop("date", axis=1)

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy="median") #클래스를 중간값 채우기로 설정
imputer.fit(housing_train)

X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns = housing_train.columns, index=housing_train.index)

filtered_train_data = housing_train
filtered_train_data = filtered_train_data.drop("price", axis=1)

scaler = MinMaxScaler()
scalers = scaler.fit_transform(filtered_train_data)
filtered_train_data = pd.DataFrame(scalers, columns=filtered_train_data.columns)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = None, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155, 
                     num_leaves = 31 ,random_state=0)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

# 데이터 불러오기
input_file = "kc_house_data.csv"
housing_train = pd.read_csv(input_file)

# 불필요한 열 제거
train_target = housing_train["price"]
housing_train = housing_train.drop("id", axis=1)

# 특성 공학
housing_train['1'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['house_age'] = 2023 - housing_train['yr_built']
housing_train['years_since_renovation'] = 2023 - housing_train['yr_renovated']
housing_train['indoor_to_outdoor_ratio'] = housing_train['sqft_living'] / housing_train['sqft_lot']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['bedrooms_to_bathrooms_ratio'] = housing_train['bedrooms'] / housing_train['bathrooms']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']

# Ordinal Encoding
ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']

housing_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# 결측치 처리
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_train)
X = imputer.transform(housing_train)
housing_train = pd.DataFrame(X, columns=housing_train.columns, index=housing_train.index)

# 데이터 스케일링
scaler = MinMaxScaler()
scalers = scaler.fit_transform(housing_train.drop("price", axis=1))
filtered_train_data = pd.DataFrame(scalers, columns=housing_train.columns[1:])

# 데이터 섞기
seed = 21345
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

# LightGBM 모델 초기화
lgbm = LGBMRegressor(
    max_depth=None, min_data_in_leaf=20, feature_fraction=1.0, n_estimators=100,
    bagging_fraction=0.1, min_gain_to_split=1, Task=1,
    application='regression', num_boost_round=790, learning_rate=0.1155,
    num_leaves=31, random_state=0
)

# 교차 검증 수행
lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

# 결과 출력
print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor

housing_train = pd.read_csv('kc_house_data.csv')
tt = pd.read_csv('kc_house_data.csv')

housing_train = housing_train.drop("id", axis=1)
train_target = housing_train["price"].copy()

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

scaler = StandardScaler()
scaler.fit_transform(housing_train)

housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Totalviews//'] = housing_train['view'] // housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['(sqft_living//waterfront/sqft_living)'] = ((housing_train['sqft_living']))//(housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train['((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = housing_train['(sqft_living//waterfront/sqft_living)']//housing_train['(waterfront//waterfront/sqft_living)']
housing_train['(waterfront/waterfront/sqft_living)'] = (housing_train['waterfront']/housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(sqft_living**2)//(waterfront/waterfront/sqft_living)'] = (housing_train['sqft_living']**2)//(housing_train['(waterfront/waterfront/sqft_living)'])
housing_train['(waterfront//(waterfront/sqft_living))'] = housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = ((housing_train['sqft_living']) // ((housing_train['(sqft_living//waterfront/sqft_living)'])) // (housing_train['(waterfront//waterfront/sqft_living)']))
housing_train['(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))'] = housing_train['(waterfront//waterfront/sqft_living)'] / housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))']
housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'] = ((housing_train['sqft_living']**2)//(housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])))
housing_train['(((waterfront//waterfront/sqft_living))/((((waterfront//waterfront/sqft_living))//(((sqft_living**2)//(waterfront//(waterfront/sqft_living)))))))']= (((housing_train['(waterfront//waterfront/sqft_living)'])/((((housing_train['(waterfront//waterfront/sqft_living)']))//(housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'])))))
housing_train['((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = housing_train['(sqft_living//waterfront/sqft_living)']//housing_train['(waterfront//waterfront/sqft_living)']
housing_train['(waterfront//(waterfront/sqft_living))'] = housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])
filtered_train_data = housing_train.drop(['(waterfront//waterfront/sqft_living)', '(waterfront/waterfront/sqft_living)', '(sqft_living**2)//(waterfront/waterfront/sqft_living)', '(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', '(sqft_living//waterfront/sqft_living)'], axis=1)
filtered_train_data = filtered_train_data.drop(['price', 'tr_year'], axis=1)
filtered_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

seed = 21345
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth=None, min_data_in_leaf=20, feature_fraction=1.0, n_estimators=100,
                     bagging_fraction=0.1, min_gain_to_split=1, Task=1,
                     application='regression', num_boost_round=790, learning_rate=0.1155,
                     num_leaves=31, random_state=0)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

housing_train = pd.read_csv('kc_house_data.csv')
housing_train = housing_train.drop("id", axis=1)
train_target = housing_train["price"].copy()

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

scaler = StandardScaler()
scaler.fit_transform(housing_train)

housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Totalviews//'] = housing_train['view'] // housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['(sqft_living//waterfront/sqft_living)'] = ((housing_train['sqft_living']))//(housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train["sqft_living//waterfront"] = housing_train["sqft_living"] // housing_train["waterfront"]
housing_train["waterfront/sqft_living"] = housing_train["waterfront"] // housing_train["sqft_living"]
housing_train['3/(waterfront/sqft_living)'] = 3/housing_train["waterfront/sqft_living"]
housing_train['2/(waterfront/sqft_living)'] = 2/housing_train["waterfront/sqft_living"]
housing_train['1/(waterfront/sqft_living)'] = 1/housing_train["waterfront/sqft_living"]
housing_train["sqft_living*waterfront"] = housing_train["sqft_living"] * housing_train["waterfront"]
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train['((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = housing_train['(sqft_living//waterfront/sqft_living)']//housing_train['(waterfront//waterfront/sqft_living)']
housing_train['(waterfront/waterfront/sqft_living)'] = (housing_train['waterfront']/housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(sqft_living**2)//(waterfront/waterfront/sqft_living)'] = (housing_train['sqft_living']**2)//(housing_train['(waterfront/waterfront/sqft_living)'])
housing_train['(waterfront//(waterfront/sqft_living))'] = housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])#(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living)) ## 89.7
housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = ((housing_train['sqft_living']) // ((housing_train['(sqft_living//waterfront/sqft_living)'])) // (housing_train['(waterfront//waterfront/sqft_living)']))
housing_train['(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))'] = housing_train['(waterfront//waterfront/sqft_living)'] / housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))']
housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'] = ((housing_train['sqft_living']**2)//(housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])))
housing_train['(((waterfront//waterfront/sqft_living))/((((waterfront//waterfront/sqft_living))//(((sqft_living**2)//(waterfront//(waterfront/sqft_living)))))))']= (((housing_train['(waterfront//waterfront/sqft_living)'])/((((housing_train['(waterfront//waterfront/sqft_living)']))//(housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'])))))

housing_train = housing_train.drop('(waterfront//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living**2)//(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))', axis=1)
housing_train = housing_train.drop('(sqft_living//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//(waterfront/sqft_living))', axis=1)

filtered_train_data = housing_train.drop('price', axis=1)
filtered_train_data = filtered_train_data.drop('tr_year', axis=1)

filtered_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# # k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())


# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

lgbm = LGBMRegressor(max_depth = None, min_data_in_leaf = 20, feature_fraction = 1.0, n_estimators=100,
                     bagging_fraction = 0.1, min_gain_to_split = 1, Task = 1,
                     application = 'regression',num_boost_round = 790,learning_rate = 0.1155,
                     num_leaves = 31 ,random_state=0)

lgbm_scores = cross_val_score(lgbm, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv = 10, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-lgbm_scores)

print("=======================================================================================")
display_scores(lgbm_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor

housing_train = pd.read_csv('kc_house_data.csv')
housing_train = housing_train.drop("id", axis=1)
train_target = housing_train["price"].copy()

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

scaler = StandardScaler()
scaler.fit_transform(housing_train)

housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Totalviews//'] = housing_train['view'] // housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['(sqft_living//waterfront/sqft_living)'] = ((housing_train['sqft_living']))//(housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train["sqft_living//waterfront"] = housing_train["sqft_living"] // housing_train["waterfront"]
housing_train["waterfront/sqft_living"] = housing_train["waterfront"] // housing_train["sqft_living"]
housing_train['3/(waterfront/sqft_living)'] = 3/housing_train["waterfront/sqft_living"]
housing_train['2/(waterfront/sqft_living)'] = 2/housing_train["waterfront/sqft_living"]
housing_train['1/(waterfront/sqft_living)'] = 1/housing_train["waterfront/sqft_living"]
housing_train["sqft_living*waterfront"] = housing_train["sqft_living"] * housing_train["waterfront"]
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train['((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = housing_train['(sqft_living//waterfront/sqft_living)']//housing_train['(waterfront//waterfront/sqft_living)']
housing_train['(waterfront/waterfront/sqft_living)'] = (housing_train['waterfront']/housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(sqft_living**2)//(waterfront/waterfront/sqft_living)'] = (housing_train['sqft_living']**2)//(housing_train['(waterfront/waterfront/sqft_living)'])
housing_train['(waterfront//(waterfront/sqft_living))'] = housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])#(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living)) ## 89.7
housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = ((housing_train['sqft_living']) // ((housing_train['(sqft_living//waterfront/sqft_living)'])) // (housing_train['(waterfront//waterfront/sqft_living)']))
housing_train['(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))'] = housing_train['(waterfront//waterfront/sqft_living)'] / housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))']
housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'] = ((housing_train['sqft_living']**2)//(housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])))
housing_train['(((waterfront//waterfront/sqft_living))/((((waterfront//waterfront/sqft_living))//(((sqft_living**2)//(waterfront//(waterfront/sqft_living)))))))']= (((housing_train['(waterfront//waterfront/sqft_living)'])/((((housing_train['(waterfront//waterfront/sqft_living)']))//(housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'])))))

housing_train = housing_train.drop('(waterfront//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living**2)//(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))', axis=1)
housing_train = housing_train.drop('(sqft_living//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//(waterfront/sqft_living))', axis=1)

filtered_train_data = housing_train.drop('price', axis=1)
filtered_train_data = filtered_train_data.drop('tr_year', axis=1)

filtered_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# # k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21345

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

catboost = CatBoostRegressor(depth=7, learning_rate=0.1, n_estimators=1100, verbose=0)

catboost_scores = cross_val_score(catboost, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
catboost_rmse_scores = np.sqrt(-catboost_scores)

print("=======================================================================================")
display_scores(catboost_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor

housing_train = pd.read_csv('kc_house_data.csv')
housing_train = housing_train.drop("id", axis=1)
train_target = housing_train["price"].copy()

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

scaler = StandardScaler()
scaler.fit_transform(housing_train)

housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Totalviews//'] = housing_train['view'] // housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['(sqft_living//waterfront/sqft_living)'] = ((housing_train['sqft_living']))//(housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train["sqft_living//waterfront"] = housing_train["sqft_living"] // housing_train["waterfront"]
housing_train["waterfront/sqft_living"] = housing_train["waterfront"] // housing_train["sqft_living"]
housing_train['3/(waterfront/sqft_living)'] = 3/housing_train["waterfront/sqft_living"]
housing_train['2/(waterfront/sqft_living)'] = 2/housing_train["waterfront/sqft_living"]
housing_train['1/(waterfront/sqft_living)'] = 1/housing_train["waterfront/sqft_living"]
housing_train["sqft_living*waterfront"] = housing_train["sqft_living"] * housing_train["waterfront"]
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train['((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = housing_train['(sqft_living//waterfront/sqft_living)']//housing_train['(waterfront//waterfront/sqft_living)']
housing_train['(waterfront/waterfront/sqft_living)'] = (housing_train['waterfront']/housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(sqft_living**2)//(waterfront/waterfront/sqft_living)'] = (housing_train['sqft_living']**2)//(housing_train['(waterfront/waterfront/sqft_living)'])
housing_train['(waterfront//(waterfront/sqft_living))'] = housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])#(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living)) ## 89.7
housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = ((housing_train['sqft_living']) // ((housing_train['(sqft_living//waterfront/sqft_living)'])) // (housing_train['(waterfront//waterfront/sqft_living)']))
housing_train['(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))'] = housing_train['(waterfront//waterfront/sqft_living)'] / housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))']
housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'] = ((housing_train['sqft_living']**2)//(housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])))
housing_train['(((waterfront//waterfront/sqft_living))/((((waterfront//waterfront/sqft_living))//(((sqft_living**2)//(waterfront//(waterfront/sqft_living)))))))']= (((housing_train['(waterfront//waterfront/sqft_living)'])/((((housing_train['(waterfront//waterfront/sqft_living)']))//(housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'])))))

housing_train = housing_train.drop('(waterfront//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living**2)//(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))', axis=1)
housing_train = housing_train.drop('(sqft_living//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//(waterfront/sqft_living))', axis=1)

filtered_train_data = housing_train.drop('price', axis=1)
filtered_train_data = filtered_train_data.drop('tr_year', axis=1)

filtered_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# # k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21378
# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

# CatBoostRegressor 초기화
catboost = CatBoostRegressor(depth=7, learning_rate=0.1, verbose=0)

# 교차 검증 수행
best_model_scores = cross_val_score(catboost, sh_t_data, sh_t_target, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
best_model_rmse_scores = np.sqrt(-best_model_scores)

# 결과 출력
print("=======================================================================================")
# print(f"{i}")
display_scores(best_model_rmse_scores)
print("=======================================================================================")


import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor

housing_train = pd.read_csv('kc_house_data.csv')
housing_train = housing_train.drop("id", axis=1)
train_target = housing_train["price"].copy()

ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

scaler = StandardScaler()
scaler.fit_transform(housing_train)

housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Totalviews//'] = housing_train['view'] // housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['(sqft_living//waterfront/sqft_living)'] = ((housing_train['sqft_living']))//(housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train["sqft_living//waterfront"] = housing_train["sqft_living"] // housing_train["waterfront"]
housing_train["waterfront/sqft_living"] = housing_train["waterfront"] // housing_train["sqft_living"]
housing_train['3/(waterfront/sqft_living)'] = 3/housing_train["waterfront/sqft_living"]
housing_train['2/(waterfront/sqft_living)'] = 2/housing_train["waterfront/sqft_living"]
housing_train['1/(waterfront/sqft_living)'] = 1/housing_train["waterfront/sqft_living"]
housing_train["sqft_living*waterfront"] = housing_train["sqft_living"] * housing_train["waterfront"]
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train['((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = housing_train['(sqft_living//waterfront/sqft_living)']//housing_train['(waterfront//waterfront/sqft_living)']
housing_train['(waterfront/waterfront/sqft_living)'] = (housing_train['waterfront']/housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(sqft_living**2)//(waterfront/waterfront/sqft_living)'] = (housing_train['sqft_living']**2)//(housing_train['(waterfront/waterfront/sqft_living)'])
housing_train['(waterfront//(waterfront/sqft_living))'] = housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])#(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living)) ## 89.7
housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = ((housing_train['sqft_living']) // ((housing_train['(sqft_living//waterfront/sqft_living)'])) // (housing_train['(waterfront//waterfront/sqft_living)']))
housing_train['(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))'] = housing_train['(waterfront//waterfront/sqft_living)'] / housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))']
housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'] = ((housing_train['sqft_living']**2)//(housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])))
housing_train['(((waterfront//waterfront/sqft_living))/((((waterfront//waterfront/sqft_living))//(((sqft_living**2)//(waterfront//(waterfront/sqft_living)))))))']= (((housing_train['(waterfront//waterfront/sqft_living)'])/((((housing_train['(waterfront//waterfront/sqft_living)']))//(housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'])))))

housing_train = housing_train.drop('(waterfront//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living**2)//(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))', axis=1)
housing_train = housing_train.drop('(sqft_living//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//(waterfront/sqft_living))', axis=1)

filtered_train_data = housing_train.drop('price', axis=1)
filtered_train_data = filtered_train_data.drop('tr_year', axis=1)

filtered_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# # k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21378

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

catboost = CatBoostRegressor(depth=7, learning_rate=0.1, n_estimators=1100, verbose=0)

catboost_scores = cross_val_score(catboost, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
catboost_rmse_scores = np.sqrt(-catboost_scores)

print("=======================================================================================")
display_scores(catboost_rmse_scores)
print("=======================================================================================")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor

# 데이터 불러오기
housing_train = pd.read_csv('kc_house_data.csv')
housing_train = housing_train.drop("id", axis=1)
train_target = housing_train["price"].copy()

# OrdinalEncoder를 사용하여 범주형 변수를 숫자로 변환
ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

# StandardScaler를 사용하여 수치형 변수를 표준화
scaler = StandardScaler()
scaler.fit_transform(housing_train)

# 특성 엔지니어링
housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Totalviews//'] = housing_train['view'] // housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['(sqft_living//waterfront/sqft_living)'] = ((housing_train['sqft_living']))//(housing_train['waterfront']/housing_train['sqft_living'])
housing_train["sqft_living*waterfront"] = housing_train["sqft_living"] * housing_train["waterfront"]
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train['((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = housing_train['(sqft_living//waterfront/sqft_living)']//housing_train['(waterfront//waterfront/sqft_living)']
housing_train['(waterfront/waterfront/sqft_living)'] = (housing_train['waterfront']/housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(sqft_living**2)//(waterfront/waterfront/sqft_living)'] = (housing_train['sqft_living']**2)//(housing_train['(waterfront/waterfront/sqft_living)'])
housing_train['(waterfront//(waterfront/sqft_living))'] = housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])  # (sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))
housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = ((housing_train['sqft_living']) // ((housing_train['(sqft_living//waterfront/sqft_living)'])) // (housing_train['(waterfront//waterfront/sqft_living)']))
housing_train['(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))'] = housing_train['(waterfront//waterfront/sqft_living)'] / housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))']
housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'] = ((housing_train['sqft_living']**2)//(housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])))
housing_train['(((waterfront//waterfront/sqft_living))/((((waterfront//waterfront/sqft_living))//(((sqft_living**2)//(waterfront//(waterfront/sqft_living)))))))']= (((housing_train['(waterfront//waterfront/sqft_living)'])/((((housing_train['(waterfront//waterfront/sqft_living)']))//(housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'])))))

# 불필요한 열 삭제
housing_train = housing_train.drop('(waterfront//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living**2)//(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))', axis=1)
housing_train = housing_train.drop('(sqft_living//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//(waterfront/sqft_living))', axis=1)

# 목표 변수를 제외한 특성 데이터 생성
filtered_train_data = housing_train.drop('price', axis=1)
filtered_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21366

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

# CatBoost 모델 초기화
catboost = CatBoostRegressor(depth=7, learning_rate=0.1, n_estimators=1100, verbose=0)

# 교차 검증 수행
catboost_scores = cross_val_score(catboost, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
catboost_rmse_scores = np.sqrt(-catboost_scores)

print("=======================================================================================")
display_scores(catboost_rmse_scores)
print("=======================================================================================")


#

# 결과

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor

# 데이터 불러오기
housing_train = pd.read_csv('kc_house_data.csv')
housing_train = housing_train.drop("id", axis=1)
train_target = housing_train["price"].copy()

# OrdinalEncoder를 사용하여 범주형 변수를 숫자로 변환
ordinal_encoder = OrdinalEncoder()
housing_train['date'] = pd.to_datetime(housing_train['date'])
housing_train["tr_year"] = housing_train["date"].dt.year
housing_train["tr_month"] = housing_train["date"].dt.month
housing_train["tr_day_name"] = housing_train["date"].dt.day_name
housing_train["date"] = housing_train["date"].dt.strftime('%Y-%m')
housing_train["tr_year"] = ordinal_encoder.fit_transform(housing_train[["tr_year"]])
housing_train["tr_month"] = ordinal_encoder.fit_transform(housing_train[["tr_month"]])
housing_train["tr_day_name"] = ordinal_encoder.fit_transform(housing_train[["tr_day_name"]])
housing_train["date"] = ordinal_encoder.fit_transform(housing_train[["date"]])

# StandardScaler를 사용하여 수치형 변수를 표준화
scaler = StandardScaler()
scaler.fit_transform(housing_train)

# 특성 엔지니어링
housing_train['sqft_living+bathrooms'] = housing_train['sqft_living'] + housing_train['bathrooms']
housing_train['basement_to_living_ratio'] = housing_train['sqft_basement'] / housing_train['sqft_living']
housing_train['location'] = housing_train['lat'] + housing_train['long']
housing_train['living_to_floors_ratio'] = housing_train['sqft_living'] / housing_train['floors']
housing_train['Totalviews'] = housing_train['view'] + housing_train['waterfront']
housing_train['Totalviews//'] = housing_train['view'] // housing_train['waterfront']
housing_train['Toalsizes'] = housing_train['sqft_living'] + housing_train['sqft_above'] + housing_train['sqft_basement'] + housing_train['sqft_living15']
housing_train["house_per_sqft_livings"] = 1 / (housing_train["sqft_living"] + housing_train["sqft_living15"])
housing_train['(sqft_living*grade)/2'] = (housing_train['sqft_living'] * housing_train['grade']) / 2
housing_train['(sqft_living//waterfront/sqft_living)'] = ((housing_train['sqft_living']))//(housing_train['waterfront']/housing_train['sqft_living'])
housing_train["sqft_living*waterfront"] = housing_train["sqft_living"] * housing_train["waterfront"]
housing_train['(waterfront//waterfront/sqft_living)'] = (housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living']))
housing_train['((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = housing_train['(sqft_living//waterfront/sqft_living)']//housing_train['(waterfront//waterfront/sqft_living)']
housing_train['(waterfront/waterfront/sqft_living)'] = (housing_train['waterfront']/housing_train['waterfront']/housing_train['sqft_living'])
housing_train['(sqft_living**2)//(waterfront/waterfront/sqft_living)'] = (housing_train['sqft_living']**2)//(housing_train['(waterfront/waterfront/sqft_living)'])
housing_train['(waterfront//(waterfront/sqft_living))'] = housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])  # (sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))
housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))'] = ((housing_train['sqft_living']) // ((housing_train['(sqft_living//waterfront/sqft_living)'])) // (housing_train['(waterfront//waterfront/sqft_living)']))
housing_train['(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))'] = housing_train['(waterfront//waterfront/sqft_living)'] / housing_train['(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))']
housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'] = ((housing_train['sqft_living']**2)//(housing_train['waterfront']//(housing_train['waterfront']/housing_train['sqft_living'])))
housing_train['(((waterfront//waterfront/sqft_living))/((((waterfront//waterfront/sqft_living))//(((sqft_living**2)//(waterfront//(waterfront/sqft_living)))))))']= (((housing_train['(waterfront//waterfront/sqft_living)'])/((((housing_train['(waterfront//waterfront/sqft_living)']))//(housing_train['((sqft_living**2)//(waterfront//(waterfront/sqft_living)))'])))))

# 불필요한 열 삭제
housing_train = housing_train.drop('(waterfront//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living**2)//(waterfront/waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//waterfront/sqft_living/(sqft_living//((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))))', axis=1)
housing_train = housing_train.drop('(sqft_living//waterfront/sqft_living)', axis=1)
housing_train = housing_train.drop('((sqft_living//waterfront/sqft_living)//(waterfront//waterfront/sqft_living))', axis=1)
housing_train = housing_train.drop('(waterfront//(waterfront/sqft_living))', axis=1)

# 목표 변수를 제외한 특성 데이터 생성
filtered_train_data = housing_train.drop('price', axis=1)
filtered_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# k-폴드 CV의 평균을 계산하기 위한 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# 기본 시드 설정
seed = 21366

# 데이터 섞기
np.random.seed(seed)
sh_in = np.random.permutation(len(filtered_train_data))
sh_t_data = filtered_train_data.values[sh_in]
sh_t_target = train_target.values[sh_in]

# CatBoost 모델 초기화
catboost = CatBoostRegressor(depth=7, learning_rate=0.1, n_estimators=2000, verbose=0)

# 교차 검증 수행
catboost_scores = cross_val_score(catboost, sh_t_data, sh_t_target, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
catboost_rmse_scores = np.sqrt(-catboost_scores)

print("=======================================================================================")
display_scores(catboost_rmse_scores)
print("=======================================================================================")


Scores: [122348.05952307  94499.35636369 117007.53506535  83628.8717908
 105830.95531101 121477.34595344 105426.34835928  96323.97220175
 106788.33147607 111901.67908371]
Mean: 106523.24551281861
Standard deviation: 11778.802518379614
