In [None]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into exam-style train/test feature and label frames.

    Args:
        df: Source DataFrame holding the features and the ``target`` column.
            The caller's frame is never modified.
        target: Name of the label column.
        id_name: Name of the identifier column. When empty, the index is
            reset and exposed as a new ``"id"`` column that is used as the id.
        null_name: Placeholder value that marks missing data. When non-empty,
            every cell equal to it is replaced with ``np.nan``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` produced by an 80/20
        ``train_test_split`` with ``random_state=2021``. The X frames contain
        every column except ``target``; the y frames contain the id column
        and ``target``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'

    if null_name != "":
        # Replace placeholders on a private copy: assigning through the
        # caller's frame would silently rewrite their data as a side effect.
        df = df.copy()
        df[df == null_name] = np.nan

    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test
    
# Read the House Prices training CSV and build the exam-style partitions.
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, id_name='Id', target='SalePrice')

# Displayed result: the (rows, columns) shape of each partition.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Data Load & Simple EDA

In [None]:
import pandas as pd

In [None]:
# Show the (rows, columns) shape of the train and test feature frames.
X_train.shape, X_test.shape

In [None]:
# Raise pandas' displayed-column limit to 100, then preview the first
# three rows of both feature frames.
pd.options.display.max_columns = 100
display(X_train.head(3), X_test.head(3))

In [None]:
# Histogram of the training target. Title and axis labels let the figure
# stand alone; the trailing semicolon hides the return-value repr.
ax = y_train['SalePrice'].hist()
ax.set(title="SalePrice distribution (train)", xlabel="SalePrice", ylabel="Count");

In [None]:
# Histogram of the held-out target. Title and axis labels let the figure
# stand alone; the trailing semicolon hides the return-value repr.
ax = y_test['SalePrice'].hist()
ax.set(title="SalePrice distribution (test)", xlabel="SalePrice", ylabel="Count");

In [None]:
# The 20 training columns with the most missing values, largest count first.
X_train.isna().sum().sort_values(ascending=False).head(20)

In [None]:
# The 20 test columns with the most missing values, largest count first.
X_test.isna().sum().sort_values(ascending=False).head(20)

In [None]:
# Print pandas' per-column summary (non-null counts and dtypes) for the training features.
X_train.info()

# Preprocessing

In [None]:
# The regression target is the SalePrice column of the training labels.
target = y_train['SalePrice']

# Drop every object-dtype column from both feature frames.
X_train, X_test = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_test)
)

In [None]:
# Set the Id column aside: pop() removes it from the feature frame in place
# and returns it, so the ids stay available for the submission file.
# NOTE(review): because pop() mutates the frame, re-running this cell alone
# will fail once 'Id' has already been removed.
X_train_id = X_train.pop('Id')
X_test_id = X_test.pop('Id')

In [None]:
# Show the first row of the training features after the column filtering above.
X_train.head(1)

In [None]:
from sklearn.impute import SimpleImputer

# Learn the fill values from the training features only, then apply the
# same fitted imputer to both splits (SimpleImputer defaults).
imp = SimpleImputer()
imp.fit(X_train)
X_train = imp.transform(X_train)
X_test = imp.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the training rows for validation, with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    target,
    test_size=0.15,
    random_state=2022,
)

# Displayed result: the shape of each of the four pieces.
tuple(piece.shape for piece in (X_tr, X_val, y_tr, y_val))

# Model

In [None]:
from xgboost import XGBRegressor

# Baseline: an XGBRegressor built with no arguments (library defaults),
# fitted on the training part and used to predict the validation part.
model = XGBRegressor()
model.fit(X_tr, y_tr, verbose=False)
pred = model.predict(X_val)

In [None]:
from sklearn.metrics import mean_squared_error

def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y)) ** 2))``. The previous
    implementation returned the plain RMSE, which did not match the
    function's name or the "RMSLE" label printed next to its result.

    Args:
        y: Array-like of true, non-negative target values.
        y_pred: Array-like of predictions with the same length as ``y``.
            Negative predictions are clamped to 0 so the logarithm is defined.

    Returns:
        The RMSLE as a float.

    Raises:
        ValueError: If ``y`` contains negative values.
    """
    # np.asarray drops any pandas index so values are compared positionally.
    y_true = np.asarray(y, dtype=float)
    y_hat = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    if (y_true < 0).any():
        raise ValueError("rmsle requires non-negative target values")
    return np.sqrt(np.mean((np.log1p(y_hat) - np.log1p(y_true)) ** 2))

# Report the rmsle() score of the validation predictions.
print(f"RMSLE : {rmsle(y_val, pred)}")

# Simple Preprocessing

In [None]:
# Start again from untouched partitions so this experiment is self-contained.
X_train, X_test, y_train, y_test = exam_data_load(df, id_name='Id', target='SalePrice')

# Flag training rows whose price is below the 0.5% quantile or above the
# 99.5% quantile of the training target, and keep everything else.
price = y_train['SalePrice']
idx1 = price < price.quantile(0.005)
idx2 = price > price.quantile(0.995)
inlier_mask = ~(idx1 | idx2)

y_train = y_train[inlier_mask]
X_train = X_train[inlier_mask]

# Same feature preparation as the baseline: drop object columns, set the
# Id column aside, and impute with an imputer fitted on the training rows.
target = y_train['SalePrice']
X_train = X_train.select_dtypes(exclude=['object'])
X_test = X_test.select_dtypes(exclude=['object'])
X_train_id = X_train.pop('Id')
X_test_id = X_test.pop('Id')

imp = SimpleImputer()
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

# NOTE(review): the seed here is 20222 while the baseline split uses 2022 --
# confirm whether the difference is intentional.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    target,
    test_size=0.15,
    random_state=20222,
)

model = XGBRegressor()
model.fit(X_tr, y_tr)
pred = model.predict(X_val)

print(f"RMSLE : {rmsle(y_val, pred)}")

## Simple Tuning

In [None]:
# Start again from untouched partitions so this experiment is self-contained.
X_train, X_test, y_train, y_test = exam_data_load(df, id_name='Id', target='SalePrice')

# Flag training rows whose price is below the 0.5% quantile or above the
# 99.5% quantile of the training target, and keep everything else.
price = y_train['SalePrice']
idx1 = price < price.quantile(0.005)
idx2 = price > price.quantile(0.995)
inlier_mask = ~(idx1 | idx2)

y_train = y_train[inlier_mask]
X_train = X_train[inlier_mask]

# Drop object columns, set the Id column aside, and impute with an imputer
# fitted on the training rows.
target = y_train['SalePrice']
X_train = X_train.select_dtypes(exclude=['object'])
X_test = X_test.select_dtypes(exclude=['object'])
X_train_id = X_train.pop('Id')
X_test_id = X_test.pop('Id')

imp = SimpleImputer()
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

# NOTE(review): the seed here is 20222 while the baseline split uses 2022 --
# confirm whether the difference is intentional.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    target,
    test_size=0.15,
    random_state=20222,
)

# Hand-picked hyperparameters for this run.
model = XGBRegressor(
    n_estimators=100,
    max_depth=4,
    colsample_bytree=0.9,
)
model.fit(X_tr, y_tr)
pred = model.predict(X_val)

print(f"RMSLE : {rmsle(y_val, pred)}")

# Predict & to CSV

In [None]:
# Predict the exam test rows with the most recently fitted model and pair
# each prediction with the Id that was set aside earlier.
pred = model.predict(X_test)
output = X_test_id.to_frame(name='Id').assign(SalePrice=pred)
output.head(5)

In [None]:
# Write the submission CSV (index column omitted), then preview the first three rows.
output.to_csv("000000.csv", index=False)
output.head(3)

# 결과 채점

In [None]:
# Final check: evaluate rmsle() on the test predictions against the held-back targets.
pred = model.predict(X_test)
score = rmsle(y_test['SalePrice'], pred)
print(f"RMSLE : {score}")