The competition is [here](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview)

In [163]:
import numpy as np
import pandas as pd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Load Datasets

In [164]:
train = pd.read_csv('./datasets/boston_2/train_processed.csv')
test = pd.read_csv('./datasets/boston_2/test_processed.csv')

train.head(2)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.170838,-0.139979,7,-0.477908,2003,1.23371,0.802294,-0.359662,-0.886362,-0.134746,...,False,False,False,False,True,False,False,False,True,False
1,0.525501,0.108672,6,2.010085,1976,-0.817134,1.01776,-0.359662,-0.433331,-0.134746,...,False,False,False,False,True,False,False,False,True,False


In [165]:
print('Shape : ', train.shape, test.shape)
print('Nulls : ', train.isnull().sum().sum(), test.isnull().sum().sum())

Shape :  (1458, 239) (1459, 239)
Nulls :  0 0


# Dataset Prep

In [166]:
testId = test['Id']
y_train = train['SalePrice']

X_test = test.drop(columns=['Id'])
X_train = train.drop('SalePrice', axis=1)

print('New Shape : ', X_train.shape, y_train.shape, X_test.shape)

New Shape :  (1458, 238) (1458,) (1459, 238)


# Training

In [167]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [168]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [169]:
import xgboost as xgb

model = xgb.XGBRegressor(learning_rate=0.01,
                       n_estimators=11000,
                       max_depth=36,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006)

# model = xgb.XGBRegressor(n_estimators = 1500, learning_rate = 0.1)
model.fit(X_train, y_train)

In [170]:
y_pred = model.predict(X_val)

In [171]:
print(f"Accuracy                : {model.score(X_train, y_train)}")
print(f"Root Mean Squared Error : {np.sqrt(mean_squared_error(y_val, y_pred))}")
print(f"Mean Squared Error      : {mean_squared_error(y_val, y_pred)}")
print(f"Mean Absolute Error     : {mean_absolute_error(y_val, y_pred)}")

Accuracy                : 0.9118861298682763
Root Mean Squared Error : 0.14862863665378662
Mean Squared Error      : 0.022090471633563324
Mean Absolute Error     : 0.09831577481795961


# Predict

In [172]:
y_pred = model.predict(X_test)
y_pred = np.expm1(y_pred)

# Submission

In [173]:
submission = pd.DataFrame({
        "Id": testId,
        "SalePrice": y_pred
    })

submission.to_csv('./datasets/boston_2/submission.csv', index=False)

In [174]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,119974.476562
1,1462,159363.6875
2,1463,165427.359375
3,1464,164452.421875
4,1465,180123.265625
