# Simple notebook that predicts housing prices using a XGBoost regression

### Import packages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

### Get data

In [2]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [3]:
y = df_train['SalePrice']
x = df_train.drop(['SalePrice', 'Id'], axis=1).select_dtypes(exclude=['object'])

### Transform data

In [4]:
train_x, test_x, train_y, test_y = train_test_split(x.to_numpy(), y.to_numpy(), test_size=0.25)

In [5]:
my_imputer = SimpleImputer()
train_x = my_imputer.fit_transform(train_x)
test_x = my_imputer.transform(test_x)

### Fit model

In [6]:
model = XGBRegressor()
model.fit(train_x, train_y, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

### Predict values

In [7]:
predictions = model.predict(test_x)

### Display Mean Absolute Error

In [8]:
print(f"Mean Absolute Error: {mean_absolute_error(predictions, test_y)}")
print(f"R2: {r2_score(predictions, test_y)}")

Mean Absolute Error: 17328.601771190068
R2: 0.8322064141497767
