In [2]:
import pandas as pd

df_train = pd.read_csv('train.csv', index_col='Unnamed: 0')
df_test = pd.read_csv('test.csv', index_col='Unnamed: 0')
y_test = pd.read_csv('dataset/sample_submission.csv')

In [3]:
df_train.shape, df_test.shape

((1460, 176), (1459, 175))

In [16]:
df_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageCond_GarageCond_Missing,GarageCond_Po,GarageCond_TA,PavedDrive_Y,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,4.094345,4.174387,9.041922,0.142857,0.2,17.0,2.833213,196.0,706.0,0.0,...,0,0,1,1,0,0,0,1,0,208500
1,2.995732,4.382027,9.169518,0.166667,0.125,44.0,3.78419,0.0,978.0,0.0,...,0,0,1,1,0,0,0,1,0,181500
2,4.094345,4.219508,9.328123,0.142857,0.2,19.0,2.890372,162.0,486.0,0.0,...,0,0,1,1,0,0,0,1,0,223500
3,4.248495,4.094345,9.164296,0.142857,0.2,105.0,3.912023,0.0,216.0,0.0,...,0,0,1,1,0,0,0,0,0,140000
4,4.094345,4.430817,9.565214,0.125,0.2,20.0,2.995732,350.0,655.0,0.0,...,0,0,1,1,0,0,0,1,0,250000


In [17]:
df_test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageQual_TA,GarageCond_GarageCond_Missing,GarageCond_Po,GarageCond_TA,PavedDrive_Y,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1460,2.995732,4.382027,9.360655,0.2,0.166667,59.0,4.077537,0.0,468.0,144.0,...,1,0,0,1,1,0,0,0,1,0
1461,2.995732,4.394449,9.565704,0.166667,0.166667,62.0,4.127134,108.0,923.0,0.0,...,1,0,0,1,1,0,0,0,1,0
1462,4.094345,4.304065,9.534595,0.2,0.2,23.0,3.091042,0.0,791.0,0.0,...,1,0,0,1,1,0,0,0,1,0
1463,4.094345,4.356709,9.208138,0.166667,0.166667,22.0,3.091042,20.0,602.0,0.0,...,1,0,0,1,1,0,0,0,1,0
1464,4.787492,3.7612,8.518193,0.125,0.2,28.0,3.332205,0.0,263.0,0.0,...,1,0,0,1,1,0,0,0,1,0


In [18]:
y_test.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [19]:
# separate dependent & independent features
x = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]

In [20]:
x_train = pd.concat([x, df_test], axis=0)
y_train = pd.concat([y, y_test.SalePrice], axis=0)

In [21]:
x_train.shape, y_train.shape

((2919, 175), (2919,))

In [None]:
param = {'n_estimators': 200,
 'min_child_weight': 3,
 'max_depth': 5,
 'gamma': 0.4,
 'colsample_bytree': 1.0,
 'booster': 'gbtree'}

## Train & Test the data

In [22]:
from xgboost import XGBRegressor

classifier = XGBRegressor(n_estimators=200, min_child_weight=3, max_depth=5, gamma=0.4, 
                          colsample_bytree=1.0, booster='gbtree')

In [23]:
classifier.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, gamma=0.4, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=3, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
y_pred = classifier.predict(df_test)

## Check for accuracy

In [25]:
from sklearn.metrics import r2_score

In [26]:
r2_score(y_true=y_test.SalePrice, y_pred=y_pred)

0.8927703916928722

## Create the final submission file

In [27]:
final_output = pd.concat([y_test, pd.DataFrame(y_pred)], axis=1)
final_output

Unnamed: 0,Id,SalePrice,0
0,1461,169277.052498,164087.984375
1,1462,187758.393989,185474.843750
2,1463,183583.683570,186979.375000
3,1464,179317.477511,188368.593750
4,1465,150730.079977,153003.062500
...,...,...,...
1454,2915,167081.220949,166494.984375
1455,2916,164788.778231,160560.687500
1456,2917,219222.423400,218494.781250
1457,2918,184924.279659,178622.703125


In [28]:
# SalePrice, Id

final_output.drop('SalePrice', axis=1, inplace=True)

In [29]:
final_output.columns  = ['Id', 'SalePrice']

In [30]:
final_output.head()

Unnamed: 0,Id,SalePrice
0,1461,164087.984375
1,1462,185474.84375
2,1463,186979.375
3,1464,188368.59375
4,1465,153003.0625


In [31]:
final_output.to_csv('sample_submission.csv', index=None)

In [32]:
final_output.shape

(1459, 2)