# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

In [1]:
# Imports used throughout this notebook
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn import linear_model

from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [36]:
def save_submission(model_name, model, features, test_path="./data/hpc_test.csv"):
    """Predict on the competition test set and write a submission CSV.

    Parameters
    ----------
    model_name : str
        Label embedded in the output file name
        (``submission_<model_name>.csv``).
    model : object
        Fitted estimator exposing ``predict(X)``.
    features : list of str
        Columns of the test file passed to ``model.predict``; they should be
        the same columns the model was trained on.
    test_path : str, optional
        Location of the test CSV. Defaults to the path that was previously
        hard-coded, so existing three-argument calls behave as before.

    Returns
    -------
    None
        The submission is written to the current working directory and a
        confirmation line is printed.
    """
    # The test file must contain an ``Id`` column plus every requested feature.
    test_data = pd.read_csv(test_path)

    # Restrict the test frame to the columns used for prediction.
    test_X = test_data[features]
    test_preds = model.predict(test_X)

    # Two-column layout written for the competition: Id and predicted SalePrice.
    output = pd.DataFrame({'Id': test_data['Id'],
                           'SalePrice': test_preds})

    output_file = f'submission_{model_name}.csv'
    output.to_csv(output_file, index=False)

    print(f'{output_file} saved!')

In [3]:
# Load the competition training set; the bare name on the last line makes the
# notebook render the resulting frame.
home_data = pd.read_csv("./data/hpc_train.csv")
home_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
home_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Prediction target: the SalePrice column of the training frame.
y = home_data.SalePrice
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

## Linear Regression

In [6]:
# Keep only the numeric columns; the linear-regression helper selects its
# features from this frame.
home_data_num = home_data.select_dtypes(include=[np.number])
home_data_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [56]:
def use_linear_regression_and_save_submission(model_type, features):
    """Fit a linear regression on the chosen features, report metrics, and save a submission.

    Reads the module-level ``home_data_num`` (numeric training columns) and
    ``y`` (target) defined in earlier cells.

    Parameters
    ----------
    model_type : str
        Label used in the submission file name
        (``submission_<model_type>_linear_regression.csv``).
    features : list of str
        Columns of ``home_data_num`` used as predictors.

    Returns
    -------
    None
        Coefficients, intercept, R2 and MAE on the held-out split are printed,
        and ``save_submission`` writes the prediction file.
    """
    X = home_data_num[features]
    print('X:', X)

    # Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=100)

    linear_regr = linear_model.LinearRegression()
    linear_regr.fit(X_train, y_train)
    print('Coefficients:', linear_regr.coef_)
    print('Intercept:', linear_regr.intercept_)

    y_predicted = linear_regr.predict(X_test)

    print(f'R2: {r2_score(y_true=y_test, y_pred=y_predicted)}')
    # The value comes from mean_absolute_error, so it is labelled MAE (it was
    # previously mislabelled as MSE).
    print(f"MAE: {mean_absolute_error(y_true=y_test, y_pred=y_predicted)}")

    save_submission(f'{model_type}_linear_regression', linear_regr, features)

### Simple Linear Regression

In [54]:
# Single-predictor baseline: model SalePrice from OverallQual alone.
features = ['OverallQual']

use_linear_regression_and_save_submission(model_type='simple', features=features)

X:       OverallQual
0               7
1               6
2               7
3               7
4               8
...           ...
1455            6
1456            6
1457            7
1458            5
1459            5

[1460 rows x 1 columns]
Coefficients: [44780.7500158]
Intercept: -92550.08368564205
R2: 0.647414965422141
MSE: 32228.631737310807
submission_simple_regression.csv saved!


### Multiple Linear Regression

In [55]:
# Predictors for the multi-feature linear model; the commented-out entries are
# candidate columns that are currently excluded.
features = ['OverallQual', 
            'GrLivArea', 
            '1stFlrSF',
            'YearBuilt',
            'FullBath',
            'Fireplaces'
#             'GarageCars', 
#             'TotalBsmtSF',
#             'GarageArea'
           ]

use_linear_regression_and_save_submission(model_type='multiple', features=features)

X:       OverallQual  GrLivArea  1stFlrSF  YearBuilt  FullBath  Fireplaces
0               7       1710       856       2003         2           0
1               6       1262      1262       1976         2           1
2               7       1786       920       2001         2           1
3               7       1717       961       1915         1           1
4               8       2198      1145       2000         2           1
...           ...        ...       ...        ...       ...         ...
1455            6       1647       953       1999         2           1
1456            6       2073      2073       1978         2           2
1457            7       2340      1188       1941         2           2
1458            5       1078      1078       1950         1           0
1459            5       1256      1256       1965         1           0

[1460 rows x 6 columns]
Coefficients: [22835.19807868    47.65242266    30.13923009   506.74100878
 -4903.09018692 10232.02274835]
I

X:       OverallQual  GrLivArea  YearBuilt  FullBath
0               7       1710       2003         2
1               6       1262       1976         2
2               7       1786       2001         2
3               7       1717       1915         1
4               8       2198       2000         2
...           ...        ...        ...       ...
1455            6       1647       1999         2
1456            6       2073       1978         2
1457            7       2340       1941         2
1458            5       1078       1950         1
1459            5       1256       1965         1

[1460 rows x 4 columns]
Coefficients: [25024.85592375    65.18416724   575.1976582  -8935.99315928]
Intercept: -1190958.0887849932
R2: 0.7581413341754806
MSE: 26336.37444697725
submission_multiple_regression.csv saved!


In [None]:
# Feature set for the XGBoost model. Note that this rebinds `features` and
# selects from the full frame (home_data), not from home_data_num.
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]
X

In [None]:
# Split into train and hold-out sets with a fixed seed; no test_size is given,
# so train_test_split's default split fraction applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
#XGBoost hyper-parameter tuning
def hyperParameterTuning(X_train, y_train):
    """Grid-search XGBRegressor hyper-parameters and return the best combination.

    Every combination in the grid below (288 in total) is evaluated with
    5-fold cross-validation, using all available cores.

    Parameters
    ----------
    X_train : training features passed to ``GridSearchCV.fit``.
    y_train : training target passed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    search_space = dict(
        learning_rate=[0.01, 0.1],
        max_depth=[3, 5, 7, 10],
        min_child_weight=[1, 3, 5],
        subsample=[0.5, 0.7],
        colsample_bytree=[0.5, 0.7],
        n_estimators=[100, 200, 500],
        objective=['reg:squarederror'],
    )

    # No `scoring` argument is supplied; 'neg_mean_absolute_error' (MAE) and
    # 'neg_mean_squared_error' (MSE) are alternatives that could be passed here.
    grid = GridSearchCV(
        XGBRegressor(),
        search_space,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )

    return grid.fit(X_train, y_train).best_params_

In [None]:
# The grid search is expensive: uncomment and run it once, then reuse the best parameters recorded below.
# hyperParameterTuning(X_train, y_train)

**Best params:**

{'colsample_bytree': 0.7,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 3,
 'n_estimators': 100,
 'objective': 'reg:squarederror',
 'subsample': 0.5}

In [None]:
# Build the regressor with the hyper-parameters listed under "Best params".
model = XGBRegressor(
        objective = 'reg:squarederror',
        colsample_bytree = 0.7,
        learning_rate = 0.1,
        max_depth = 5,
        min_child_weight = 3,
        n_estimators = 100,
        subsample = 0.5)

# Fit with early stopping monitored on (X_test, y_test); %time reports how long the fit takes.
# NOTE(review): the same X_test/y_test are used for the metrics below, so the
# reported MAE/R2 are not measured on data unseen during training.
# NOTE(review): recent xgboost releases expect early_stopping_rounds on the
# estimator constructor rather than on fit() - confirm against the installed version.
%time model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)

# Predict on the hold-out split and report the error metrics.
y_pred = model.predict(X_test)

print("MAE: ", mean_absolute_error(y_test, y_pred))
print(f'R2: {r2_score(y_test, y_pred)}')