# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

In [2]:
# Imports and environment setup used throughout the notebook
import pandas as pd
import numpy as np

!conda install -c conda-forge xgboost -y
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn import linear_model

from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/ira/anaconda3/envs/LevelUp_DataScience

  added / updated specs:
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    libstdcxx-ng-9.3.0         |      h2ae2ef3_17         4.0 MB  conda-forge
    libxgboost-1.3.0           |       h9c3ff4c_1         3.3 MB  conda-forge
    py-xgboost-1.3.0           |   py38h578d9bd_1         124 KB  conda-forge
    xgboost-1.3.0              |   py38h709712a_1          11 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         7.5 MB

The following NEW packages will be INSTALLED:

  _py-xgboost-mutex  conda-forge/linux-64::_py-xgboost-mutex-2.0-cpu_0
 

In [None]:
# Load the competition training data from the notebook's working directory.
# NOTE(review): the submission helpers below read the test set from
# "./data/hpc_test.csv" — confirm the train and test files really live in
# different folders.
# home_data = pd.read_csv("./data/hpc_train.csv")
home_data = pd.read_csv("./hpc_train_data.csv")
# Bare last expression so the notebook renders the frame as the cell output.
home_data

In [None]:
# Print the column dtypes and non-null counts of the training frame.
home_data.info()

In [None]:
# Regression target: the SalePrice column, passed to every train/test split below.
y = home_data.SalePrice
y

## Linear Regression

In [None]:
def save_submission_linear(model_name, model, features):
    """Predict on the competition test file and write a submission CSV.

    Parameters
    ----------
    model_name : str
        Suffix of the output file, which is written as
        ``submission_{model_name}.csv`` in the working directory.
    model : object
        Fitted estimator exposing ``predict``.
    features : list of str
        Columns of the test file that are handed to ``model.predict``.
    """
    # Competition test set, read fresh on every call.
    competition_df = pd.read_csv("./data/hpc_test.csv")

    # Only the columns the model was trained on are passed to predict().
    predictions = model.predict(competition_df[features])

    # Submission format: one row per house with its Id and predicted price.
    submission = pd.DataFrame({'Id': competition_df.Id,
                               'SalePrice': predictions})

    submission_path = f'submission_{model_name}.csv'
    submission.to_csv(submission_path, index=False)

    print(f'{submission_path} saved!')

In [None]:
# Keep only the numeric columns; the linear and polynomial helpers below
# select their features from this frame.
home_data_num = home_data.select_dtypes(include=[np.number])
home_data_num.info()

In [None]:
def use_modelession_and_save_submission_linear(model_type, features):
    """Fit a linear regression, report hold-out metrics and save a submission.

    Reads the notebook-level ``home_data_num`` (predictors) and ``y`` (target).
    33% of the rows are held out with ``random_state=100``; R2 and mean
    absolute error are printed for that hold-out split.

    Parameters
    ----------
    model_type : str
        Label for the run; the submission is saved under the model name
        ``f'{model_type}_modelession'``.
    features : list of str
        Columns of ``home_data_num`` used as predictors.
    """
    X = home_data_num[features]
    print('X:', X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=100)
    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)
    print('Coefficients:', model.coef_)
    print('Intercept:', model.intercept_)

    y_predicted = model.predict(X_test)

    print(f'R2: {r2_score(y_true=y_test, y_pred=y_predicted)}')
    # The value is the mean absolute error, so it is labelled MAE
    # (it was previously printed under the misleading label "MSE").
    print(f"MAE: {mean_absolute_error(y_true=y_test, y_pred=y_predicted)}")

    save_submission_linear(f'{model_type}_modelession', model, features)

### Simple Linear Regression

In [None]:
# Single predictor: overall quality rating only.
features = ['OverallQual']

use_modelession_and_save_submission_linear(model_type='simple', features=features)

### Multiple Linear Regression

In [None]:
# Predictors for the multiple linear regression run.
# NOTE: 'Fireplaces' has no trailing comma. Add one before re-enabling any of
# the commented-out entries, otherwise Python concatenates the adjacent string
# literals into a single (non-existent) column name.
features = ['OverallQual', 
            'GrLivArea', 
            '1stFlrSF',
            'YearBuilt',
            'FullBath',
            'Fireplaces'
#             'GarageCars', 
#             'TotalBsmtSF',
#             'GarageArea'
           ]

use_modelession_and_save_submission_linear(model_type='multiple', features=features)

## Multiple Polynomial Regression

In [None]:
def save_submission_linear(model_name, model, features, poly=None):
    """Predict on the competition test file and write a submission CSV.

    This definition replaces the earlier three-argument
    ``save_submission_linear``. ``poly`` is optional so that calls written for
    the earlier signature keep working after this cell has been executed.

    Parameters
    ----------
    model_name : str
        Suffix of the output file, written as ``submission_{model_name}.csv``.
    model : object
        Fitted estimator exposing ``predict``.
    features : list of str
        Columns of the test file used as predictors.
    poly : object, optional
        Already-fitted feature transformer exposing ``transform`` (the caller
        in this notebook passes a fitted ``PolynomialFeatures``). When
        ``None``, the raw feature columns are passed to the model unchanged.
    """
    # read test data file using pandas
    test_data = pd.read_csv("./data/hpc_test.csv")

    # create test_X which comes from test_data but includes only the columns you used for prediction.
    # The list of columns is stored in a variable called features
    test_X = test_data[features]

    if poly is not None:
        # Apply the expansion that was fitted on the training data; calling
        # fit_transform here would refit the transformer on the test set.
        test_X = poly.transform(test_X)

    # make predictions which we will submit.
    test_preds = model.predict(test_X)

    # The lines below shows you how to save your data in the format needed to score it in the competition
    output = pd.DataFrame({'Id': test_data.Id,
                           'SalePrice': test_preds})

    output_file = f'submission_{model_name}.csv'
    output.to_csv(output_file, index=False)

    print(f'{output_file} saved!')

In [None]:
def use_polynomial_regression_and_save_submission_linear(features):
    """Fit polynomial regressions of degree 2 to 9 and save one submission each.

    Reads the notebook-level ``home_data_num`` (predictors) and ``y`` (target).
    33% of the rows are held out with ``random_state=100``. For every degree,
    the polynomial expansion is fitted on the training split, a linear model
    is trained on the expanded features, R2 and mean absolute error are
    printed for the hold-out split, and a submission named
    ``polynomial-{degree}_regression`` is written.

    Parameters
    ----------
    features : list of str
        Columns of ``home_data_num`` used as predictors.
    """
    # Imported locally so this function does not depend on a later cell
    # having executed its import first.
    from sklearn.preprocessing import PolynomialFeatures

    X = home_data_num[features]
    print('X:', X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=100)

    for degree in range(2, 10):
        poly = PolynomialFeatures(degree=degree)
        X_train_poly = poly.fit_transform(X_train)

        model = linear_model.LinearRegression()
        model.fit(X_train_poly, y_train)

        # Reuse the expansion fitted on the training split; the hold-out data
        # must only be transformed, never used to refit the transformer.
        X_test_poly = poly.transform(X_test)
        y_predicted = model.predict(X_test_poly)

        print(f'R2: {r2_score(y_true=y_test, y_pred=y_predicted)}')
        # The value is the mean absolute error, so it is labelled MAE
        # (it was previously printed under the misleading label "MSE").
        print(f"MAE: {mean_absolute_error(y_true=y_test, y_pred=y_predicted)}")

        save_submission_linear(f'polynomial-{degree}_regression',
                               model,
                               features,
                               poly)

        # Blank line separates the output of consecutive degrees.
        print()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Predictors for the polynomial regression sweep.
features = ['OverallQual', 
            'GrLivArea', 
            '1stFlrSF',
            'YearBuilt',
            'FullBath',
            'Fireplaces'
           ]

# Fits one model per polynomial degree and writes one submission file for each.
use_polynomial_regression_and_save_submission_linear(features=features)

## XGBoost

In [None]:
# Earlier feature set, kept for reference:
# features = ['LotArea', 
#             'YearBuilt', 
#             '1stFlrSF', 
#             '2ndFlrSF', 
#             'FullBath', 
#             'BedroomAbvGr', 
#             'TotRmsAbvGrd']

# Predictors for the XGBoost model.
features = ['OverallQual', 
            'GrLivArea', 
            '1stFlrSF',
            'YearBuilt',
            'FullBath',
            'Fireplaces'
           ]

# These notebook-level names (X, X_train, X_test, y_train, y_test) are used by
# the XGBoost cells below. The columns are taken from home_data directly, not
# from home_data_num. No test_size is passed, so scikit-learn's default
# split proportion applies, and random_state is 1 here (the linear helpers use 100).
X = home_data[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
def hyperParameterTuning(X_train, y_train):
    """Grid-search XGBRegressor hyper-parameters and return the best set.

    Every combination of the grid below is evaluated with 5-fold
    cross-validation, scored by negative mean absolute error, using all
    available cores.

    Parameters
    ----------
    X_train, y_train
        Training predictors and target passed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    depth_candidates = list(range(2, 11))
    tree_count_candidates = list(range(100, 1000, 100))

    search_space = {
        'learning_rate': [0.01, 0.1],
        'max_depth': depth_candidates,
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators': tree_count_candidates,
        'objective': ['reg:squarederror'],
    }

    # Scoring is MAE; 'neg_mean_squared_error' would select on MSE instead.
    grid = GridSearchCV(
        estimator=XGBRegressor(),
        param_grid=search_space,
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    grid.fit(X_train, y_train)

    return grid.best_params_

In [None]:
# Run only in the first run of the kernel
# hyperParameterTuning(X_train, y_train)

**Best params:**

<!-- {'colsample_bytree': 0.7,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 3,
 'n_estimators': 100,
 'objective': 'reg:squarederror',
 'subsample': 0.5} -->
 
 {'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 1,
 'n_estimators': 100,
 'objective': 'reg:squarederror',
 'subsample': 0.5}

In [None]:
# Build the XGBoost regressor that is used for the final submission.
# NOTE(review): these settings match neither of the two parameter sets
# recorded in the "Best params" markdown cell above — confirm they are intentional.
model = XGBRegressor(
        colsample_bytree = 0.7,
        learning_rate = 0.01,
        max_depth = 5,
        min_child_weight = 1,
        n_estimators = 500,
        objective = 'reg:squarederror',
        subsample = 0.7)

# Time the fit. Early stopping monitors (X_test, y_test).
# NOTE(review): the same split is used for early stopping and for the metrics
# printed below, so those metrics are likely optimistic — consider a separate
# validation split.
%time model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)

# Score the fitted model on the hold-out split.
y_pred = model.predict(X_test)

print("MAE: ", mean_absolute_error(y_test, y_pred))
print(f'R2: {r2_score(y_test, y_pred)}')

In [None]:
def save_submission_xgboost(model_name, model, features):
    """Predict on the competition test file and write a submission CSV.

    Parameters
    ----------
    model_name : str
        Suffix of the output file, written as ``submission_{model_name}.csv``.
    model : object
        Fitted estimator exposing ``predict``.
    features : list of str
        Columns of the test file that are handed to ``model.predict``.
    """
    holdout = pd.read_csv("./data/hpc_test.csv")
    target_file = f'submission_{model_name}.csv'

    # Two-column submission frame: house Id and the model's predicted price
    # for the selected feature columns.
    pd.DataFrame({
        'Id': holdout.Id,
        'SalePrice': model.predict(holdout[features]),
    }).to_csv(target_file, index=False)

    print(f'{target_file} saved!')

In [None]:
# Write submission_xgboost.csv using the fitted XGBoost model and its feature list.
save_submission_xgboost('xgboost', model, features)