# House Prices Advanced Regression Techniques - *Linear Regression*

- Using Linear Regression
- Fill missing with data in all cases
- Work with all values
- **Target**: *SalePrice*

[Kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview)

In [2]:
# Imports
import os 

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MaxAbsScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [3]:
# Configs
sns.set_style('darkgrid')
datasource_base_path = "./../data/"

In [4]:
# Function to measure results
def results_regression(y_test_ ,y_pred_):
    mse = mean_squared_error(y_test_ ,y_pred_)
    print(f"MSE: {mse}")
    
    rmse = np.sqrt(mse)
    print(f"RMSE: {rmse}")
    
    mae = mean_absolute_error(y_test_ ,y_pred_)
    print(f"MAE: {mae}")
    
    mape = mean_absolute_percentage_error(y_test_ ,y_pred_)
    print(f"MAPE: {mape}")
    
    r2 = r2_score(y_test_ ,y_pred_)
    print(f"R2_SCORE {r2}")

In [5]:
# Loading train dataset
data = pd.read_csv(os.path.join(datasource_base_path, "train.csv"))
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Pre process

In [6]:
display('Data Type Distribution', data.dtypes.value_counts())
display('Describe Dataset', data.describe())

'Data Type Distribution'

object     43
int64      35
float64     3
dtype: int64

'Describe Dataset'

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [7]:
try:
    # dropping features with > .75 of null values
    print('Dropping features with > 75% nan values')
    cols_to_drop = []
    for col in data.columns:
        if data[col].isna().sum() > data.shape[0] * .75:
            cols_to_drop.append(col)
            print(col, data[col].isna().sum())

    data.drop(columns=cols_to_drop, axis=1, inplace=True)
    
    # Filling remaining NaNs values with mode or median
    print('Filling remaining Nan values')
    for col in data.columns:
        if data[col].isna().sum() > 0:
            # string values filling with mode
            if data[col].dtypes == 'object':
                mode_val = data[col].mode().values[0]
                data[col].fillna(mode_val, inplace=True)
                print(f'Filling {col} with mode: {mode_val}')
            # numeric values filling with median
            elif data[col].dtypes in ['float', 'int']:
                median_val = data[col].median()
                data[col].fillna(median_val, inplace=True)
                print(f'Filling {col} with median: {median_val}')

    # encoder string values
    for col in data.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col])

    # Drop unnecessay column
    data.drop(columns=['Id'], axis=1, inplace=True)
except Exception as e:
    print(e)
    pass
finally:
    display(data.head())

Dropping features with > 75% nan values
Alley 1369
PoolQC 1453
Fence 1179
MiscFeature 1406
Filling remaining Nan values
Filling LotFrontage with median: 69.0
Filling MasVnrType with mode: None
Filling MasVnrArea with median: 0.0
Filling BsmtQual with mode: TA
Filling BsmtCond with mode: TA
Filling BsmtExposure with mode: No
Filling BsmtFinType1 with mode: Unf
Filling BsmtFinType2 with mode: Unf
Filling Electrical with mode: SBrkr
Filling FireplaceQu with mode: Gd
Filling GarageType with mode: Attchd
Filling GarageYrBlt with median: 1980.0
Filling GarageFinish with mode: Unf
Filling GarageQual with mode: TA
Filling GarageCond with mode: TA


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,65.0,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
2,60,3,68.0,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
3,70,3,60.0,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,60,3,84.0,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000


In [8]:
# Validate missing values
data.isna().sum().sum()

0

In [9]:
transformer = MaxAbsScaler().set_output(transform='pandas')
transformer.fit(data)
data = transformer.transform(data)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   float64
 1   MSZoning       1460 non-null   float64
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   float64
 4   Street         1460 non-null   float64
 5   LotShape       1460 non-null   float64
 6   LandContour    1460 non-null   float64
 7   Utilities      1460 non-null   float64
 8   LotConfig      1460 non-null   float64
 9   LandSlope      1460 non-null   float64
 10  Neighborhood   1460 non-null   float64
 11  Condition1     1460 non-null   float64
 12  Condition2     1460 non-null   float64
 13  BldgType       1460 non-null   float64
 14  HouseStyle     1460 non-null   float64
 15  OverallQual    1460 non-null   float64
 16  OverallCond    1460 non-null   float64
 17  YearBuilt      1460 non-null   float64
 18  YearRemo

## Train model

In [11]:
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

In [12]:
# Setting some params to test in grid
params = {"fit_intercept": [True, False], "n_jobs": [1, 2, 3, 5, 8]}
model = GridSearchCV(LinearRegression(), params)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display("Best params:", model.best_params_)
display("Best score:", model.best_score_)

'Best params:'

{'fit_intercept': False, 'n_jobs': 1}

'Best score:'

0.7210161139636911

In [13]:
results_regression(y_test, y_pred)

MSE: 0.0019076346071954006
RMSE: 0.043676476588610035
MAE: 0.029112909223084512
MAPE: 0.1324675656208106
R2_SCORE 0.8441695605964191
