In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from custom_transformer import FeatureSelector

%matplotlib inline

## Feature Selection and Model Selection

* Split the data into training and validation sets
* Fill missing values in numerical columns with 0
* Fill missing values in categorical columns with 'unknown'
* One-hot encode the categorical columns
* Use mutual_info_regression to eliminate features that have zero mutual information with the target
* Use PCA to transform the features into the components that explain the most variance
* Run a tree-based model (random forest) with grid search to find the optimal parameters
* Run a linear model (ridge regression) with grid search to find the optimal parameters

In [47]:
# Load the training data; the path is relative to the notebook's working directory.
data = pd.read_csv('data/train.csv')

In [48]:
# List every column name in the loaded frame.
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [49]:
# Count missing values per column, then display only the columns that have any.
na = data.isnull().sum().to_frame(name='cnt')
na.loc[na['cnt'] > 0]

Unnamed: 0,cnt
LotFrontage,259
Alley,1369
MasVnrType,8
MasVnrArea,8
BsmtQual,37
BsmtCond,37
BsmtExposure,38
BsmtFinType1,37
BsmtFinType2,38
Electrical,1


In [50]:
# Separate the target from the predictors, then hold out 20% of the rows.
# The fixed random_state keeps the split reproducible across runs.
y = data['SalePrice']
X = data.drop(columns='SalePrice')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

In [51]:
def make_pipe(model, n_components=None):
    """Build the unfitted preprocessing + modelling pipeline around ``model``.

    The returned pipeline has four named steps, in this order:

    1. ``preprocessor`` - a ``ColumnTransformer`` that fills missing
       categorical values with ``'unknown'`` and one-hot encodes them, and
       fills missing numerical values with ``0`` and standardises them.
    2. ``feature_selector`` - the project-local ``FeatureSelector`` imported
       from ``custom_transformer``.
    3. ``pca`` - a ``PCA`` step.
    4. ``model`` - the estimator passed in.

    The step names matter: grid-search keys such as ``pca__n_components`` and
    ``model__alpha`` address parameters through them.

    Parameters
    ----------
    model : estimator
        Final estimator placed at the end of the pipeline.
    n_components : optional, default=None
        Forwarded unchanged to ``PCA(n_components=...)``. ``None`` leaves PCA
        at its default.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The assembled, unfitted pipeline.
    """
    # Columns not listed below (e.g. 'Id') are discarded, because
    # ColumnTransformer drops unlisted columns by default (remainder='drop').
    numerical = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
           'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
           'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
           'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
           'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
           'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
           'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
           'MiscVal', 'YrSold']
    categorical = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
           'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
           'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
           'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
           'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
           'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
           'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
           'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
           'SaleType', 'SaleCondition']

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output`, and 1.4 removed the old name. Try the new spelling
    # first and fall back to the old one, so the pipeline can be built on
    # either side of the rename. Both spellings request a dense array.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('encoder', encoder)
    ])

    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer([
        ('cat', cat_pipe, categorical),
        ('num', num_pipe, numerical)
    ])

    # PCA's own default for n_components is None, so forwarding the argument
    # directly covers both the "not given" and the "given" case.
    pca = PCA(n_components=n_components)

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selector', FeatureSelector()),
        ('pca', pca),
        ('model', model)
    ])
    return pipe

In [52]:
# Search space for the random-forest pipeline. Each key is
# "<pipeline step name>__<parameter of that step>".
grid_params = dict(
    pca__n_components=(25, 50, 100),
    model__n_estimators=(50, 100, 150),
    model__max_depth=(10, 20, None),
)
gs = GridSearchCV(
    estimator=make_pipe(RandomForestRegressor(n_jobs=-1, random_state=43)),
    param_grid=grid_params,
    scoring='r2',
    n_jobs=-1,
)
gs.fit(X_train, y_train)
# R^2 of the best pipeline found by the search, evaluated on the held-out split.
gs.score(X_test, y_test)

0.8397520196229571

In [53]:
# Best hyper-parameter combination found by the random-forest grid search.
gs.best_params_

{'model__max_depth': None, 'model__n_estimators': 50, 'pca__n_components': 25}

In [54]:
# Search space for the ridge-regression pipeline. Each key is
# "<pipeline step name>__<parameter of that step>".
grid_params = dict(
    pca__n_components=(25, 50, 100),
    model__alpha=(1, 5, 10),
)
gs2 = GridSearchCV(
    estimator=make_pipe(Ridge()),
    param_grid=grid_params,
    scoring='r2',
    n_jobs=-1,
)
gs2.fit(X_train, y_train)
# R^2 of the best ridge pipeline found by the search, evaluated on the held-out split.
gs2.score(X_test, y_test)

0.8314705279876093

In [55]:
# Best hyper-parameter combination found by the ridge grid search.
gs2.best_params_

{'model__alpha': 10, 'pca__n_components': 100}

## Training with Best Model

In [56]:
# Rebuild the pipeline with the winning random-forest settings reported by
# gs.best_params_ (n_estimators=50, n_components=25). max_depth is left unset,
# which matches the reported best value of None.
pipe = make_pipe(
    RandomForestRegressor(n_estimators=50, random_state=43, n_jobs=-1),
    n_components=25,
)

In [57]:
# Fit every pipeline step (preprocessing through the model) on the training split.
pipe.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='unknown',
                                                                                 strategy='constant')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['MSZoning', 'Street',
                                                   'Alley', 'LotShape',
                                                   'LandContour', 'Utilities',
                                                   'LotConfig', 'LandSlope',
                                         

In [58]:
# Predict on the held-out split and report R^2 against the true sale prices.
y_pred = pipe.predict(X_test)
r2_score(y_test,y_pred)

0.8407192146442017

In [59]:
# Persist the fitted pipeline to 'model.pkl' with compression level 1.
# NOTE(review): the pipeline contains custom_transformer.FeatureSelector, so that
# module presumably must be importable wherever model.pkl is loaded — confirm.
joblib.dump(pipe, 'model.pkl', compress = 1)

['model.pkl']

In [66]:
# Render the first row of the raw data as a JSON string in "records" layout
# (a list with one {column: value} object).
data.head(1).to_json(orient='records')

'[{"Id":1,"MSSubClass":60,"MSZoning":"RL","LotFrontage":65.0,"LotArea":8450,"Street":"Pave","Alley":null,"LotShape":"Reg","LandContour":"Lvl","Utilities":"AllPub","LotConfig":"Inside","LandSlope":"Gtl","Neighborhood":"CollgCr","Condition1":"Norm","Condition2":"Norm","BldgType":"1Fam","HouseStyle":"2Story","OverallQual":7,"OverallCond":5,"YearBuilt":2003,"YearRemodAdd":2003,"RoofStyle":"Gable","RoofMatl":"CompShg","Exterior1st":"VinylSd","Exterior2nd":"VinylSd","MasVnrType":"BrkFace","MasVnrArea":196.0,"ExterQual":"Gd","ExterCond":"TA","Foundation":"PConc","BsmtQual":"Gd","BsmtCond":"TA","BsmtExposure":"No","BsmtFinType1":"GLQ","BsmtFinSF1":706,"BsmtFinType2":"Unf","BsmtFinSF2":0,"BsmtUnfSF":150,"TotalBsmtSF":856,"Heating":"GasA","HeatingQC":"Ex","CentralAir":"Y","Electrical":"SBrkr","1stFlrSF":856,"2ndFlrSF":854,"LowQualFinSF":0,"GrLivArea":1710,"BsmtFullBath":1,"BsmtHalfBath":0,"FullBath":2,"HalfBath":1,"BedroomAbvGr":3,"KitchenAbvGr":1,"KitchenQual":"Gd","TotRmsAbvGrd":8,"Functional"

In [69]:
# Sanity check: predict for a single-row DataFrame taken from the held-out split.
pipe.predict(X_test.head(1))

array([159190.])