In [1]:
from math import sqrt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Show every column when displaying wide frames. Use the public `pd.set_option`
# entry point rather than going through the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw table and discard the 'Id' column (presumably a row identifier
# with no predictive value -- verify against the dataset description).
data=pd.read_csv('houseprice.csv').drop(columns='Id')

In [3]:
# Columns stored with object dtype are treated as categorical variables.
categorical=[col for col, col_dtype in data.dtypes.items() if col_dtype=='O']

In [4]:
# Every remaining column except the target 'SalePrice' is a numerical predictor.
numerical=[col for col in data.columns if not (col in categorical or col=='SalePrice')]

In [5]:
# Numerical columns whose name contains 'Yr' or 'Year' are handled as year variables.
year=[col for col in numerical if any(tag in col for tag in ('Yr', 'Year'))]

In [6]:
# Non-year numerical columns with at least 50 distinct values are treated as continuous.
# The `discrete` list takes columns with fewer than 50 distinct values, so the
# comparison here is >= 50: with a strict > 50 a column with exactly 50 distinct
# values would land in neither list and silently skip discretisation.
continous=[col for col in numerical if col not in year and data[col].nunique()>=50]

In [7]:
# Non-year numerical columns with fewer than 50 distinct values are treated as discrete.
discrete=[col for col in numerical if col not in year and data[col].nunique()<50]

In [8]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split
# reproducible. Features are every column except the target 'SalePrice'.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns='SalePrice'),
    data['SalePrice'],
    test_size=0.1,
    random_state=0,
)

In [9]:
def year_extract(data,year):
    """Replace year columns with the time elapsed up to the sale and drop 'YrSold'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'YrSold' column and the columns named in ``year``.
    year : iterable of str
        Names of the year columns. 'YrSold' itself may be included; it is
        skipped when computing differences.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every listed column (other than 'YrSold') holds
        ``YrSold - column`` and the 'YrSold' column has been removed. The
        input frame is left untouched.

    Raises
    ------
    KeyError
        If 'YrSold' or one of the listed columns is missing from ``data``.
    """
    # Work on a copy so the caller's frame is never modified. The inputs here
    # are slices produced by a train/test split, and assigning into them in
    # place is what triggers pandas' SettingWithCopyWarning.
    result = data.copy()
    for col in year:
        if col != 'YrSold':
            result[col] = result['YrSold'] - result[col]
    return result.drop(columns='YrSold')

In [10]:
# Apply the same year transformation to both splits (train first, then test).
x_train, x_test = [year_extract(split, year) for split in (x_train, x_test)]

In [11]:
# 'YrSold' is dropped from the feature frames by year_extract, so exclude it
# from the list of year variables as well. Rebuilding the list (instead of
# calling list.remove) keeps this cell safe to re-run: a second execution is a
# no-op rather than a ValueError.
year=[col for col in year if col!='YrSold']

In [12]:
from sklearn.preprocessing import MaxAbsScaler

from feature_engine.discretisation import DecisionTreeDiscretiser as DTD

from feature_engine.encoding import (
    DecisionTreeEncoder as DTE,
    RareLabelEncoder as RLE
)

from feature_engine.imputation import (
    AddMissingIndicator as AMI,
    ArbitraryNumberImputer as ANI,
    CategoricalImputer as CI
)

In [21]:
# Preprocessing + model pipeline; the steps run in the order listed.
house_pipe = Pipeline(steps=[
    # Add missing-value indicator columns for these two variables.
    ('missing_ind', AMI(variables=['LotFrontage','GarageYrBlt'])),

    # Fill missing values in these numerical variables with the arbitrary value 0.
    ('imputer_num', ANI(arbitrary_number=0,
                        variables=['LotFrontage', 'MasVnrArea',  'GarageYrBlt'])),

    # Impute missing values in the categorical variables (imputer defaults).
    ('imputer_cat', CI(variables=categorical)),

    # Decision-tree based discretisation of all numerical and year variables.
    ('discretisation', DTD(random_state=1,
                           variables=discrete+continous+year)),

    # Group infrequent categories using the tol / n_categories settings below.
    ('rare_label_enc', RLE(tol=0.008,
                           n_categories=1,
                           variables=categorical)),

    # Decision-tree based encoding of the categorical variables.
    ('categorical_enc', DTE(random_state=1,
                            variables= categorical)),

    # Scale the features before the linear model.
    ('scaler', MaxAbsScaler()),

    # Final estimator: Lasso regression with a fixed seed.
    ('lasso', Lasso(random_state=0)),
])

In [22]:
# Fit every preprocessing step and the final Lasso on the training split only;
# the fitted pipeline is the cell's displayed value.
house_pipe.fit(X=x_train, y=y_train)

Pipeline(steps=[('missing_ind',
                 AddMissingIndicator(variables=['LotFrontage', 'GarageYrBlt'])),
                ('imputer_num',
                 ArbitraryNumberImputer(arbitrary_number=0,
                                        variables=['LotFrontage', 'MasVnrArea',
                                                   'GarageYrBlt'])),
                ('imputer_cat',
                 CategoricalImputer(variables=['MSZoning', 'Street', 'Alley',
                                               'LotShape', 'LandContour',
                                               'Utilities', 'LotConfig',
                                               'LandSlope', 'Neighborho...
                                                'LandSlope', 'Neighborhood',
                                                'Condition1', 'Condition2',
                                                'BldgType', 'HouseStyle',
                                                'RoofStyle', 'RoofMatl',
              

In [23]:
# Predict on both splits with the fitted pipeline (train first, then test).
x_train_preds, x_test_preds = [house_pipe.predict(split) for split in (x_train, x_test)]

In [24]:
# Report MSE, RMSE and R^2 for each split with one shared code path instead of
# two copy-pasted groups of prints. MSE is computed once per split and reused
# for RMSE. The printed text is identical to the hand-written version.
for split_name, y_true, y_pred in (('train', y_train, x_train_preds),
                                   ('test', y_test, x_test_preds)):
    if split_name != 'train':
        print()  # blank line separating the train and test reports
    mse = mean_squared_error(y_true, y_pred)
    print('{} mse: {}'.format(split_name, mse))
    print('{} rmse: {}'.format(split_name, sqrt(mse)))
    print('{} r2: {}'.format(split_name, r2_score(y_true, y_pred)))

train mse: 566250486.6119391
train rmse: 23796.01829323425
train r2: 0.9093104206190826

test mse: 880317212.7177948
test rmse: 29670.140085914572
test r2: 0.8719001099373364
