In [57]:
import numpy as np 
import pandas as pd 
import os
from category_encoders import *
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [58]:
DATA_DIR = '../input/house-prices-advanced-regression-techniques/'

In [59]:
train_df = pd.read_csv(DATA_DIR+'train.csv', index_col='Id')
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [60]:
train_df.shape

(1460, 80)

In [61]:
test_df = pd.read_csv(DATA_DIR+'test.csv', index_col='Id')
test_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


# Encode categorical and numerical features

In [62]:
train_df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

## Categorical columns

In [63]:
cat_cols = train_df.select_dtypes(exclude=["number","bool_"]).columns.to_list()
cat_cols

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

## Numerical columns

In [64]:
num_cols = train_df.select_dtypes(exclude=["object", "category"]).columns.to_list()
num_cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [65]:
train_df.isnull().sum().sort_values(ascending=False)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
FireplaceQu     690
               ... 
Heating           0
HeatingQC         0
MSZoning          0
1stFlrSF          0
SalePrice         0
Length: 80, dtype: int64

In [66]:
X = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']

## Target Encoding

In [67]:

te = TargetEncoder(cols=cat_cols)

# transform the datasets
training_data = te.fit_transform(X, y)
training_data



Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,191004.994787,65.0,8450,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,187596.837998,182046.410384,0,2,2008,173401.836622,175202.219533
2,20,191004.994787,80.0,9600,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,177934.574468,...,0,0,180404.663455,187596.837998,182046.410384,0,5,2007,173401.836622,175202.219533
3,60,191004.994787,68.0,11250,181130.538514,183452.131483,206101.665289,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,187596.837998,182046.410384,0,9,2008,173401.836622,175202.219533
4,70,191004.994787,60.0,9550,181130.538514,183452.131483,206101.665289,180183.746758,180950.95682,181623.425856,...,0,0,180404.663455,187596.837998,182046.410384,0,2,2006,173401.836622,146526.623762
5,60,191004.994787,84.0,14260,181130.538514,183452.131483,206101.665289,180183.746758,180950.95682,177934.574468,...,0,0,180404.663455,187596.837998,182046.410384,0,12,2008,173401.836622,175202.219533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,191004.994787,62.0,7917,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,187596.837998,182046.410384,0,8,2007,173401.836622,175202.219533
1457,20,191004.994787,85.0,13175,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,148751.089172,182046.410384,0,2,2010,173401.836622,175202.219533
1458,70,191004.994787,66.0,9042,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,178927.457627,151187.612245,2500,5,2010,173401.836622,175202.219533
1459,20,191004.994787,68.0,9717,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,187596.837998,182046.410384,0,4,2010,173401.836622,175202.219533


In [68]:
testing_data = te.transform(test_df)
testing_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,131558.3901,80.0,11622,181130.538514,183452.131483,164754.818378,180183.746758,180950.95682,176938.047529,...,120,0,180404.663455,148751.089172,182046.410384,0,6,2010,173401.836622,175202.219533
1462,20,191004.994787,81.0,14267,181130.538514,183452.131483,206101.665289,180183.746758,180950.95682,181623.425856,...,0,0,180404.663455,187596.837998,173485.45588,12500,6,2010,173401.836622,175202.219533
1463,60,191004.994787,74.0,13830,181130.538514,183452.131483,206101.665289,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,148751.089172,182046.410384,0,3,2010,173401.836622,175202.219533
1464,60,191004.994787,78.0,9978,181130.538514,183452.131483,206101.665289,180183.746758,180950.95682,176938.047529,...,0,0,180404.663455,187596.837998,182046.410384,0,6,2010,173401.836622,175202.219533
1465,120,191004.994787,43.0,5005,181130.538514,183452.131483,206101.665289,231533.94,180950.95682,176938.047529,...,144,0,180404.663455,187596.837998,182046.410384,0,1,2010,173401.836622,175202.219533


In [69]:
training_data.isnull().sum().sort_values(ascending=False)

LotFrontage      259
GarageYrBlt       81
MasVnrArea         8
KitchenAbvGr       0
GarageType         0
                ... 
ExterCond          0
ExterQual          0
MasVnrType         0
Exterior2nd        0
SaleCondition      0
Length: 79, dtype: int64

In [70]:

my_imputer = SimpleImputer()
data_with_imputed_values = my_imputer.fit_transform(training_data)

In [71]:

X_train, X_test, y_train, y_test = train_test_split(
   data_with_imputed_values, y, test_size=0.1, random_state=42)

In [72]:

reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

0.8722099358654851

In [73]:
testing_data = my_imputer.transform(testing_data)

In [74]:

preds = reg.predict(testing_data)

In [76]:
submission = pd.read_csv(DATA_DIR+'sample_submission.csv')
submission['SalePrice'] = preds

In [77]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,113516.300514
1,1462,163944.318609
2,1463,169204.214756
3,1464,183126.629439
4,1465,219940.010707


In [79]:
submission.to_csv('submission.csv', header=True, index=False)