In [48]:
import numpy as np 
import pandas as pd 
import os
from category_encoders import *
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [49]:
DATA_DIR = '../input/house-prices-advanced-regression-techniques/'

In [50]:
train_df = pd.read_csv(DATA_DIR+'train.csv', index_col='Id')
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [51]:
train_df.shape

(1460, 80)

In [52]:
test_df = pd.read_csv(DATA_DIR+'test.csv', index_col='Id')
test_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


# Encode categorical and numerical features

In [53]:
train_df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

## Categorical columns

In [54]:
cat_cols = train_df.select_dtypes(exclude=["number","bool_"]).columns.to_list()
cat_cols

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

## Numerical columns

In [55]:
num_cols = train_df.select_dtypes(exclude=["object", "category"]).columns.to_list()
num_cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [56]:
train_df.isnull().sum().sort_values(ascending=False)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
FireplaceQu     690
               ... 
Heating           0
HeatingQC         0
MSZoning          0
1stFlrSF          0
SalePrice         0
Length: 80, dtype: int64

In [57]:
X = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']

## Target Encoding

In [58]:

# te = TargetEncoder(cols=cat_cols)

# # transform the datasets
# training_data = te.fit_transform(X, y)
# training_data

# Leave one out encoding

In [59]:
# loo_encoder = LeaveOneOutEncoder(cols=cat_cols, sigma=0.05)
# training_data = loo_encoder.fit_transform(X, y)
# training_data

# Generalized Linearn Mixed Model (GLMM)

In [61]:
glmm_encoder = GLMMEncoder(cols=cat_cols, binomial_target=False)
# binomial_target = True (for Classification)
# binomial_target = False (for Regression)
training_data = glmm_encoder.fit_transform(X,y)

In [62]:
# testing_data = te.transform(test_df)
# testing_data.head()

In [63]:
# testing_data = loo_encoder.transform(test_df)
# testing_data.head()

In [64]:
testing_data = glmm_encoder.transform(test_df)
testing_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,-16451.684026,80.0,11622,15140.940355,24479.623812,-39570.354742,-8867.746905,628.19831,-13682.92426,...,120,0,-87017.423079,-11517.337452,14538.405886,0,6,2010,-11842.541147,1980.498617
1462,20,40804.214113,81.0,14267,15140.940355,24479.623812,1507.119632,-8867.746905,628.19831,-8692.555628,...,0,0,-87017.423079,26067.186411,348.146323,12500,6,2010,-11842.541147,1980.498617
1463,60,40804.214113,74.0,13830,15140.940355,24479.623812,1507.119632,-8867.746905,628.19831,-13682.92426,...,0,0,-87017.423079,-11517.337452,14538.405886,0,3,2010,-11842.541147,1980.498617
1464,60,40804.214113,78.0,9978,15140.940355,24479.623812,1507.119632,-8867.746905,628.19831,-13682.92426,...,0,0,-87017.423079,26067.186411,14538.405886,0,6,2010,-11842.541147,1980.498617
1465,120,40804.214113,43.0,5005,15140.940355,24479.623812,1507.119632,38729.113236,628.19831,-13682.92426,...,144,0,-87017.423079,26067.186411,14538.405886,0,1,2010,-11842.541147,1980.498617


In [65]:
training_data.isnull().sum().sort_values(ascending=False)

LotFrontage      259
GarageYrBlt       81
MasVnrArea         8
KitchenAbvGr       0
GarageType         0
                ... 
ExterCond          0
ExterQual          0
MasVnrType         0
Exterior2nd        0
SaleCondition      0
Length: 79, dtype: int64

In [66]:
my_imputer = SimpleImputer()
data_with_imputed_values = my_imputer.fit_transform(training_data)

In [67]:

X_train, X_test, y_train, y_test = train_test_split(
   data_with_imputed_values, y, test_size=0.1, random_state=42)

In [68]:

reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

0.8709315509548241

In [69]:
testing_data = my_imputer.transform(testing_data)

In [70]:

preds = reg.predict(testing_data)

In [71]:
submission = pd.read_csv(DATA_DIR+'sample_submission.csv')
submission['SalePrice'] = preds

In [72]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,112586.533615
1,1462,165008.085363
2,1463,168999.099867
3,1464,183182.500011
4,1465,220574.126057


In [73]:
submission.to_csv('submission.csv', header=True, index=False)