# Imports

In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

In [96]:
train_data = pd.read_csv("../../inputs/train/train.csv").drop("Id", axis=1)
train_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [97]:
test_data = pd.read_csv("../../inputs/test/test.csv")
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# Removing outliers

In [98]:
train_data = train_data[train_data.GrLivArea < 4000]
train_data.shape

(1456, 80)

# Missing columns in train and test data

In [99]:
cols_with_missing_train = [col for col in train_data.columns
                     if train_data[col].isnull().any()]
print(cols_with_missing_train)

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [100]:
cols_with_missing_test = [col for col in test_data.columns
                     if test_data[col].isnull().any()]
print(cols_with_missing_test)

['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType']


# Removing missing columns in train and test

In [101]:
train_data = train_data.drop(cols_with_missing_test, axis='columns')
train_data.shape

(1456, 47)

In [102]:
train_data.columns

Index(['MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond',
       'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SaleCondition', 'SalePrice'],
      dtype='object')

In [103]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 0 to 1459
Data columns (total 47 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSSubClass     1456 non-null   int64 
 1   LotArea        1456 non-null   int64 
 2   Street         1456 non-null   object
 3   LotShape       1456 non-null   object
 4   LandContour    1456 non-null   object
 5   LotConfig      1456 non-null   object
 6   LandSlope      1456 non-null   object
 7   Neighborhood   1456 non-null   object
 8   Condition1     1456 non-null   object
 9   Condition2     1456 non-null   object
 10  BldgType       1456 non-null   object
 11  HouseStyle     1456 non-null   object
 12  OverallQual    1456 non-null   int64 
 13  OverallCond    1456 non-null   int64 
 14  YearBuilt      1456 non-null   int64 
 15  YearRemodAdd   1456 non-null   int64 
 16  RoofStyle      1456 non-null   object
 17  RoofMatl       1456 non-null   object
 18  ExterQual      1456 non-null

In [104]:
test_data = test_data.drop(cols_with_missing_test, axis='columns')
test_data.shape

(1459, 47)

# Converting Categorical Variables to One-Hot Encoding

In [105]:
class OneHotEncoder(SklearnOneHotEncoder):
    def __init__(self, **kwargs):
        super(OneHotEncoder, self).__init__(**kwargs)
        self.fit_flag = False

    def fit(self, X, **kwargs):
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        sparse_matrix = super(OneHotEncoder, self).transform(X)
        new_columns = self.get_new_columns(X=X)
        d_out = pd.DataFrame(
            sparse_matrix.toarray(), columns=new_columns, index=X.index
        )
        return d_out

    def fit_transform(self, X, **kwargs):
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        new_columns = []
        for i, column in enumerate(X.columns):
            j = 0
            while j < len(self.categories_[i]):
                new_columns.append(f"{column}_<{self.categories_[i][j]}>")
                j += 1
        return new_columns


def transform(data):
    cat_columns = data.select_dtypes(include=["object"]).columns
    cat_df = data[cat_columns]

    numeric_df = data.select_dtypes(include=np.number)

    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    ohe.fit(data[cat_columns])

    df_processed = ohe.transform(cat_df)

    df_processed_full = pd.concat([df_processed, numeric_df], axis=1)

    return df_processed_full

#     string_data = data.select_dtypes(include=['object'])
#     numeric_data = data.select_dtypes(include=np.number)
#     categorical_data = data[string_data]
#     ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
#     ohe.fit(data[string_data])
#     df_processed = ohe.transform(categorical_data)
#     df_processed_full = pd.concat([df_processed, numeric_data], axis=1)
#     return df_processed_full

train_final = transform(train_data)
test_final = transform(test_data)

print(train_final.shape, test_final.shape)

(1456, 155) (1459, 144)


In [106]:
train_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 0 to 1459
Columns: 155 entries, Street_<Grvl> to SalePrice
dtypes: float64(129), int64(26)
memory usage: 1.7 MB


In [107]:
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 144 entries, Street_<Grvl> to YrSold
dtypes: float64(118), int64(26)
memory usage: 1.6 MB


In [108]:
for column in train_final.columns:
    if column != 'SalePrice' and column not in test_final.columns:
        test_final[column] = 0.0
test_final.info()

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
Here
True
Here
True
Here
True
True
True
True
True
True
True
True
True
Here
True
True
True
True
True
True
True
True
True
True
True
True
Here
True
Here
True
Here
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
Here
True
True
True
True
Here
True
True
True
True
True
True
True
True
True
True
True
True
Here
True
True
Here
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 155 entries, Street_<Grvl> to Electrical_<nan>
dtypes: float64(129), int64(26)
memory 

In [112]:
test_final.columns

Index(['Street_<Grvl>', 'Street_<Pave>', 'LotShape_<IR1>', 'LotShape_<IR2>',
       'LotShape_<IR3>', 'LotShape_<Reg>', 'LandContour_<Bnk>',
       'LandContour_<HLS>', 'LandContour_<Low>', 'LandContour_<Lvl>',
       ...
       'Condition2_<RRAn>', 'Condition2_<RRNn>', 'HouseStyle_<2.5Fin>',
       'RoofMatl_<Membran>', 'RoofMatl_<Metal>', 'RoofMatl_<Roll>',
       'Heating_<Floor>', 'Heating_<OthW>', 'Electrical_<Mix>',
       'Electrical_<nan>'],
      dtype='object', length=155)

# Using non-null 46 features to make predictions

In [113]:
final_train_data = train_final.drop('SalePrice', axis = 1)

## Batch size - 50

In [114]:
model1 = MLPRegressor(solver='adam', early_stopping = True, hidden_layer_sizes=(154, 20), random_state=1, max_iter = 2000, batch_size=50, shuffle=True)
model1.fit(final_train_data.values ,train_final['SalePrice'].values)

MLPRegressor(batch_size=50, early_stopping=True, hidden_layer_sizes=(154, 20),
             max_iter=2000, random_state=1)

In [115]:
pred = model1.predict(test_final[final_train_data.columns].values)

In [116]:
output = pd.DataFrame(columns=['Id', 'SalePrice'])
output['Id'] = test_data['Id']
output['SalePrice'] = pred

np.unique(output['SalePrice'], return_index=False, return_inverse=False, return_counts=True, axis=None)

(array([ 44158.33390457,  45959.22270662,  49630.12366328, ...,
        492486.64259212, 557206.1257662 , 612750.10902466]),
 array([1, 1, 1, ..., 1, 1, 1]))

In [118]:
output.to_csv("../../outputs/output_without_null_features_config_1.csv", index=None)

## Batch size - 100

In [119]:
model2 = MLPRegressor(solver='adam', early_stopping = True, hidden_layer_sizes=(154, 20), random_state=1, max_iter = 2000, batch_size=100, shuffle=True)
model2.fit(final_train_data.values ,train_final['SalePrice'].values)

MLPRegressor(batch_size=100, early_stopping=True, hidden_layer_sizes=(154, 20),
             max_iter=2000, random_state=1)

In [120]:
pred = model2.predict(test_final[final_train_data.columns].values)
output = pd.DataFrame(columns=['Id', 'SalePrice'])
output['Id'] = test_data['Id']
output['SalePrice'] = pred

np.unique(output['SalePrice'], return_index=False, return_inverse=False, return_counts=True, axis=None)
output.to_csv("../../outputs/output_without_null_features_config_2.csv", index=None)

## Batch size - 200

In [121]:
model3 = MLPRegressor(solver='adam', early_stopping = True, hidden_layer_sizes=(154, 20), random_state=1, max_iter = 2000, batch_size=200, shuffle=True)
model3.fit(final_train_data.values ,train_final['SalePrice'].values)

MLPRegressor(batch_size=200, early_stopping=True, hidden_layer_sizes=(154, 20),
             max_iter=2000, random_state=1)

In [122]:
pred = model3.predict(test_final[final_train_data.columns].values)
output = pd.DataFrame(columns=['Id', 'SalePrice'])
output['Id'] = test_data['Id']
output['SalePrice'] = pred

np.unique(output['SalePrice'], return_index=False, return_inverse=False, return_counts=True, axis=None)
output.to_csv("../../outputs/output_without_null_features_config_3.csv", index=None)

## Batch size - 400

In [123]:
model4 = MLPRegressor(solver='adam', early_stopping = True, hidden_layer_sizes=(154, 20), random_state=1, max_iter = 2000, batch_size=400, shuffle=True)
model4.fit(final_train_data.values ,train_final['SalePrice'].values)

MLPRegressor(batch_size=400, early_stopping=True, hidden_layer_sizes=(154, 20),
             max_iter=2000, random_state=1)

In [124]:
pred = model4.predict(test_final[final_train_data.columns].values)
output = pd.DataFrame(columns=['Id', 'SalePrice'])
output['Id'] = test_data['Id']
output['SalePrice'] = pred

np.unique(output['SalePrice'], return_index=False, return_inverse=False, return_counts=True, axis=None)
output.to_csv("../../outputs/output_without_null_features_config_4.csv", index=None)

## Batch size - 20

In [125]:
model5 = MLPRegressor(solver='adam', early_stopping = True, hidden_layer_sizes=(154, 20), random_state=1, max_iter = 2000, batch_size=20, shuffle=True)
model5.fit(final_train_data.values ,train_final['SalePrice'].values)

MLPRegressor(batch_size=20, early_stopping=True, hidden_layer_sizes=(154, 20),
             max_iter=2000, random_state=1)

In [126]:
pred = model5.predict(test_final[final_train_data.columns].values)
output = pd.DataFrame(columns=['Id', 'SalePrice'])
output['Id'] = test_data['Id']
output['SalePrice'] = pred

np.unique(output['SalePrice'], return_index=False, return_inverse=False, return_counts=True, axis=None)
output.to_csv("../../outputs/output_without_null_features_config_5.csv", index=None)

## Batch size - 10

In [127]:
model6 = MLPRegressor(solver='adam', early_stopping = True, hidden_layer_sizes=(154, 20), random_state=1, max_iter = 2000, batch_size=10, shuffle=True)
model6.fit(final_train_data.values ,train_final['SalePrice'].values)

MLPRegressor(batch_size=10, early_stopping=True, hidden_layer_sizes=(154, 20),
             max_iter=2000, random_state=1)

In [128]:
pred = model6.predict(test_final[final_train_data.columns].values)
output = pd.DataFrame(columns=['Id', 'SalePrice'])
output['Id'] = test_data['Id']
output['SalePrice'] = pred

np.unique(output['SalePrice'], return_index=False, return_inverse=False, return_counts=True, axis=None)
output.to_csv("../../outputs/output_without_null_features_config_6.csv", index=None)