# Imports

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

In [77]:
# Load the training set, then drop Id so it is not carried along as a feature.
train_data = pd.read_csv("../../inputs/train/train.csv")
train_data = train_data.drop(columns=["Id"])
train_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [78]:
# Id is kept on the test frame: it is copied into the submission frame later
# (output['Id'] = test_data['Id']).
test_data = pd.read_csv("../../inputs/test/test.csv")
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# Removing outliers

In [79]:
# Rows whose GrLivArea reaches this value are treated as outliers and dropped.
GRLIVAREA_OUTLIER_CUTOFF = 4000

# .copy() detaches the filtered frame from the frame it was sliced from, so the
# fills and column assignments in later cells operate on an independent
# DataFrame rather than on something pandas tracks as a slice.
train_data = train_data[train_data.GrLivArea < GRLIVAREA_OUTLIER_CUTOFF].copy()
train_data.shape

(1456, 80)

# Columns with missing values in train and test data

In [80]:
# Names of the training columns that contain at least one null, in column order.
cols_with_missing_train = train_data.columns[
    train_data.isnull().any().to_numpy()
].tolist()
print(cols_with_missing_train)

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [81]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1456 non-null   int64  
 1   MSZoning       1456 non-null   object 
 2   LotFrontage    1197 non-null   float64
 3   LotArea        1456 non-null   int64  
 4   Street         1456 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1456 non-null   object 
 7   LandContour    1456 non-null   object 
 8   Utilities      1456 non-null   object 
 9   LotConfig      1456 non-null   object 
 10  LandSlope      1456 non-null   object 
 11  Neighborhood   1456 non-null   object 
 12  Condition1     1456 non-null   object 
 13  Condition2     1456 non-null   object 
 14  BldgType       1456 non-null   object 
 15  HouseStyle     1456 non-null   object 
 16  OverallQual    1456 non-null   int64  
 17  OverallCond    1456 non-null   int64  
 18  YearBuil

In [82]:
# Names of the test columns that contain at least one null, in column order.
cols_with_missing_test = test_data.columns[
    test_data.isnull().any().to_numpy()
].tolist()
print(cols_with_missing_test)

['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType']


# Impute missing values in train and test

In [83]:
# Per-column fill values applied to both frames. The string labels mark the
# feature as absent ("No Pool", "No Garage", ...); GarageArea, MasVnrArea and
# LotFrontage are numeric columns and are filled with zero instead.
missing_cat_values = {
    "PoolQC": "No Pool",
    "MiscFeature": "No Feature",
    "Alley": "No alley access",
    "Fence": "No Fence",
    "FireplaceQu": "No Fireplace",
    "GarageCond": "No Garage",
    "GarageType": "No Garage",
    "GarageArea": 0,
    "GarageFinish": "No Garage",
    "GarageQual": "No Garage",
    "BsmtExposure": "No Basement",
    "BsmtFinType2": "Not Applicable",
    "BsmtFinType1": "Not Applicable",
    "BsmtCond": "No Basement",
    "BsmtQual": "No Basement",
    "MasVnrArea": 0.0,
    "MasVnrType": "No Veneer",
    "LotFrontage": 0.0,
}

# Rebind instead of inplace=True: the result is a fresh frame, so re-running
# the cell is safe and nothing is mutated through a slice of another frame.
train_data = train_data.fillna(value=missing_cat_values)
test_data = test_data.fillna(value=missing_cat_values)

# Remaining columns to fill with their most frequent value: every object
# column of the training frame, plus GarageYrBlt.
object_features = train_data.select_dtypes(include=["object"]).columns
object_features = object_features.values
object_features = np.append(object_features, 'GarageYrBlt')

for feature in object_features:
    # Assign the filled column back explicitly. `df[col].fillna(..., inplace=True)`
    # is chained in-place assignment: it may act on a temporary Series and leaves
    # the frame untouched under pandas Copy-on-Write.
    # Each frame is filled with its own mode, as before.
    train_data[feature] = train_data[feature].fillna(train_data[feature].mode()[0])
    test_data[feature] = test_data[feature].fillna(test_data[feature].mode()[0])

In [84]:
# Re-check after imputation: training columns that still contain nulls.
cols_with_missing_train = train_data.columns[
    train_data.isnull().any().to_numpy()
].tolist()
print(cols_with_missing_train)

[]


In [85]:
# Re-check after imputation: test columns that still contain nulls.
cols_with_missing_test = test_data.columns[
    test_data.isnull().any().to_numpy()
].tolist()
print(cols_with_missing_test)

['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars']


In [86]:
# Fill the test columns that still contain nulls with 0.
# The filled column is assigned back explicitly; calling
# `test_data[col].fillna(0, inplace=True)` is chained in-place assignment and
# does not reliably update the frame (no effect under pandas Copy-on-Write).
for col in cols_with_missing_test:
    test_data[col] = test_data[col].fillna(0)

In [87]:
# Final check: test columns that still contain nulls (expected to be none).
cols_with_missing_test = test_data.columns[
    test_data.isnull().any().to_numpy()
].tolist()
print(cols_with_missing_test)

[]


In [88]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1456 non-null   int64  
 1   MSZoning       1456 non-null   object 
 2   LotFrontage    1456 non-null   float64
 3   LotArea        1456 non-null   int64  
 4   Street         1456 non-null   object 
 5   Alley          1456 non-null   object 
 6   LotShape       1456 non-null   object 
 7   LandContour    1456 non-null   object 
 8   Utilities      1456 non-null   object 
 9   LotConfig      1456 non-null   object 
 10  LandSlope      1456 non-null   object 
 11  Neighborhood   1456 non-null   object 
 12  Condition1     1456 non-null   object 
 13  Condition2     1456 non-null   object 
 14  BldgType       1456 non-null   object 
 15  HouseStyle     1456 non-null   object 
 16  OverallQual    1456 non-null   int64  
 17  OverallCond    1456 non-null   int64  
 18  YearBuil

# Converting Categorical Variables to One-Hot Encoding

In [89]:
class OneHotEncoder(SklearnOneHotEncoder):
    """scikit-learn ``OneHotEncoder`` whose ``transform`` returns a DataFrame.

    Output columns are named ``"{column}_<{category}>"`` (for example
    ``MSZoning_<RL>``) and the index of the input frame is preserved.

    NOTE(review): ``__init__`` forwards ``**kwargs`` instead of declaring
    explicit parameters, which is not the usual scikit-learn estimator
    convention; ``get_params``/``clone`` may not see the forwarded options.
    Confirm before using this class inside pipelines or model selection.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the scikit-learn encoder."""
        super().__init__(**kwargs)
        # Set to True once fit() has completed.
        self.fit_flag = False

    def fit(self, X, y=None, **kwargs):
        """Learn the categories of each column of ``X``.

        ``y`` and any extra keyword arguments are accepted for API
        compatibility and ignored. Returns whatever the parent ``fit`` returns.
        """
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        """Encode the DataFrame ``X`` and return a DataFrame with named columns.

        Works with both sparse and dense encoder output: the result is
        densified only when it exposes ``toarray``.
        """
        encoded = super().transform(X)
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        return pd.DataFrame(
            encoded, columns=self.get_new_columns(X=X), index=X.index
        )

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on ``X`` and return its encoded DataFrame; ``y`` is ignored."""
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        """Return the output column names for ``X``.

        Names are produced column by column, one per fitted category, in the
        order stored in ``self.categories_``.
        """
        return [
            f"{column}_<{category}>"
            for column, categories in zip(X.columns, self.categories_)
            for category in categories
        ]


def transform(data, encoder=None):
    """One-hot encode the object columns of ``data`` and keep its numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. Columns that are neither object nor numeric are dropped.
    encoder : optional
        An already fitted encoder whose ``transform`` returns a DataFrame
        indexed like its input. When omitted (the default) a new
        ``OneHotEncoder`` is fitted on ``data`` itself, so two frames encoded
        by separate calls can end up with different one-hot columns. Passing
        the same fitted encoder for both frames keeps their columns identical.

    Returns
    -------
    pandas.DataFrame
        Encoded categorical columns followed by the numeric columns.
    """
    cat_df = data.select_dtypes(include=["object"])
    numeric_df = data.select_dtypes(include=np.number)

    # Nothing to encode: return the numeric part rather than fitting an
    # encoder on a frame with zero columns.
    if cat_df.shape[1] == 0:
        return numeric_df.copy()

    if encoder is None:
        encoder = OneHotEncoder(categories="auto", handle_unknown="ignore")
        encoder.fit(cat_df)

    encoded_df = encoder.transform(cat_df)

    return pd.concat([encoded_df, numeric_df], axis=1)

#     string_data = data.select_dtypes(include=['object'])
#     numeric_data = data.select_dtypes(include=np.number)
#     categorical_data = data[string_data]
#     ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
#     ohe.fit(data[string_data])
#     df_processed = ohe.transform(categorical_data)
#     df_processed_full = pd.concat([df_processed, numeric_data], axis=1)
#     return df_processed_full

# Each call fits its own encoder on the frame it is given, so the two results
# can have different one-hot columns; the shapes printed below show the widths.
train_final = transform(train_data)
test_final = transform(test_data)

print(train_final.shape, test_final.shape)

(1456, 303) (1459, 286)


In [90]:
train_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 0 to 1459
Columns: 303 entries, MSZoning_<C (all)> to SalePrice
dtypes: float64(269), int64(34)
memory usage: 3.4 MB


In [91]:
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 286 entries, MSZoning_<C (all)> to YrSold
dtypes: float64(260), int64(26)
memory usage: 3.2 MB


In [92]:
# Training columns (target excluded) that the encoded test frame does not have.
absent_from_test = [
    name
    for name in train_final.columns
    if name != 'SalePrice' and name not in test_final.columns
]

# Add each one to the test frame as an all-zero column.
for name in absent_from_test:
    test_final[name] = 0.0
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 303 entries, MSZoning_<C (all)> to MiscFeature_<TenC>
dtypes: float64(277), int64(26)
memory usage: 3.4 MB


In [93]:
test_final.columns

Index(['MSZoning_<C (all)>', 'MSZoning_<FV>', 'MSZoning_<RH>', 'MSZoning_<RL>',
       'MSZoning_<RM>', 'Street_<Grvl>', 'Street_<Pave>', 'Alley_<Grvl>',
       'Alley_<No alley access>', 'Alley_<Pave>',
       ...
       'RoofMatl_<Roll>', 'Exterior1st_<ImStucc>', 'Exterior1st_<Stone>',
       'Exterior2nd_<Other>', 'Heating_<Floor>', 'Heating_<OthW>',
       'Electrical_<Mix>', 'GarageQual_<Ex>', 'PoolQC_<Fa>',
       'MiscFeature_<TenC>'],
      dtype='object', length=303)

# Using all features to make predictions

In [94]:
# Training feature matrix: every encoded column except the SalePrice target.
final_train_data = train_final.drop(columns=['SalePrice'])

## Batch size - 50

In [95]:
# MLP with three hidden layers, trained with Adam in mini-batches of 50.
model1 = MLPRegressor(
    hidden_layer_sizes=(302, 50, 20),
    solver='adam',
    batch_size=50,
    max_iter=2000,
    early_stopping=True,
    shuffle=True,
    random_state=1,
)
model1.fit(final_train_data.values, train_final['SalePrice'].values)

MLPRegressor(batch_size=50, early_stopping=True,
             hidden_layer_sizes=(302, 50, 20), max_iter=2000, random_state=1)

In [96]:
# Select the test columns by the training feature names so the positional
# array passed to the model has the same column order it was fitted on.
pred = model1.predict(test_final[final_train_data.columns].values)

In [97]:
# Submission frame: the test Ids alongside the predicted SalePrice.
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})

# Distinct predicted values and how often each occurs (displayed as the cell output).
np.unique(output['SalePrice'], return_counts=True)

(array([ 61002.833256  ,  61633.73277424,  66936.02083771, ...,
        554076.2092753 , 568162.12250636, 595319.38177264]),
 array([1, 1, 1, ..., 1, 1, 1]))

In [98]:
# Write the submission without the DataFrame index column.
output.to_csv("../../outputs/all_features_config_1.csv", index=False)

## Batch size - 100

In [99]:
# Same network and settings as the other runs, with mini-batches of 100.
model2 = MLPRegressor(
    hidden_layer_sizes=(302, 50, 20),
    solver='adam',
    batch_size=100,
    max_iter=2000,
    early_stopping=True,
    shuffle=True,
    random_state=1,
)
model2.fit(final_train_data.values, train_final['SalePrice'].values)

MLPRegressor(batch_size=100, early_stopping=True,
             hidden_layer_sizes=(302, 50, 20), max_iter=2000, random_state=1)

In [100]:
# Select the test columns by the training feature names so the positional
# array passed to the model has the same column order it was fitted on.
pred = model2.predict(test_final[final_train_data.columns].values)

# Submission frame: the test Ids alongside the predicted SalePrice.
# (The former mid-cell np.unique(...) call was removed: its result was neither
# stored nor displayed.)
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})
output.to_csv("../../outputs/all_features_config_2.csv", index=False)

## Batch size - 200

In [101]:
# Same network and settings as the other runs, with mini-batches of 200.
model3 = MLPRegressor(
    hidden_layer_sizes=(302, 50, 20),
    solver='adam',
    batch_size=200,
    max_iter=2000,
    early_stopping=True,
    shuffle=True,
    random_state=1,
)
model3.fit(final_train_data.values, train_final['SalePrice'].values)

MLPRegressor(batch_size=200, early_stopping=True,
             hidden_layer_sizes=(302, 50, 20), max_iter=2000, random_state=1)

In [102]:
# Select the test columns by the training feature names so the positional
# array passed to the model has the same column order it was fitted on.
pred = model3.predict(test_final[final_train_data.columns].values)

# Submission frame: the test Ids alongside the predicted SalePrice.
# (The former mid-cell np.unique(...) call was removed: its result was neither
# stored nor displayed.)
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})
output.to_csv("../../outputs/all_features_config_3.csv", index=False)

## Batch size - 400

In [103]:
# Same network and settings as the other runs, with mini-batches of 400.
model4 = MLPRegressor(
    hidden_layer_sizes=(302, 50, 20),
    solver='adam',
    batch_size=400,
    max_iter=2000,
    early_stopping=True,
    shuffle=True,
    random_state=1,
)
model4.fit(final_train_data.values, train_final['SalePrice'].values)

MLPRegressor(batch_size=400, early_stopping=True,
             hidden_layer_sizes=(302, 50, 20), max_iter=2000, random_state=1)

In [104]:
# Select the test columns by the training feature names so the positional
# array passed to the model has the same column order it was fitted on.
pred = model4.predict(test_final[final_train_data.columns].values)

# Submission frame: the test Ids alongside the predicted SalePrice.
# (The former mid-cell np.unique(...) call was removed: its result was neither
# stored nor displayed.)
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})
output.to_csv("../../outputs/all_features_config_4.csv", index=False)

## Batch size - 20

In [105]:
# Same network and settings as the other runs, with mini-batches of 20.
model5 = MLPRegressor(
    hidden_layer_sizes=(302, 50, 20),
    solver='adam',
    batch_size=20,
    max_iter=2000,
    early_stopping=True,
    shuffle=True,
    random_state=1,
)
model5.fit(final_train_data.values, train_final['SalePrice'].values)

MLPRegressor(batch_size=20, early_stopping=True,
             hidden_layer_sizes=(302, 50, 20), max_iter=2000, random_state=1)

In [106]:
# Select the test columns by the training feature names so the positional
# array passed to the model has the same column order it was fitted on.
pred = model5.predict(test_final[final_train_data.columns].values)

# Submission frame: the test Ids alongside the predicted SalePrice.
# (The former mid-cell np.unique(...) call was removed: its result was neither
# stored nor displayed.)
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})
output.to_csv("../../outputs/all_features_config_5.csv", index=False)

## Batch size - 10

In [107]:
# Same network and settings as the other runs, with mini-batches of 10.
model6 = MLPRegressor(
    hidden_layer_sizes=(302, 50, 20),
    solver='adam',
    batch_size=10,
    max_iter=2000,
    early_stopping=True,
    shuffle=True,
    random_state=1,
)
model6.fit(final_train_data.values, train_final['SalePrice'].values)

MLPRegressor(batch_size=10, early_stopping=True,
             hidden_layer_sizes=(302, 50, 20), max_iter=2000, random_state=1)

In [108]:
# Select the test columns by the training feature names so the positional
# array passed to the model has the same column order it was fitted on.
pred = model6.predict(test_final[final_train_data.columns].values)

# Submission frame: the test Ids alongside the predicted SalePrice.
# (The former mid-cell np.unique(...) call was removed: its result was neither
# stored nor displayed.)
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})
output.to_csv("../../outputs/all_features_config_6.csv", index=False)