In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score

import plotly.express as px
import plotly.graph_objects as go
from matplotlib import pyplot as plt
import seaborn as sns

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
# Read the competition files: training set, test set and sample submission.
train, test, subs = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
        "../input/house-prices-advanced-regression-techniques/sample_submission.csv",
    )
)

In [None]:
# Print a concise summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Show the first rows of the training frame.
train.head()

In [None]:
# Show descriptive statistics for the training frame.
train.describe()

# Dataset Preprocessing


In [None]:
# Remove the "Id" column from both the training and the test frame.
train, test = (frame.drop(columns="Id") for frame in (train, test))

Let's see how many NaNs we have for each feature.

In [None]:
# Count the missing entries per column (largest first), then express them as a
# percentage of the number of training rows.
missing_per_feature = train.isna().sum().sort_values(ascending=False)
nan_count = 100 * missing_per_feature / train.shape[0]

# Bar chart: one bar per feature, height = percentage of NaN values.
fig = px.bar(
    x=nan_count.index,
    y=nan_count.values,
    labels={"y": "Nan ammount (%)", "x": "Feature"},
)
fig.show()

We can remove the features with more than 40% NaN values, while the others will be handled by replacing NaN with the respective mode (most frequent value).

In [None]:
# Columns removed from both frames; the same list is applied to train and test
# so the two keep an identical set of features.
sparse_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', "FireplaceQu"]
train = train.drop(columns=sparse_features)
test = test.drop(columns=sparse_features)

In [None]:
# Partition the training columns in a single pass:
#   - object-dtype columns are treated as categorical,
#   - every other column except the target "SalePrice" is treated as numeric.
numeric_features = []
categorical_features = []
for feature, column_dtype in train.dtypes.items():
    if column_dtype == "object":
        categorical_features.append(feature)
    elif feature != "SalePrice":
        numeric_features.append(feature)

Now we are going to replace the remaining NaN values with the mode (most frequent value) of each feature.

In [None]:
# Impute the remaining NaNs with the per-column mode (most frequent value).
#
# The modes are computed once, on the training set, and reused for the test
# set. Filling the test set with its own modes would preprocess the two frames
# with different values, whereas the later scaling and PCA steps are fitted on
# the training data only.

# Every column that has at least one NaN in either frame needs a fill value.
nans = train.isna().sum()
train_nan_columns = nans[nans > 0].index
nans = test.isna().sum()
test_nan_columns = nans[nans > 0].index
nan_columns = train_nan_columns.union(test_nan_columns)

# mode() ignores NaN and returns its values sorted, so [0] is deterministic
# even when several values are tied.
# Assumes every test column also exists in train (train only adds "SalePrice").
train_modes = {feature: train[feature].mode()[0] for feature in nan_columns}

for frame in (train, test):
    for feature, mode_value in train_modes.items():
        if feature in frame.columns:
            frame[feature] = frame[feature].fillna(mode_value)

One-hot encode the categorical features of the train and test sets.

In [None]:
# One-hot encode every categorical feature of the train and test frames.
#
# The indicator columns are first collected in dictionaries and attached with a
# single concat per frame. Inserting them one by one into the DataFrame
# fragments it and triggers pandas' PerformanceWarning. Column names
# ("<feature>_<n>") and column order are the same as with one-by-one insertion.
train_indicators = {}
test_indicators = {}
for feature in categorical_features:
    # Some string values are present in only one of the datasets, so the
    # categories are taken from the union of both to keep the columns aligned.
    categories = np.unique(list(train[feature].unique()) + list(test[feature].unique()))
    for num, value in enumerate(categories):
        indicator_name = feature + "_" + str(num)
        train_indicators[indicator_name] = (train[feature] == value).astype("int")
        test_indicators[indicator_name] = (test[feature] == value).astype("int")

# Replace the original categorical columns with their indicator columns.
# Passing the index explicitly keeps the row labels intact even when there are
# no categorical features at all.
train = pd.concat(
    [train.drop(columns=categorical_features), pd.DataFrame(train_indicators, index=train.index)],
    axis=1,
)
test = pd.concat(
    [test.drop(columns=categorical_features), pd.DataFrame(test_indicators, index=test.index)],
    axis=1,
)
    

In [None]:
# Display the training frame as it stands after one-hot encoding.
train

Standard transformation
of the train and test sets (only numeric features).

In [None]:
# Standardize the numeric features. StandardScaler is already imported in the
# notebook's first cell, so it is not imported again here.
# The scaler is fitted on the training set only and then applied unchanged to
# the test set.
scaler = StandardScaler()
train[numeric_features] = scaler.fit_transform(train[numeric_features])
test[numeric_features] = scaler.transform(test[numeric_features])

Before feeding the data to the neural network, we perform a PCA dimensionality reduction to reduce the noise in the data and to ease the computation of the neural net.

In [None]:
# Split features and target, then fit a PCA that keeps every component so the
# full cumulative explained-variance curve can be inspected.
x_train = train.drop(columns="SalePrice")
y_train = train['SalePrice']
pca = PCA(n_components=train.shape[1] - 1)
x_train = pca.fit_transform(x_train)

# cumulative_variance[i] is the variance explained by the first i + 1 components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# 1-based component counts, so the bar at x = k shows the variance of k components.
component_counts = np.arange(1, len(cumulative_variance) + 1)

# n_comp will be the number of components that explains the 95% of the data variance.
# np.where returns a zero-based position, hence the + 1 to turn it into a count.
n_comp = np.where(cumulative_variance > 0.95)[0][0] + 1

fig = go.Figure()
fig.add_trace(go.Bar(x=component_counts, y=cumulative_variance, name="Cumulative Variance"))
fig.add_trace(go.Scatter(x=component_counts, y=[0.95] * len(component_counts), name="Variance at 95%"))
fig.update_layout(
    title="How many components we need?",
    xaxis_title="Components",
    yaxis_title="Cumulative Variance",
    font=dict(
        family="Arial",
        size=18,
    ),
)
fig.show()
print("With n_components="+str(n_comp)+" we have the 95% of the data variance, so we will choose this value.")

In [None]:
# Refit the PCA that is actually used for training and prediction.
# It keeps `extra_components` more components than the 95%-variance count.
# NOTE(review): the margin of 50 looks empirically chosen — confirm.
extra_components = 50
features = train.drop(columns=["SalePrice"])
# PCA cannot extract more components than min(n_samples, n_features); capping
# the request avoids a ValueError when the margin overshoots that limit.
n_components = int(min(n_comp + extra_components, *features.shape))
pca = PCA(n_components=n_components)
x_train = pca.fit_transform(features)

# Model definition and training


In [None]:
# Fully connected regressor: two hidden blocks (2048 ReLU units followed by
# 50% dropout) and a single linear output unit.
model = tf.keras.Sequential()
for _ in range(2):
    model.add(layers.Dense(2048, activation='relu'))
    model.add(layers.Dropout(0.5))
model.add(layers.Dense(1))

# Trained with mean squared error using the Adamax optimizer.
model.compile(
    optimizer=tf.keras.optimizers.Adamax(1e-3),
    loss='mean_squared_error',
)

In [None]:
# Train silently for 300 epochs, holding out 10% of the samples for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=300,
    validation_split=0.1,
    verbose=0,
)

In [None]:
# Plot the training and validation loss curves recorded during fitting.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
# Take the epoch axis from the recorded history instead of hard-coding the
# epoch count, so the plot stays correct if the number of epochs changes.
epochs = np.arange(len(train_loss))

fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y=train_loss, mode='lines', name='Train Loss'))
fig.add_trace(go.Scatter(x=epochs, y=val_loss, mode='lines', name='Validation Loss'))
# The model is compiled with loss='mean_squared_error', so the curves are MSE.
fig.update_layout(
    title="MSE loss on train and validation set",
    xaxis_title="Epoch",
    yaxis_title="Loss",
    font=dict(
        family="Arial",
        size=18,
    ),
)
fig.show()

# Model evaluation and submission


In [None]:
# Report the final losses recorded by Keras (the model's compiled loss).
print("Validation loss:", history.history['val_loss'][-1])
print("Training loss:", history.history['loss'][-1])

# Predict once and reuse the result for both metrics.
train_preds = model.predict(x_train)
# scikit-learn metrics take (y_true, y_pred). The order does not matter for
# MAE, but R2 is not symmetric, so the true targets must come first.
# This line is labelled "MAE" because it is a different metric from the two
# loss values printed above.
print("MAE on entire train set:", mean_absolute_error(y_train, train_preds))
print("R2 score(Train):", r2_score(y_train, train_preds))

In [None]:
# Project the test set with the PCA fitted on the training data, then predict.
# The network ends in Dense(1), so predict() returns an (n, 1) array; flatten
# it to 1-D so the column assignment does not rely on implicit coercion.
sub_preds = model.predict(pca.transform(test)).ravel()
subs["SalePrice"] = sub_preds
# Write the submission file without the DataFrame index.
subs.to_csv("submission.csv", index=False)
print("Submission done!")