In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config; set_config(display='diagram')

# Houses Kaggle Competition (bis 🔥) 

[<img src='https://github.com/lewagon/data-images/blob/master/ML/kaggle-batch-challenge.png?raw=true' width=600>](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

Let's re-use our previous pipeline built in the module `05-07-Ensemble-Methods` and improve final predictions using a Neural Network!

# Re-use already-built preprocessing

## Load data

In [2]:
# Let's load our training dataset
data = pd.read_csv("https://wagon-public-datasets.s3.amazonaws.com/houses_train_raw.csv")
X = data.drop(columns='SalePrice')
y = data['SalePrice']

# You don't have access to y_yest! Only Kaggle has it.
X_test = pd.read_csv("https://wagon-public-datasets.s3.amazonaws.com/houses_test_raw.csv")

print(X.shape, y.shape, X_test.shape)

(1460, 80) (1460,) (1459, 80)


## Import preprocessor

You will find the data-preprocessing pipeline that was built in our previous iteration in `utils/preprocessor.py`. 

❓ Run the cell below, and make sure you understand what the pipeline does. Look at the code in `preprocessor.py`

In [3]:
from utils.preprocessor import create_preproc
preproc = create_preproc(X)
preproc

❓ Fit the preprocessor on your train set and create your feature matrix `X_preproc` that will be used by the Neural Network

In [5]:
# YOUR CODE HERE
X_preproc = preproc.fit_transform(X,y)
X_preproc

array([[0.11977972, 0.41355932, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.21294172, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.13446535, 0.41937046, 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.19596145, 0.55786925, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.17072051, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.21156494, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ]])

In [6]:
shape_preproc_baseline = X_preproc.shape
shape_preproc_baseline

(1460, 162)

# Your prediction in Keras

This is your first **regression** task with Keras! 
- The cell below contains the compiler and the hyper-parameters we recommend you to start with.
- Kaggle's [rule](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview/evaluation) requires to minimize `rmsle` (Root Mean Square Log Error). As you can see, we were to specify `msle` direcly as loss-function with Keras! Just remember to take square-root of your loss results to read your rmsle metric.
- The best boosted-tree `rmsle` score to beat is around **0.13**

❓ **Question** ❓
- Your responsibility is to build the best model architecture, and to control the epoch number to avoid overfitting.
- We recommand you to create a train/val split upfront to visually control the validation loss thanks to `plot_history`

In [13]:
# Create a train val split here
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_preproc,y,test_size=0.3, shuffle=True)

In [20]:
from tensorflow.keras import Sequential, layers
def initialize_model(X):

    # Model architecture
    model = Sequential()
    model.add(layers.Dense(20, activation='relu', input_dim=X.shape[1]))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(20, activation='relu'))
    model.add(layers.Dense(1, activation='linear'))
    
    
    # Recommended compilator hyperparams
    model.compile(optimizer='adam',
                  loss='msle', # we directly optimize for the kaggle's metric!
    ) 
    
    return model

In [21]:
model = initialize_model(X_train)
model.summary()
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=500, # Play with this until your validation loss overfit
                    batch_size=16, # Keep batch size to 16 today
                    verbose=0)

2021-11-08 21:42:18.741999: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-08 21:42:18.886792: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                3260      
_________________________________________________________________
dense_1 (Dense)              (None, 15)                315       
_________________________________________________________________
dense_2 (Dense)              (None, 15)                240       
_________________________________________________________________
dense_3 (Dense)              (None, 20)                320       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 21        
Total params: 4,156
Trainable params: 4,156
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.evaluate(X_val, y_val)**0.5



0.1525622170784408

In [26]:
min(history.history['val_loss'])**0.5

0.15080108610926973

In [22]:
def plot_history(history):
    plt.plot(np.sqrt(history.history['loss']))
    plt.plot(np.sqrt(history.history['val_loss']))
    plt.title('Model Loss')
    plt.ylabel('MSLE')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='best')
    plt.show()

## (Optional): AutoML with Keras Tuner

Try to find the best number of neurons and hidden layers **automatically** using the amazing [Keras Tuner](https://www.tensorflow.org/tutorials/keras/keras_tuner)




In [0]:
# TODO: Solution


# 🏅FINAL SUBMISSION

Predict the house prices of your test set and submit your results to kaggle! Be careful with the format of the exported `.csv`.

In [0]:
X_test = pd.read_csv("https://wagon-public-datasets.s3.amazonaws.com/houses_test_raw.csv")

In [0]:
# YOUR CODE HERE