# Regression Problem

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from keras import models
from keras import layers
from keras.metrics import mean_absolute_error
from sklearn.model_selection import KFold

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset file can be read from it
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Load Dataset

In [None]:
# Read dataset from google drive.
# The file is whitespace-delimited and has no header row, so the column names
# are supplied explicitly; the last column (MEDV) is the regression target.
HOUSING_COLUMNS = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                   'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

df = pd.read_csv(
    '/content/gdrive/MyDrive/Colab Notebooks/housing.data',
    sep=r'\s+',  # equivalent to delim_whitespace=True, which is deprecated since pandas 2.2
    header=None,
    names=HOUSING_COLUMNS,
)
df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [None]:
# Sanity check: dimensions of the loaded frame as (rows, columns)
df.shape

(506, 14)

## Preprocess

### Separate features and label




In [None]:
# Separate the target: the last column is the label, every column before it is a feature
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features]
y = df.iloc[:, n_features]

In [None]:
# Preview the first five feature rows (head() defaults to n=5)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [None]:
# Preview the first five target values (head() defaults to n=5)
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64

### Split data to test/train

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Report the shape of each partition
for label, part in (("X_train", X_train), ("y_train", y_train),
                    ("X_test", X_test), ("y_test", y_test)):
    print(f"{label} dataset: ", part.shape)

X_train dataset:  (404, 13)
y_train dataset:  (404,)
X_test dataset:  (102, 13)
y_test dataset:  (102,)


### Normalize data
For the type of normalization described in the question, we should use the z-score method.
The z-score method (standardization) transforms each feature so that it has a mean of 0 and a standard deviation of 1: $z = (x - \mu) / \sigma$.

In [None]:
# Z-score standardization of the training features: each column is centred on
# its own mean and divided by its own standard deviation.
X_train_scaled = X_train.apply(
    lambda feature: (feature - feature.mean()) / feature.std()
)

X_train_scaled.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
355,-0.401334,2.931428,-1.307359,-0.27255,-1.200345,-0.473519,-1.725043,3.171089,-0.598467,-0.408403,1.665298,0.194907,-0.988461
460,0.171557,-0.48658,1.003605,-0.27255,1.395894,0.608392,0.764301,-0.572458,1.752341,1.587163,0.835672,-1.190983,0.502556
120,-0.405912,-0.48658,2.081293,-0.27255,0.253549,-0.566861,0.047511,-0.7317,-0.833548,-1.285971,0.328679,0.345301,0.220843
346,-0.406809,-0.48658,-0.953363,-0.27255,-0.949375,-0.527261,-0.56688,1.965714,-0.716007,-0.30021,0.190408,0.063786,-0.012773
457,0.584094,-0.48658,1.003605,-0.27255,1.395894,-0.473519,0.421795,-0.487307,1.752341,1.587163,0.835672,-4.078743,0.574015


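Apply the same transformation to the test-set features, but using the mean and standard deviation computed on the training set, so that no information from the test set leaks into preprocessing.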

In [None]:
# Standardize the test features with the TRAINING statistics: every column is
# centred on the training mean and divided by the training standard deviation.
X_test_scaled = X_test.apply(
    lambda feature: (feature - X_train[feature.name].mean()) / X_train[feature.name].std()
)

X_test_scaled.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
357,0.054383,-0.48658,1.003605,3.659963,1.88918,0.175628,0.79961,-0.615713,1.752341,1.587163,0.835672,0.370424,0.06968
337,-0.410609,-0.48658,-0.839171,-0.27255,-0.317624,-0.531504,-0.309118,0.841647,-0.480926,-1.069585,0.835672,0.41023,-0.302731
327,-0.384966,-0.48658,-0.52657,-0.27255,-0.508015,-0.265622,-0.870545,0.748342,-0.480926,-0.690908,0.55913,0.434206,0.003718
13,-0.337639,-0.48658,-0.418087,-0.27255,-0.118579,-0.455134,-0.231437,0.416361,-0.598467,-0.570693,1.204395,0.434206,-0.618799
418,8.538325,-0.48658,1.003605,-0.27255,1.101654,-0.44382,1.117399,-0.944976,1.752341,1.587163,0.835672,-3.930185,1.079724


## MLP

In [None]:
def design_model(n_features=13):
  """Build and compile the MLP used for the regression task.

  The network is a Sequential stack of three ReLU hidden layers
  (256, 64 and 16 units) followed by a single linear output unit.
  It is compiled with the RMSprop optimizer, mean squared error as the
  loss and mean absolute error as the reported metric.

  Args:
    n_features: number of input features per sample. Defaults to 13,
      the number of feature columns in this notebook's dataset.

  Returns:
    A compiled, untrained Sequential model.
  """
  # Architecture
  model = models.Sequential()
  ## 3 hidden layers (only the first one has to declare the input shape)
  model.add(layers.Dense(256, activation='relu', input_shape=(n_features,)))
  model.add(layers.Dense(64, activation='relu'))
  model.add(layers.Dense(16, activation='relu'))
  ## output layer: one unit without activation, i.e. an unbounded scalar prediction
  model.add(layers.Dense(1))

  # Compilation
  model.compile(optimizer='rmsprop', loss='mean_squared_error', metrics=['mean_absolute_error'])
  return model

In [None]:
# 5-fold cross-validation on the (standardized) training set.
# No shuffling is requested here; the rows were already shuffled by train_test_split.
folds = KFold(n_splits=5)

# One validation MAE per fold. The values are kept exactly as returned by
# keras' mean_absolute_error (the reporting cell calls .numpy() on them).
mae_values = []

for train_idx, val_idx in folds.split(X_train_scaled):
  X_trn, X_val = X_train_scaled.iloc[train_idx,:], X_train_scaled.iloc[val_idx,:]
  y_trn, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

  # Start every fold from freshly initialised weights. Reusing one model across
  # folds would mean that, from the second fold on, the validation rows had
  # already been used for training in an earlier fold, which leaks information
  # and makes the validation MAE optimistic.
  model = design_model()
  model.fit(X_trn, y_trn, epochs=4, batch_size=1)

  # predict() output is reshaped to match y_val so the error is computed element-wise
  y_pred = model.predict(X_val).reshape(y_val.shape)

  mae = mean_absolute_error(y_val, y_pred)
  mae_values.append(mae)

# Final model: refit from scratch on the whole training set with the same
# settings, so that `model` used for the test-set evaluation has seen all
# training rows rather than only the last fold's subset.
model = design_model()
model.fit(X_train_scaled, y_train, epochs=4, batch_size=1)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Show the validation MAE of every fold as plain numbers (one .numpy() call per stored value)
print('MAE of each fold:')
print([fold_mae.numpy() for fold_mae in mae_values])

MAE of each fold:
[2.441103, 2.689054, 2.1749094, 2.2166903, 1.9808795]


In [None]:
# Evaluate on the held-out test set; evaluate() yields the loss followed by the compiled metric
test_mse, test_mae = model.evaluate(X_test_scaled, y_test)

for caption, score in (('Loss of testset (MSE):', test_mse),
                       ('Metric of testset (MAE):', test_mae)):
    print(caption, score)

Loss of testset (MSE): 26.55545425415039
Metric of testset (MAE): 2.99776029586792
