## Learn model
## Content
  * Load data
  * Select features
  * Learning
    * linear
    * lasso
    * ridge
    * elastic net
    * XGBoost
    * MLP
    * random forest
    * k-nearest neighbors
    * SVR
  * Compare results

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pymongo import MongoClient
from keras import metrics
import warnings
import sklearn.linear_model as linear_model
import sklearn.ensemble as ensemble
import sklearn.neighbors as neighbors
import sklearn.svm as svm

%matplotlib inline

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# The connection string is read from the environment so no credentials live in
# the notebook; a missing AMES_MONGODB_URI raises KeyError here.
MONGODB_URL = os.environ['AMES_MONGODB_URI']
client = MongoClient(MONGODB_URL)
# Presumably resolves to the database named inside the URI — confirm the URI includes one.
db = client.get_default_database()

In [None]:
# Fetch every document of the "notebook" collection.
data = db["notebook"].find({})
# Materialise the cursor into a frame and discard the "_id" column in one expression.
full_frame = pd.DataFrame(list(data)).drop(columns=["_id"])
full_frame.shape

### Select features

In [None]:
# Columns of full_frame that are fed to the models unchanged.
# NOTE(review): the list is empty as committed — fill it in before running the model cells.
features = [
#  'GrLivArea',
]

# Columns that additionally get a log1p-transformed copy named '<column>Log'.
to_log_transform = [
#     'GrLivArea', 
]

# Columns that additionally get a squared copy named '<column>2'.
to_pow_transform = [
#     'GrLivArea', 
]

# Source column -> {'new_feature_name': name of the derived 0/1 column,
#                   'threshold': value the source must exceed for the flag to be 1}.
to_boolean_transform = {
#     'TotalBsmtSF': {'new_feature_name': 'HasBasement', 'threshold': 0},
}

## Learn model

In [None]:
def log_transformation(frame, feature):
    """Add a log1p-transformed copy of ``feature`` to ``frame`` in place.

    The new column is named by ``new_log_feature_name(feature)``; nothing is returned.
    """
    frame[new_log_feature_name(feature)] = np.log1p(frame[feature].values)

_QUADRATIC_SUFFIX = '2'


def new_quadratic_feature_name(feature):
    """Return the column name used for the squared copy of ``feature``."""
    return feature + _QUADRATIC_SUFFIX

_LOG_SUFFIX = 'Log'


def new_log_feature_name(feature):
    """Return the column name used for the log-transformed copy of ``feature``."""
    return feature + _LOG_SUFFIX
    
def quadratic(frame, feature):
    """Add a squared copy of ``feature`` to ``frame`` in place.

    The new column is named by ``new_quadratic_feature_name(feature)``; nothing is returned.
    """
    frame[new_quadratic_feature_name(feature)] = frame[feature].pow(2)
    
def boolean_transformation(frame, feature, new_feature_name, threshold):
    """Add a 0/1 indicator column to ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that receives the new column.
    feature : str
        Existing column to compare against ``threshold``.
    new_feature_name : str
        Name of the indicator column to create (or overwrite).
    threshold : scalar
        The indicator is 1 where ``frame[feature] > threshold`` and 0 elsewhere.
        Missing values compare as False and therefore map to 0.
    """
    # Vectorized comparison instead of a per-row Python lambda.
    frame[new_feature_name] = (frame[feature] > threshold).astype('int64')
    
def error(actual, predicted):
    """Root-mean-squared error between the natural logs of two value arrays.

    Parameters
    ----------
    actual, predicted : array-like of positive numbers
        Reference values and predictions. Both are flattened to 1-D first, so a
        column vector of shape (n, 1) is scored element-wise against an (n,)
        array instead of broadcasting to an n-by-n matrix.

    Returns
    -------
    float
        sqrt(mean((log(actual) - log(predicted)) ** 2)).
    """
    log_actual = np.log(np.asarray(actual, dtype=float).ravel())
    log_predicted = np.log(np.asarray(predicted, dtype=float).ravel())
    return np.sqrt(np.mean(np.square(log_actual - log_predicted)))

def error_mse(actual, predicted):
    """Root-mean-squared error between two value arrays on their original scale.

    Despite the name, the square root is taken, so the result is an RMSE, not an MSE.

    Parameters
    ----------
    actual, predicted : array-like of numbers
        Reference values and predictions. Both are flattened to 1-D first, so a
        column vector of shape (n, 1) is scored element-wise against an (n,)
        array instead of broadcasting to an n-by-n matrix.

    Returns
    -------
    float
        sqrt(mean((actual - predicted) ** 2)).
    """
    actual_flat = np.asarray(actual, dtype=float).ravel()
    predicted_flat = np.asarray(predicted, dtype=float).ravel()
    return np.sqrt(np.mean(np.square(actual_flat - predicted_flat)))

In [None]:
# Names of the columns that the transforms will append, one list per transform kind.
added_boolean_columns = [spec['new_feature_name'] for spec in to_boolean_transform.values()]

added_quadratic_columns = [new_quadratic_feature_name(name) for name in to_pow_transform]

added_log_columns = [new_log_feature_name(name) for name in to_log_transform]

def transform_before_learn(frame, to_log_transform, to_pow_transform, to_boolean_transform):
    """Append every configured engineered column to ``frame`` in place.

    Log copies are added first, then squared copies, then boolean indicators.
    ``to_boolean_transform`` maps a source column to a dict holding the
    'new_feature_name' and 'threshold' of its indicator. Nothing is returned.
    """
    for column in to_log_transform:
        log_transformation(frame, column)

    for column in to_pow_transform:
        quadratic(frame, column)

    for column, spec in to_boolean_transform.items():
        boolean_transformation(frame, column, spec['new_feature_name'], spec['threshold'])


# Engineered columns are appended to full_frame itself.
transform_before_learn(full_frame, to_log_transform, to_pow_transform, to_boolean_transform)

# The first 1460 rows form the training set; every later row is the test set.
TRAIN_ROW_COUNT = 1460
df_train = full_frame.iloc[:TRAIN_ROW_COUNT]
df_test = full_frame.iloc[TRAIN_ROW_COUNT:]

# features = list(set(features) - set(to_log_transform) - set(to_pow_transform))

In [None]:
# Model input columns: the raw features followed by every engineered column.
features_full_list = [
    *features,
    *added_boolean_columns,
    *added_quadratic_columns,
    *added_log_columns,
]

## Outliers

In [None]:
# No rows are removed at present: df_train_cleaned is the same object as df_train.
df_train_cleaned = df_train
# Candidate outlier removals, selected by Id. Both lines start from df_train, so
# enabling both as written would keep only the second drop.
#df_train_cleaned = df_train.drop(df_train[df_train['Id'] == 1299].index)
#df_train_cleaned = df_train.drop(df_train[df_train['Id'] == 524].index)

### LinearRegression

In [None]:
# Fit on the cleaned rows; score on every training row.
X, Y = df_train_cleaned[features_full_list], df_train_cleaned['SalePrice'].values
full_X, full_Y = df_train[features_full_list], df_train['SalePrice'].values

# The model learns log1p(SalePrice); expm1 maps its predictions back to prices.
linear = linear_model.LinearRegression().fit(X, np.log1p(Y))
Ypred_linear = np.expm1(linear.predict(full_X))

# Log-scale RMSE first, then raw-scale RMSE.
for score_fn in (error, error_mse):
    print(score_fn(full_Y, Ypred_linear))

#### test dataset

In [None]:
# Held-out rows: same feature columns, actual sale prices as the reference.
full_test_X, test_Y = df_test[features_full_list], df_test['SalePrice'].values

# Predictions are on the log1p scale; expm1 converts them to prices.
test_pred = np.expm1(linear.predict(full_test_X))

for score_fn in (error, error_mse):
    print(score_fn(test_Y, test_pred))

### LassoCV

In [None]:
# Fit on the cleaned rows; score on every training row.
X = df_train_cleaned[features_full_list]
Y = df_train_cleaned['SalePrice'].values

full_X = df_train[features_full_list]
full_Y = df_train['SalePrice'].values

# LassoCV is fitted on log1p(SalePrice); expm1 maps predictions back to prices.
lasso = linear_model.LassoCV()
lasso.fit(X, np.log1p(Y))

Ypred_lasso = np.expm1(lasso.predict(full_X))
print(error(full_Y, Ypred_lasso))
# Both metrics must score the lasso predictions, not those of the linear model.
print(error_mse(full_Y, Ypred_lasso))

#### test dataset

In [None]:
# Held-out rows: same feature columns, actual sale prices as the reference.
full_test_X, test_Y = df_test[features_full_list], df_test['SalePrice'].values

# Lasso predictions, converted from the log1p scale back to prices.
test_pred = np.expm1(lasso.predict(full_test_X))

for score_fn in (error, error_mse):
    print(score_fn(test_Y, test_pred))

### RidgeCV

In [None]:
# Fit on the cleaned rows; score on every training row.
X, Y = df_train_cleaned[features_full_list], df_train_cleaned['SalePrice'].values
full_X, full_Y = df_train[features_full_list], df_train['SalePrice'].values

# RidgeCV is fitted on log1p(SalePrice); expm1 maps predictions back to prices.
ridge = linear_model.RidgeCV().fit(X, np.log1p(Y))
Ypred_ridge = np.expm1(ridge.predict(full_X))

for score_fn in (error, error_mse):
    print(score_fn(full_Y, Ypred_ridge))

#### test dataset

In [None]:
# Held-out rows: same feature columns, actual sale prices as the reference.
full_test_X, test_Y = df_test[features_full_list], df_test['SalePrice'].values

# Ridge predictions, converted from the log1p scale back to prices.
test_pred = np.expm1(ridge.predict(full_test_X))

for score_fn in (error, error_mse):
    print(score_fn(test_Y, test_pred))

### ElasticNetCV

In [None]:
# Fit on the cleaned rows; score on every training row.
X, Y = df_train_cleaned[features_full_list], df_train_cleaned['SalePrice'].values
full_X, full_Y = df_train[features_full_list], df_train['SalePrice'].values

# ElasticNetCV is fitted on log1p(SalePrice); expm1 maps predictions back to prices.
elasticNet = linear_model.ElasticNetCV().fit(X, np.log1p(Y))
Ypred_elasticNet = np.expm1(elasticNet.predict(full_X))

for score_fn in (error, error_mse):
    print(score_fn(full_Y, Ypred_elasticNet))

#### test dataset

In [None]:
# Held-out rows: same feature columns, actual sale prices as the reference.
full_test_X, test_Y = df_test[features_full_list], df_test['SalePrice'].values

# Elastic-net predictions, converted from the log1p scale back to prices.
test_pred = np.expm1(elasticNet.predict(full_test_X))

for score_fn in (error, error_mse):
    print(score_fn(test_Y, test_pred))

### Xgboost

In [None]:
import xgboost as xgb

In [None]:
# Rows used for fitting (cleaned) and rows used for scoring (all training rows).
X, Y = df_train_cleaned[features_full_list], df_train_cleaned['SalePrice'].values
full_X, full_Y = df_train[features_full_list], df_train['SalePrice'].values

In [None]:
# Hold out 20% of the fitting rows with a fixed seed; the target is on the log1p scale.
X_tr, X_val, y_tr, y_val = train_test_split(X, np.log1p(Y), random_state = 42, test_size=0.20)

# Validation pair that the XGBoost fit monitors.
eval_set = [(X_val, y_val)]

In [None]:
# Up to 1000 boosting rounds of depth-2 trees at learning rate 0.1.
model_xgb = xgb.XGBRegressor(n_estimators=1000, max_depth=2, learning_rate=0.1)
# Fit on the 80% split while reporting RMSE on eval_set; early stopping allows 500 rounds without improvement.
# NOTE(review): passing eval_metric / early_stopping_rounds to fit() looks like the older xgboost
# call style — confirm it is supported by the installed version.
model_xgb.fit(X_tr, y_tr, eval_metric="rmse", early_stopping_rounds=500, eval_set=eval_set, verbose=True)
# Alternative kept for reference: fit on all cleaned rows without a validation split.
# model_xgb.fit(X, np.log1p(Y))

In [None]:
# Score on every row of df_train, converting predictions from log1p back to prices.
Ypred_xgb = np.expm1(model_xgb.predict(full_X))

for score_fn in (error, error_mse):
    print(score_fn(full_Y, Ypred_xgb))

#### test dataset

In [None]:
# Held-out rows: same feature columns, actual sale prices as the reference.
full_test_X, test_Y = df_test[features_full_list], df_test['SalePrice'].values

# XGBoost predictions, converted from the log1p scale back to prices.
test_pred = np.expm1(model_xgb.predict(full_test_X))

for score_fn in (error, error_mse):
    print(score_fn(test_Y, test_pred))

## MLP

In [None]:
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential
from keras.regularizers import l1
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint 
from keras.callbacks import LearningRateScheduler
from keras import optimizers
from sklearn.preprocessing import StandardScaler
from keras import backend as K
from keras import losses

# Log only TensorFlow messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)
# Fix the TensorFlow and NumPy seeds at 42 before the network is built.
# NOTE(review): tf.logging / tf.set_random_seed look like the TensorFlow 1.x API — confirm the pinned version.
tf.set_random_seed(42)
np.random.seed(42)

In [None]:
# Network input, taken from df_train. It selects the raw `features` list rather than
# features_full_list, so the engineered boolean/quadratic/log columns are left out.
# NOTE(review): confirm that omission is intentional.
mlp_feed = df_train[features]

In [None]:
# mlp_feed is sliced from df_train, so the target is read from df_train as well. A global Y
# left by an earlier model cell comes from df_train_cleaned and would stop lining up with
# mlp_feed row-for-row once an outlier drop is enabled.
X_tr, X_val, y_tr, y_val = train_test_split(mlp_feed, np.log1p(df_train['SalePrice'].values), random_state = 7, test_size=0.20)

In [None]:
# Three ReLU hidden layers (10, 30 and 40 units) followed by a single output unit
# with no activation.
model = Sequential([
    Dense(10, input_dim=mlp_feed.shape[1], activation='relu'),
    Dense(30, activation='relu'),
    Dense(40, activation='relu'),
    Dense(1),
])

# adam = optimizers.Adam()
# model.compile(loss = losses.mean_squared_error, optimizer = adam)

# Adam optimiser minimising mean squared error; mean absolute error is reported alongside.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[metrics.mae])

In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()

In [None]:
# Stop training once val_loss has failed to improve by at least 1e-4 for 20 epochs.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=20, verbose=1, mode='auto')
# Checkpoint weights to best_weights.hdf5, overwriting only when the monitored score improves.
# NOTE(review): no `monitor` is passed, so the callback's default metric applies — confirm it is val_loss.
checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=0, save_best_only=True)  # save best model

In [None]:
# Train for at most 150 epochs with one sample per gradient step (batch_size=1);
# the callbacks may stop the run earlier and checkpoint the best weights.
# `hist` keeps the object returned by fit().
hist = model.fit(X_tr, y_tr, 
                 batch_size=1, 
                 validation_data = (X_val, y_val), 
                 callbacks=[monitor, checkpointer], 
                 verbose=1, 
                 epochs=150)

In [None]:
model.load_weights('best_weights.hdf5')  # load weights from best model

# Training-set error. The network ends in Dense(1), so predict() yields a column of
# shape (n, 1); it is flattened so that it lines up element-wise with the (n,) price
# array instead of broadcasting to an n-by-n difference matrix inside the metrics.
Ypred_mlp = np.expm1(model.predict(mlp_feed)).ravel()
print(error(df_train['SalePrice'].values,Ypred_mlp))
print(error_mse(df_train['SalePrice'].values, Ypred_mlp))

#### test dataset

In [None]:
# Held-out rows, restricted to the same raw `features` columns the network was trained on.
full_test_X = df_test[features]
test_Y = df_test['SalePrice'].values

# predict() yields shape (n, 1) for the single output unit; flatten it so the metrics
# compare it element-wise with the (n,) test_Y rather than broadcasting to n-by-n.
test_pred = np.expm1(model.predict(full_test_X)).ravel()

print(error(test_Y, test_pred))
print(error_mse(test_Y, test_pred))

### RandomForest

In [None]:
# Fit on the cleaned rows; score on every training row.
X, Y = df_train_cleaned[features_full_list], df_train_cleaned['SalePrice'].values
full_X, full_Y = df_train[features_full_list], df_train['SalePrice'].values

# 100-tree forest fitted on log1p(SalePrice); no random_state is passed to the estimator.
randomForest = ensemble.RandomForestRegressor(n_estimators=100).fit(X, np.log1p(Y))
Ypred_randomForest = np.expm1(randomForest.predict(full_X))

for score_fn in (error, error_mse):
    print(score_fn(full_Y, Ypred_randomForest))

#### test dataset

In [None]:
# Held-out rows: same feature columns, actual sale prices as the reference.
full_test_X, test_Y = df_test[features_full_list], df_test['SalePrice'].values

# Random-forest predictions, converted from the log1p scale back to prices.
test_pred = np.expm1(randomForest.predict(full_test_X))

for score_fn in (error, error_mse):
    print(score_fn(test_Y, test_pred))

### KNeighbors

In [None]:
# Fit on the cleaned rows; score on every training row.
X, Y = df_train_cleaned[features_full_list], df_train_cleaned['SalePrice'].values
full_X, full_Y = df_train[features_full_list], df_train['SalePrice'].values

# K-nearest-neighbours regressor with default settings, fitted on log1p(SalePrice).
knn = neighbors.KNeighborsRegressor().fit(X, np.log1p(Y))
Ypred_knn = np.expm1(knn.predict(full_X))

for score_fn in (error, error_mse):
    print(score_fn(full_Y, Ypred_knn))

#### test dataset

In [None]:
# Held-out rows: same feature columns, actual sale prices as the reference.
full_test_X, test_Y = df_test[features_full_list], df_test['SalePrice'].values

# KNN predictions, converted from the log1p scale back to prices.
test_pred = np.expm1(knn.predict(full_test_X))

for score_fn in (error, error_mse):
    print(score_fn(test_Y, test_pred))

### SVR

In [None]:
# Fit on the cleaned rows; score on every training row.
X, Y = df_train_cleaned[features_full_list], df_train_cleaned['SalePrice'].values
full_X, full_Y = df_train[features_full_list], df_train['SalePrice'].values

# Support-vector regressor with default settings, fitted on log1p(SalePrice).
svr = svm.SVR().fit(X, np.log1p(Y))
Ypred_svr = np.expm1(svr.predict(full_X))

for score_fn in (error, error_mse):
    print(score_fn(full_Y, Ypred_svr))

#### test dataset

In [None]:
# Held-out rows: same feature columns, actual sale prices as the reference.
full_test_X, test_Y = df_test[features_full_list], df_test['SalePrice'].values

# SVR predictions, converted from the log1p scale back to prices.
test_pred = np.expm1(svr.predict(full_test_X))

for score_fn in (error, error_mse):
    print(score_fn(test_Y, test_pred))

## Compare results

In [None]:
# Training-row predictions of XGBoost against those of the lasso model.
predictions = pd.DataFrame({"xgb": Ypred_xgb, "lasso": Ypred_lasso})
predictions.plot.scatter(x="xgb", y="lasso", grid=True)

In [None]:
# Training-row predictions of XGBoost against those of the ridge model.
# The column name doubles as the y-axis label, so it is spelled "ridge".
predictions = pd.DataFrame({"xgb":Ypred_xgb, "ridge":Ypred_ridge})
predictions.plot(x = "xgb", y = "ridge", kind = "scatter", grid=True)

In [None]:
# Training-row predictions of XGBoost against those of the plain linear model.
predictions = pd.DataFrame({"xgb": Ypred_xgb, "linear": Ypred_linear})
predictions.plot.scatter(x="xgb", y="linear", grid=True)

In [None]:
# Training-row predictions of XGBoost against those of the random forest.
predictions = pd.DataFrame({"xgb": Ypred_xgb, "randomForest": Ypred_randomForest})
predictions.plot.scatter(x="xgb", y="randomForest", grid=True)

In [None]:
# Training-row predictions of XGBoost against those of the KNN regressor.
predictions = pd.DataFrame({"xgb": Ypred_xgb, "knn": Ypred_knn})
predictions.plot.scatter(x="xgb", y="knn", grid=True)

In [None]:
# Training-row predictions of XGBoost against those of the support-vector regressor.
predictions = pd.DataFrame({"xgb": Ypred_xgb, "svr": Ypred_svr})
predictions.plot.scatter(x="xgb", y="svr", grid=True)