# Load Data

In [27]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
df = pd.read_csv("train.csv")

X = df.iloc[:, 0:len(df.columns)-1]
X = pd.get_dummies(X)

Y = df.loc[:, 'target']

### Boxcox transform the Target Column

In [16]:
Y_boxcox, lmda = boxcox(Y)

Use inv_boxcox to transform predicitions back into the native scale

In [33]:
inv_boxcox(Y_boxcox, lmda)

array([21.639999, 19.919998, 21.150001, ..., 23.36    , 20.1407  ,
       21.3     ])

### Make Train/Test Split

In [17]:
train_X, val_X, train_Y, val_Y = train_test_split(X, Y, test_size = 0.20, random_state = 42)
train_X_bc, val_X_bc, train_Y_bc, val_Y_bc = train_test_split(X, Y, test_size = 0.20, random_state = 42)

### Get baseline predictions for each transformation of Target

Essentially, guess the average of the column

In [35]:
baseline_preds = val_Y.mean()
baseline_errors = abs(val_Y - baseline_preds)
print('Average baseline MAE: ', round(np.mean(baseline_errors), 2))

baseline_preds_bc = val_Y_bc.mean()
baseline_errors_bc = abs(val_Y_bc - baseline_preds_bc)
print('BoxCox Average baseline MAE: ', round(np.mean(baseline_errors_bc), 2))


Average baseline MAE:  0.93
BoxCox Average baseline MAE:  0.93


# Lasso

In [97]:
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

In [66]:
tuned_parameters = [{'alpha': [0.4, 0.5, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.7, 0.8, 0.9, 1]}]

Find the best gamma for both normal and boxcox transformed 

In [68]:
clf = GridSearchCV(Lasso(), tuned_parameters, cv=5,
                       scoring='neg_mean_absolute_error')
clf.fit(train_X, train_Y)
clf.best_params_

{'alpha': 0.62}

In [67]:
clf_bc =  GridSearchCV(Lasso(), tuned_parameters, cv=5,
                       scoring='neg_mean_absolute_error')
clf.fit(train_X_bc, train_Y_bc)
clf.best_params_

{'alpha': 0.62}

In [88]:
lasso_fit = Lasso(0.62, random_state=450, tol=0.0000000001)
lasso_fit = lasso_fit.fit(train_X, train_Y)

predictions = lasso_fit.predict(val_X)

# Check GOF, Calculate MAE
errors = abs(val_Y - predictions)

print('Mean Absolute Error:', round(np.mean(errors), 2))

coef = lasso_fit.coef_

lasso_features = train_X.columns[coef != 0]

Mean Absolute Error: 0.93


In [89]:
lasso_features

Index(['num1', 'num30', 'num37', 'num59'], dtype='object')

In [79]:
lasso_fit_bc = lasso_fit.fit(train_X_bc, train_Y_bc)

predictions = lasso_fit_bc.predict(val_X_bc)

# Check GOF, Calculate MAE
errors = abs(val_Y_bc - predictions)
errors_trans = abs(val_Y - inv_boxcox(predictions, lmda))

print('MAE BoxCox:', round(np.mean(errors), 2))
print('MAE BoxCox, Transformed:', round(np.mean(errors_trans), 2))


# Check out non-zero coefficients 
coef = lasso_fit_bc.coef_

#train_X.columns[coef != 0]

MAE BoxCox: 0.93
MAE BoxCox, Transformed: 11.78


# XgBoost

In [93]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score

Use a hyperparameter grid search: https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

In [114]:
params = {
        'min_child_weight': [1, 2, 4],
        'gamma': [0, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1],
        'colsample_bytree': [0.5, 0.7, 0.9],
        'max_depth': [3, 5, 6, 8]
        }

In [111]:
xgb = XGBRegressor(learning_rate=0.2, n_estimators=600, objective='reg:linear', eval_metric='mae',
                    silent=True)

In [117]:
folds = 5
param_comb = 50

kf = KFold(n_splits=folds,shuffle=True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb,
                                   scoring='neg_mean_absolute_error', n_jobs=4,
                                   cv=kf.split(train_X,train_Y), verbose=3, random_state=1001 )

random_search.fit(train_X, train_Y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 15.5min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 31.2min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x000000957E6AB570>,
          error_score='raise',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='mae', gamma=0, learning_rate=0.2,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=600, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=50, n_jobs=4,
          param_distributions={'min_child_weight': [1, 2, 4], 'gamma': [0, 0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1], 'colsample_bytree': [0.5, 0.7, 0.9], 'max_depth': [3, 5, 6, 8]},
          pre_dispatch='2*n_jobs', random_state=1001, refit=True,
          return_train_score='warn', scoring='neg_mean_absolute_error',
          verbose=3)

In [118]:
random_search.best_params_

{'colsample_bytree': 0.7,
 'gamma': 5,
 'max_depth': 3,
 'min_child_weight': 2,
 'subsample': 1}

In [119]:
predictions = random_search.predict(val_X)

# Check GOF, Calculate MAE
errors = abs(val_Y - predictions)
errors_trans = abs(val_Y - inv_boxcox(predictions, lmda))

print('MAE BoxCox:', round(np.mean(errors), 2))
print('MAE BoxCox, Transformed:', round(np.mean(errors_trans), 2))


# Check out non-zero coefficients 
#coef = lasso_fit_bc.coef_

#train_X.columns[coef != 0]

MAE BoxCox: 0.98
MAE BoxCox, Transformed: 11.78


Use the best params to fit some models

In [123]:
xgb = XGBRegressor(learning_rate=0.1, n_estimators=1000, objective='reg:linear', eval_metric='mae',
                    silent=True, **random_search.best_params_ )

In [126]:
xgb.fit(train_X, train_Y)
predictions = xgb.predict(val_X)
errors = abs(val_Y - predictions)
print("MAE: ", round(np.mean(errors), 2))

MAE:  0.97


BoxCox

In [127]:
xgb.fit(train_X_bc, train_Y_bc)
predictions = xgb.predict(val_X_bc)
errors = abs(val_Y_bc - predictions)
errors_trans = abs(val_Y - inv_boxcox(predictions, lmda))
print("MAE Boxcox: ", round(np.mean(errors), 2))
print("MAE un-Boxcox: ", round(np.mean(errors_trans), 2))

MAE Boxcox:  0.97
MAE un-Boxcox:  11.78


# Simple Neural Network

### First, We must standardize the input to be between 1 and 0

Also, let's start off with only the features identified to be useful to lasso!

In [152]:
def range_std(column):
    # Helper function, range standardizes a list of values
    return (column - column.min())/(column.max() - column.min())

def range_std_reversible(column):
    # Helper function, range standardizes a list of values
    # Returns min and max for later reversal
    return (column - column.min())/(column.max() - column.min()), column.min(), column.max()

def range_std_reverse(column, min_val, max_value):
    return (column*(max_value - min_val) + min_val)

In [137]:
features_to_standardize = [*lasso_features, 'target']
df_thin = df.loc[:, features_to_standardize]

In [139]:
for column_name in features_to_standardize:
    df_thin[column_name] = range_std(df_thin[column_name])
    
df_thin.describe()

In [142]:
# Use these as our training set
trainY_std = df_thin.loc[:, 'target']
trainX_std = df_thin.drop(columns='target')

In [153]:
# Boxcox transform
trainY_std_bc, min_value, max_value = range_std_reversible(trainY_boxcox)

In [162]:
# This is how we will return the range standardized, boxcox transformed values to their original scale
inv_boxcox(range_std_reverse(trainY_std_bc, min_value, max_value), lmda)

array([21.639999, 19.919998, 21.150001, ..., 23.36    , 20.1407  ,
       21.3     ])

## Simple Nueral

In [163]:
lasso_features

Index(['num1', 'num30', 'num37', 'num59'], dtype='object')

In [166]:
def simplemodel():
    model = Sequential()
    model.add(Dense(8, input_shape= (4,), kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(4)))
    model.add(Dropout(0.1))
    model.add(Dense(1, kernel_initializer='normal'))
    sgd = SGD(lr=0.09, momentum=0.8, decay=0.00)
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

In [167]:
# evaluate model with standardized dataset
seed = 12345
np.random.seed(seed)
estimators = []
estimators.append(('mlp', KerasRegressor(build_fn=simplemodel, epochs=100, verbose=True)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(pipeline, np.array(trainX_std), np.array(trainY_std), cv=kfold)
print("Standardized: %.2f (%.2f) MAE" % (results.mean(), results.std()))

NameError: name 'KerasRegressor' is not defined