# Algoritm Selection and Tranning Model

## Imports

In [1]:
# data analysis and data wrangling
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xgb

# metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

# Other
from IPython.display import Image
import configparser
import subprocess
import warnings
import pprint
import time
import os

In [2]:
warnings.filterwarnings('ignore')

## Prepare Principal Directory

### Load dataset

In [4]:
%%time

# load data in dataset
df_train = pd.read_csv("train-clean.csv", 
                       encoding='utf-8')
df_test = pd.read_csv("test-clean.csv", 
                      encoding='utf-8')

Wall time: 3.15 s


In [5]:
df_train.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cat111,cat112,cat113,cat114,cat115,cat116,cont2,cont7,cont11,loss
0,0,1,0,1,0,0,0,0,1,0,...,2,19,55,0,14,269,0.245921,0.33506,0.569745,2213.18
1,0,1,0,0,0,0,0,0,1,1,...,0,22,38,0,14,85,0.737068,0.436585,0.338312,1283.6
2,0,1,0,0,1,0,0,0,1,1,...,0,28,5,0,8,153,0.358319,0.315545,0.381398,3005.09
3,1,1,0,1,0,0,0,0,1,0,...,2,39,4,0,14,79,0.555782,0.391128,0.327915,939.85
4,0,1,0,1,0,0,0,0,1,1,...,2,50,38,0,10,55,0.15999,0.247408,0.204687,2763.85


### Global Variables

In [6]:
# Lists that will be manipulated in the data processing
list_columns = []
list_categorical_col = []
list_numerical_col = []

In [7]:
def get_col(df: 'dataframe', type_descr: 'numpy') -> list:
    """
    Function get list columns 
    
    Args:
    type_descr
        np.number, np.object -> return list with all columns
        np.number            -> return list numerical columns 
        np.object            -> return list object columns
    """
    try:
        col = (df.describe(include=type_descr).columns)  # pandas.core.indexes.base.Index  
    except ValueError:
        print(f'Dataframe not contains {type_descr} columns !', end='\n')    
    else:
        return col.tolist() 

In [8]:
list_numerical_col = get_col(df=df_train,
                             type_descr=np.number)
list_categorical_col = get_col(df=df_train,
                               type_descr=np.object)
list_columns = get_col(df=df_train,
                       type_descr=[np.object, np.number])

Dataframe not contains <class 'object'> columns !


---

## Prepare Submission File
Use function to submission. In this way it is guaranteed a default.

In [9]:
# First, check how is file sample
sample = pd.read_csv('data/sample_submission.csv')
sample.head(10)

Unnamed: 0,id,loss
0,4,0
1,6,0
2,9,0
3,12,0
4,15,0
5,17,0
6,21,0
7,28,0
8,32,0
9,43,0


In [10]:
test_ids = df_test['id']

submissions_folder = 'data/'

In [11]:
def save_predictions(ids = None, predictions = None, file = None):
    
    # prepare file
    submission = pd.DataFrame({'id': ids, 'loss': predictions})
    
    # CSV
    submission.to_csv(path_or_buf = file, index = False, encoding='utf8')
    print("Data storage!")

---

## Split train and test
- The variable Shift is applied to the log transformation.

In [12]:
list_columns.remove('loss')

In [13]:
df_train[list_columns].head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont2,cont7,cont11
0,0,1,0,1,0,0,0,0,1,0,...,28,2,19,55,0,14,269,0.245921,0.33506,0.569745
1,0,1,0,0,0,0,0,0,1,1,...,65,0,22,38,0,14,85,0.737068,0.436585,0.338312
2,0,1,0,0,1,0,0,0,1,1,...,85,0,28,5,0,8,153,0.358319,0.315545,0.381398
3,1,1,0,1,0,0,0,0,1,0,...,67,2,39,4,0,14,79,0.555782,0.391128,0.327915
4,0,1,0,1,0,0,0,0,1,1,...,50,2,50,38,0,10,55,0.15999,0.247408,0.204687


In [14]:
# split into training and test sets
shift = 200

# create target label
X_train = df_train[list_columns]
y_train = np.log(df_train['loss'] + shift)

In [15]:
X_test = df_test[list_columns]

In [16]:
X_train.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont2,cont7,cont11
0,0,1,0,1,0,0,0,0,1,0,...,28,2,19,55,0,14,269,0.245921,0.33506,0.569745
1,0,1,0,0,0,0,0,0,1,1,...,65,0,22,38,0,14,85,0.737068,0.436585,0.338312
2,0,1,0,0,1,0,0,0,1,1,...,85,0,28,5,0,8,153,0.358319,0.315545,0.381398
3,1,1,0,1,0,0,0,0,1,0,...,67,2,39,4,0,14,79,0.555782,0.391128,0.327915
4,0,1,0,1,0,0,0,0,1,1,...,50,2,50,38,0,10,55,0.15999,0.247408,0.204687


In [17]:
X_test.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont2,cont7,cont11
0,0,1,0,0,0,0,0,0,1,0,...,26,0,35,23,0,16,169,0.299102,0.317681,0.377724
1,0,1,0,1,0,0,0,0,1,0,...,58,3,32,58,0,11,173,0.620805,0.44376,0.689039
2,0,1,0,1,1,0,1,0,1,1,...,62,2,46,4,0,10,51,0.737068,0.325779,0.24541
3,0,0,0,0,1,0,0,0,0,0,...,61,0,25,9,0,15,76,0.681761,0.342355,0.348867
4,1,0,0,0,0,1,0,0,0,0,...,100,0,30,45,2,9,163,0.299102,0.391833,0.359572


In [18]:
y_train.head()

0    7.788701
1    7.302227
2    8.072495
3    7.038652
4    7.994244
Name: loss, dtype: float64

In [19]:
display(y_train.head())

0    7.788701
1    7.302227
2    8.072495
3    7.038652
4    7.994244
Name: loss, dtype: float64

In [20]:
# check distribuition
print ("Xtrain shape:", X_train.shape[0])
print ("ytrain shape:", X_train.shape[1])
print ("Xtest shape:", X_test.shape[0])
print ("ytest shape:", X_test.shape[1])

Xtrain shape: 188318
ytrain shape: 117
Xtest shape: 125546
ytest shape: 117


## Mean absolute error (MAE) 
- The models in this project use the mean absolute error (MAE) between the predicted loss and the actual loss for each claim in the test set.
- The goal was to minimize the MAE in our model’s predictions. 

In [21]:
# Custom eval metric
def eval_error(preds, dtrain):
    """evaluation"""
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))

---

## K-Folds Cross Validation
KFold divides all the samples in  groups of samples, called folds, of equal sizes (if possible). The prediction function is learned using  folds, and the fold left out is used for test.

In [22]:
# replicate the results
random_state = 16

# folds
k = 5

---

## Function Tranning
- The traning and test it`s in function because garanted reuse.
- The predictions running in validation set in each fold.
- Garanted with array is the inverse of the log transformation about column loss.
- Calculate time
- Calculate MAE

In [23]:
def train_model(model, num_folds):
    """Function by Train model"""
        
    print("Begin training")
    start = time.time()
    
    # declare a KFold instance
    kfold = KFold(n_splits = num_folds, random_state = 10)
    
    # number of models
    num_models = 1
    
    # array to store results after each fold
    results = np.zeros((X_test.shape[0], k))
        
    # train K-1 Random Forests
    for i, (train, val) in enumerate(kfold.split(X_train)):
        # get smaller training set and create validation set
        X_train_mini, X_val = X_train.iloc[train], X_train.iloc[val]
        y_train_mini, y_val = y_train[train], y_train[val]

        # train model
        model.fit(X_train_mini, y_train_mini)

        # make predictions 
        preds = model.predict(X_val)
        
        # absolute error
        error = mean_absolute_error(np.exp(y_val) - shift, np.exp(preds) - shift)
        print("MAE on fold {} is {}".format(i, error))

        
        # Predict on test set
        test_predictions = np.exp(model.predict(X_test)) - shift
        
        # Sum predictions
        results[:,i] = test_predictions
        

    end = time.time()
    print("\nTraining done! Time Elapsed:", end - start, " seconds.")

    # Error over k folds
    avg_error = np.mean(results)

    return test_predictions

---

## Benchmarks

We will test and execute the models:
- Linear Regression
- Random Forest (Bagging)
- XGBoost

### Linear Regression

In [24]:
# Visualize params
LinearRegression()

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
# Linear Regression
model_lr = LinearRegression(n_jobs=-1, normalize=False)

# training
result_lr = train_model(model = model_lr, num_folds = 5)

Begin training
MAE on fold 0 is 1281.4420446956474
MAE on fold 1 is 1275.6411263322877
MAE on fold 2 is 1294.5456052336367
MAE on fold 3 is 1294.5150507310548
MAE on fold 4 is 1274.0273135089753

Training done! Time Elapsed: 6.597168207168579  seconds.


In [26]:
# Linear Regression normalized
model_lr_normalized = LinearRegression(n_jobs=-1, normalize=True)

# training
result_lr_normalized = train_model(model = model_lr_normalized, num_folds = 5)

Begin training
MAE on fold 0 is 1281.442044695646
MAE on fold 1 is 1275.6411263322886
MAE on fold 2 is 1294.545605233637
MAE on fold 3 is 1294.515050731053
MAE on fold 4 is 1274.0273135089753

Training done! Time Elapsed: 6.364207744598389  seconds.


#### Analysis of Results
- Without difference data normalized and not normalized
- The best result MAE on fold 4: 1267.692474560776

#### Submission

In [27]:
save_predictions(ids = test_ids, 
                 predictions = result_lr_normalized, 
                 file = submissions_folder + 'lin_regression_submission.csv')

Data storage!


#### View file

In [28]:
sub = pd.read_csv(submissions_folder + 'lin_regression_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1389.550974
1,6,1748.389852
2,9,12446.009133
3,12,4313.445745
4,15,737.833176


### Random Forest

#### Process trainning
- The number of estimators go is tested.
- Update the number of estimators to 20, 50 and 100 to see how the model performs. 

In [29]:
# Visualize params
RandomForestRegressor()

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators='warn',
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [30]:
# 1st model Random Forest
rf_regressor_one = RandomForestRegressor(n_jobs = -1)

# training
result_rf_1 = train_model(model = rf_regressor_one, num_folds = 3)

Begin training
MAE on fold 0 is 1267.2518037075408
MAE on fold 1 is 1276.3386967822255
MAE on fold 2 is 1270.182472682407

Training done! Time Elapsed: 54.89362454414368  seconds.


In [31]:
# 2st model Random Forest
rf_regressor_two = RandomForestRegressor(n_estimators = 50, 
                                         n_jobs = -1,
                                         max_depth = 30)

# training
result_rf_2 = train_model(model = rf_regressor_two, num_folds = 3)

Begin training
MAE on fold 0 is 1221.9018090436953
MAE on fold 1 is 1227.886788877361
MAE on fold 2 is 1218.6630924167218

Training done! Time Elapsed: 239.89357542991638  seconds.


In [32]:
# 3st model Random Forest
rf_regressor_three = RandomForestRegressor(n_estimators = len(df_train.columns), 
                                          n_jobs = -1,
                                          verbose = 1, 
                                          max_depth = len(df_train.columns))

# training
result_rf_3 = train_model(model = rf_regressor_three, num_folds = 5)

Begin training


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 118 out of 118 | elapsed:  3.8min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    1.1s finished


MAE on fold 0 is 1215.0686034761086


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 118 out of 118 | elapsed:  4.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    1.1s finished


MAE on fold 1 is 1211.7435547566727


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 118 out of 118 | elapsed:  3.8min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    1.1s finished


MAE on fold 2 is 1221.3481337226726


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 118 out of 118 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    1.1s finished


MAE on fold 3 is 1222.0775866335407


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 118 out of 118 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    1.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


MAE on fold 4 is 1203.9065668153037


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s



Training done! Time Elapsed: 1192.394330739975  seconds.


[Parallel(n_jobs=4)]: Done 118 out of 118 | elapsed:    4.1s finished


#### Analysis of Results
- The increasing the number of estimators improved score.
- The best results produced with n_estimators = 100 on fold 1.
- The best result MAE on fold 1 is 1211.7435547566727
- The problem is time process which grows in the same proportion. 
- The model random forest were worse than linear regression

#### Submission

In [33]:
save_predictions(ids = test_ids, 
                 predictions = result_rf_3, 
                 file = submissions_folder + 'random_forest_submission.csv')

Data storage!


#### View submission

In [34]:
sub = pd.read_csv(submissions_folder + 'random_forest_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1895.00752
1,6,1781.783073
2,9,8384.960786
3,12,5303.408308
4,15,698.556212


### XGBoost
- Model very robust
- descentent gradient 
- regularization parameter: help avoid overfitting
- parallelizable

The model XGBRegressor will be trained with threee model using difference parameters.

#### Otimize XGBoost

DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed.

In [35]:
# Data Matrix used in XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

#### Function  train_test_xgboost
- The function process data to:
 - calculate time process
 - shuffle the data during each fold
 - run predictions
 - store these predictions in a numpy array
 - average the predictions over k number of folds.

In [36]:
def train_test_xgboost(model, early_stopping_rounds):
    kf = KFold(n_splits = k, shuffle = True, random_state = random_state)
    results = np.zeros((X_test.shape[0], k))
    
    print("Begin training")
    start = time.time()
    
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        print("Begin training and testing base model on fold {}".format(i))
        start = time.time()
        
        X_train_mini, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_mini, y_val = y_train[train_index], y_train[val_index]

        # train model
        model.fit(X_train_mini, 
                   y_train_mini, 
                   eval_metric = eval_error, 
                   eval_set = [(X_train_mini, y_train_mini), (X_val, y_val)], 
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)

        end = time.time()
        print("Training time elapsed on fold {} is {}".format(i, end - start))
        
        # Predict on validation set 
        val_predictions = model.predict(X_val, ntree_limit = model.best_ntree_limit)
        error = mean_absolute_error(np.exp(y_val) - shift, np.exp(val_predictions) - shift)
        print("Error on fold {} is {} \n".format(i, error))
                
        # Predict on test set
        test_predictions = np.exp(model.predict(X_test, ntree_limit = model.best_ntree_limit)) - shift
        # Sum predictions
        results[:,i] = test_predictions
        
        end = time.time()
        print("\nTraining done! Time Elapsed:", end - start, " seconds.")

    # Average predictions
    mean_results = results.mean(axis = 1)
    return mean_results

In [37]:
# Visualize params
XGBRegressor()

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

#### Chooce parameters
It´s possible reduce error and overfitting with analysis of parameters. Here is a list of the key parameters below:

- max_depth - Max tree depth for boosted trees
- gamma - Minimum loss reduction required to make a further partition on a leaf node of the tree.
- min_child_weight - Minimum sum of instance weight(hessian) needed in a child.

**NOTE**: keep someone parameters default, how by example, learning_rate=0.1.<br/>
**NOTE about n_estimators**: when tested there aren´t effect

In [38]:
# Model 1 XGB_regressor
xgb_one = XGBRegressor(nthread = -1)

# training
results_xgb_1 = train_test_xgboost(model = xgb_one,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 59.42958927154541
Error on fold 0 is 1205.526984271663 


Training done! Time Elapsed: 60.773072242736816  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 55.13726449012756
Error on fold 1 is 1211.4545332831656 


Training done! Time Elapsed: 56.24442672729492  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 54.75081276893616
Error on fold 2 is 1214.842840666143 


Training done! Time Elapsed: 55.876768827438354  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 55.222468852996826
Error on fold 3 is 1228.4521383554118 


Training done! Time Elapsed: 56.426044940948486  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 57.63573145866394
Error on fold 4 is 1219.1052827490441 


Training done! Time Elapsed: 58.7653987407684

In [39]:
# Model 2 XGB_regressor
xgb_two = XGBRegressor(learning_rate=0.1,
                       n_estimators = 1000,
                       max_depth = 5,
                       min_child_weight = len(df_train.columns),
                       gamma = 1,
                       subsample = 1.0,
                       colsample_bytree = 1.0,
                       reg_alpha = 1.0,
                       silent = True, 
                       seed = random_state, 
                       nthread = -1)
# training
results_xgb_2 = train_test_xgboost(model = xgb_two,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 281.79953169822693
Error on fold 0 is 1147.2809412993995 


Training done! Time Elapsed: 283.86189556121826  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 258.6547632217407
Error on fold 1 is 1150.0978823402681 


Training done! Time Elapsed: 260.6194396018982  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 284.5122694969177
Error on fold 2 is 1149.3315641811243 


Training done! Time Elapsed: 286.6539566516876  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 232.017915725708
Error on fold 3 is 1161.9972366624143 


Training done! Time Elapsed: 233.91067671775818  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 236.13847064971924
Error on fold 4 is 1157.7847057399877 


Training done! Time Elapsed: 238.03281569480

In [40]:
# Model 3 XGB_regressor
xgb_three = XGBRegressor(learning_rate=0.1,
                        n_estimators = 1000,
                        max_depth = 9,
                        min_child_weight = 6,
                        gamma = 1,
                        subsample = 1.0,
                        colsample_bytree = 0.5,
                        reg_alpha = 1.0,
                        silent = True, 
                        seed = random_state, 
                        nthread = -1)

# training
results_xgb_3 = train_test_xgboost(model = xgb_three,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 215.12440252304077
Error on fold 0 is 1140.7930736046517 


Training done! Time Elapsed: 217.5553958415985  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 280.02099227905273
Error on fold 1 is 1142.6189198301279 


Training done! Time Elapsed: 283.00370383262634  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 292.0481905937195
Error on fold 2 is 1144.213848548053 


Training done! Time Elapsed: 295.2645125389099  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 270.60906958580017
Error on fold 3 is 1150.918809123673 


Training done! Time Elapsed: 273.5893156528473  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 236.3326814174652
Error on fold 4 is 1150.0613693268986 


Training done! Time Elapsed: 238.986648082733

#### Submission

In [41]:
save_predictions(ids = test_ids, 
                 predictions = results_xgb_1, 
                 file = submissions_folder + 'xgb_submission.csv')

Data storage!


#### View submission

In [42]:
sub = pd.read_csv(submissions_folder + 'xgb_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1750.100195
1,6,1831.237549
2,9,8265.201563
3,12,4871.358496
4,15,960.504199


#### Analysis of Results
- The XGBoost have better performace in comparain Random Forest and Linear Regression
- The best result MAE on fold 0 is 1140.7930736046517 in three model