# Splines + LGBMRegressor Ensemble Model

Here I will load the cleaned data (the cleaning is shown in more detail here: https://www.kaggle.com/tjcdev/baselines-with-eda-and-feature-engineering/edit/run/54393091) before creating an ensemble model and submitting a prediction.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

from keras.models import Sequential
from keras.layers import Dense

from lightgbm import LGBMRegressor
!pip install sklearn-contrib-py-earth
from pyearth import Earth

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
        
# Hide warnings
import warnings
warnings.filterwarnings('ignore')
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the Data

**Note:** I preprocessed the data in the notebook linked at the top of this notebook. 

In [None]:
# Load the preprocessed train and test sets produced by the linked notebook.
train_df = pd.read_csv('../input/preprocessed-tabular-playground-series-february/preprocessed_train.csv')
test_df = pd.read_csv('../input/preprocessed-tabular-playground-series-february/preprocessed_test.csv')

# Quick look at the training data: first rows, summary statistics, column names.
display(train_df.head())
# A bare expression is only rendered when it is the last statement of a cell,
# so the summary has to be passed to display() explicitly to be shown.
display(train_df.describe())
print(train_df.columns)

In [None]:
# Separate the regression target from the feature columns.
y = train_df['target']
X = train_df.drop(columns='target')

In [None]:
# Names of the feature columns whose name contains 'ordinal_'.
cat_features = [column for column in X.columns if 'ordinal_' in column]

# Hyperparameter Tuning

### LGBM Regressor

In [None]:
# Search space for the LightGBM regressor; RandomizedSearchCV draws
# combinations from these lists.
# Fix: the learning-rate list previously read `0,1` (a comma typo), which put
# the values 0 and 1 into the search space instead of 0.1.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
# confirm whether bagging was meant to be enabled for this search.
PARAMS = {
    'learning_rate': [0.01, 0.1, 0.4],
    'max_depth': [5, 10, 20],
    'num_leaves': [10, 15, 25],
    'feature_fraction': [0.5, 0.8, 0.9],
    'subsample': [0.1, 0.2, 0.4],
}

# Base estimator: settings that stay fixed for every sampled combination.
lgbmr = LGBMRegressor(random_state=42,
                      objective='regression',
                      metric='mean_absolute_error',
                      boosting='gbdt',
                      num_boost_round=300)

# Randomised search scored by (negated) mean absolute error with 2-fold CV.
gs = RandomizedSearchCV(
    estimator=lgbmr,
    param_distributions=PARAMS,
    n_iter=100,
    cv=2,
    scoring='neg_mean_absolute_error',
    verbose=10,
    random_state=42)

# The search itself is left disabled; the result of a previous run is
# recorded in the markdown below.
#gs.fit(X, y)
#print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

**From Previous Run**

Best score reached: -0.7168645242498474 with params: {'subsample': 0.2, 'num_leaves': 10, 'max_depth': 5, 'learning_rate': 0.4, 'feature_fraction': 0.5} 

### Splines

In [None]:
# Candidate settings for the Earth spline model.
PARAMS = dict(
    max_terms=[400, 200, 100],
    max_degree=[1, 2],
    penalty=[2.5, 3.0, 3.5],
)

# Base estimator shared by every sampled combination.
earth = Earth(allow_missing=True, use_fast=True)

# Randomised search scored by (negated) mean absolute error with 2-fold CV.
# NOTE(review): the lists above allow only 3 * 2 * 3 = 18 combinations, fewer
# than n_iter=100; confirm the intended number of iterations.
gs = RandomizedSearchCV(
    earth,
    PARAMS,
    n_iter=100,
    cv=2,
    scoring='neg_mean_absolute_error',
    verbose=10,
    random_state=42,
)

# The search itself is left disabled; the result of a previous run is
# recorded in the markdown below.
#gs.fit(X, y)
#print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

**From Previous Run**

Best score reached: -0.7217770759004736 with params: {'penalty': 2.5, 'max_terms': 400, 'max_degree': 2} 

# Ensemble Model

### Averaging Results

In [None]:
# Baseline ensemble: 2-fold cross-validation of the plain average of the
# LightGBM and Earth predictions.

# Hyperparameters reported by the earlier randomised searches.
lgbm_params = dict(random_state=42,
                   objective='regression',
                   metric='mean_absolute_error',
                   subsample=0.2,
                   num_leaves=10,
                   max_depth=5,
                   learning_rate=0.4,
                   feature_fraction=0.5)
earth_params = dict(allow_missing=True,
                    penalty=2.5,
                    max_terms=400,
                    max_degree=2)

kf = KFold(n_splits=2)

scores = []
for train_index, test_index in kf.split(train_df):
    # Features and target for the training and validation parts of this fold.
    train_X, test_X = X.iloc[train_index, :], X.iloc[test_index, :]
    train_target, test_target = y.iloc[train_index], y.iloc[test_index]

    # LGBM Regressor. The validation fold is passed as eval_set, but no early
    # stopping is configured in this call.
    lgbmr = LGBMRegressor(**lgbm_params)
    lgbmr.fit(train_X,
              train_target,
              eval_set=[(test_X, test_target)],
              verbose=False)
    lgbmr_preds = lgbmr.predict(test_X)

    # Earth Splines
    earth = Earth(**earth_params)
    earth.fit(train_X, train_target)
    earth_preds = earth.predict(test_X)

    # Equal-weight blend of the two models.
    ensemble_preds = 0.5 * (lgbmr_preds + earth_preds)

    score = mean_squared_error(test_target, ensemble_preds)
    print('mean_squared_error: ', score)
    scores.append(score)

print('The average mean_squared_error: ', np.mean(scores))
    


### Stacking with Linear Regression

In [None]:
# Stacked ensemble: a linear regression (the "meta-model") learns how to weight
# the LightGBM and Earth predictions.
#
# Fix: the meta-model used to be fitted on the validation fold's predictions
# and targets and then scored on exactly the same rows, so the reported error
# was an in-sample (optimistic) figure. It is now scored on out-of-fold
# predictions from an inner K-fold over the validation rows, so every row is
# predicted by a meta-model that never saw its target.
kf = KFold(n_splits=2)
# Inner splitter used only to fit and score the meta-model.
meta_kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = []
for train_index, test_index in kf.split(train_df):
    train_X = X.iloc[train_index, :]
    test_X = X.iloc[test_index, :]

    train_target = y.iloc[train_index]
    test_target = y.iloc[test_index]

    # LGBM Regressor. The validation fold is passed as eval_set, but no early
    # stopping is configured in this call.
    lgbmr = LGBMRegressor(random_state=42,
                          objective='regression',
                          metric='mean_absolute_error',
                          subsample=0.2,
                          num_leaves=10,
                          max_depth=5,
                          learning_rate=0.4,
                          feature_fraction=0.5)
    lgbmr.fit(train_X,
              train_target,
              eval_set=[(test_X, test_target)],
              verbose=False)
    lgbmr_preds = lgbmr.predict(test_X)

    # Earth Splines
    earth = Earth(allow_missing=True,
                  penalty=2.5,
                  max_terms=400,
                  max_degree=2)
    earth.fit(train_X, train_target)
    earth_preds = earth.predict(test_X)

    # One row per validation sample, one column per base model.
    combined_preds = np.vstack((lgbmr_preds, earth_preds)).T

    # Out-of-fold meta predictions: fit the blender on part of the validation
    # rows and predict the remaining rows, until every row has a prediction.
    ensemble_preds = np.empty(len(test_target))
    for fit_index, eval_index in meta_kf.split(combined_preds):
        reg = LinearRegression().fit(combined_preds[fit_index],
                                     test_target.iloc[fit_index])
        ensemble_preds[eval_index] = reg.predict(combined_preds[eval_index])

    score = mean_squared_error(test_target, ensemble_preds)
    print('mean_squared_error: ', score)
    scores.append(score)

print('The average mean_squared_error: ', np.mean(scores))
    


# Submit Predictions

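From above we can actually see that the best performing ensemble model is a simple average of the two models. **Note:** the cells below nevertheless build the submission with the linear-regression blend of the two models (`reg`), fitted on a hold-out split, rather than with the plain average.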

In [None]:
# Fit the final base models on 80% of the training data and fit the
# linear-regression blender (`reg`) on their predictions for the remaining
# 20% hold-out.
train_X, test_X, train_target, test_target = train_test_split(X, y, test_size=0.2, random_state=42)


# LGBM Regressor. The hold-out is passed as eval_set, but no early stopping is
# configured in this call.
lgbmr = LGBMRegressor(random_state=42,
                      objective='regression',
                      metric='mean_absolute_error',
                      subsample=0.2,
                      num_leaves=10,
                      max_depth=5,
                      learning_rate=0.4,
                      feature_fraction=0.5)
lgbmr.fit(train_X,
          train_target,
          eval_set=[(test_X, test_target)],
          verbose=False)

lgbmr_preds = lgbmr.predict(test_X)

# Earth Splines
earth = Earth(allow_missing=True,
              penalty=2.5,
              max_terms=400,
              max_degree=2)
earth.fit(train_X, train_target)
earth_preds = earth.predict(test_X)

# One row per hold-out sample, one column per base model.
combined_preds = np.vstack((lgbmr_preds, earth_preds)).T

# Fix: the blender used to be scored on the same rows it was fitted on, which
# reports an in-sample (optimistic) error. Score it on out-of-fold predictions
# instead: each hold-out row is predicted by a blender fitted on other rows.
ensemble_preds = np.empty(len(test_target))
for fit_index, eval_index in KFold(n_splits=5, shuffle=True, random_state=42).split(combined_preds):
    fold_reg = LinearRegression().fit(combined_preds[fit_index],
                                      test_target.iloc[fit_index])
    ensemble_preds[eval_index] = fold_reg.predict(combined_preds[eval_index])

score = mean_squared_error(test_target, ensemble_preds)
print('mean_squared_error: ', score)

# Final blender, fitted on the whole hold-out.
reg = LinearRegression().fit(combined_preds, test_target)

In [None]:
# Predict the competition test set with both fitted base models, blend the two
# predictions with the fitted meta-model and write the submission file.
# NOTE(review): assumes 'id' is the only test column that is not a training
# feature, and that the column order matches the training data; confirm.
test_X = test_df.drop(columns='id')

lgbmr_preds = lgbmr.predict(test_X)
earth_preds = earth.predict(test_X)

# One row per test sample, one column per base model.
stacked_rows = np.vstack([lgbmr_preds, earth_preds])
combined_preds = stacked_rows.T
ensemble_preds = reg.predict(combined_preds)

# Any filename would work; 'submission.csv' is the one chosen here.
my_submission = pd.DataFrame({'id': test_df['id'], 'target': ensemble_preds})
my_submission.to_csv('submission.csv', index=False)