# Tabular Playground - Scikit-Learn Ensemble

The competition description says this is a linear regression problem, so let's see what scikit-learn can do
- https://scikit-learn.org/stable/modules/linear_model.html#linear-model

In [None]:
import pprint
from operator import itemgetter

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection
from sklearn.linear_model import *
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

pp = pprint.PrettyPrinter(width=41, compact=True)

In [None]:
# Load the competition tables; both are indexed by their `id` column.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-jan-2021/train.csv',
        '../input/tabular-playground-series-jan-2021/test.csv',
    )
)

# Show the training table as the cell output.
# `display(test_df)` previews the test table the same way.
display(train_df)

# Preprocessing

Adding PolynomialFeatures actually makes the score worse (the score is an RMSE, so lower is better), so this is indeed a pure linear regression problem
- `degree=2` = `0.76347`
- `degree=1` = `0.72935`

In [None]:
# DOCS: https://scikit-learn.org/stable/modules/preprocessing.html
# DOCS: https://scikit-learn.org/stable/modules/linear_model.html#polynomial-regression-extending-linear-models-with-basis-functions
def fit_preprocessor(X, degree=1):
    """Fit the polynomial expansion and the scaler on ``X``.

    Args:
        X: feature table the transformers are fitted on.
        degree: polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        The fitted ``(PolynomialFeatures, StandardScaler)`` pair, ready to be
        passed to ``preprocess_X(..., transformers=...)``.
    """
    # NOTE: PolynomialFeatures needs to be done before scaling - https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9
    poly   = PolynomialFeatures(degree=degree).fit(X)
    scaler = StandardScaler().fit(poly.transform(X))
    return poly, scaler


def preprocess_X(X, degree=1, transformers=None):
    """Expand ``X`` into polynomial features, then standardise them.

    Args:
        X: feature table to transform.
        degree: polynomial degree; only used when ``transformers`` is None.
        transformers: optional pair returned by ``fit_preprocessor``. When
            given, ``X`` is only transformed, so it is scaled with the
            statistics of the data the pair was fitted on. When omitted, the
            transformers are fitted on ``X`` itself (the previous behaviour).

    Returns:
        The transformed feature matrix.
    """
    if transformers is None:
        transformers = fit_preprocessor(X, degree)
    poly, scaler = transformers
    return scaler.transform(poly.transform(X))


columns = test_df.columns
# Fit the preprocessing once, on the training features, and reuse it for the
# test features. Fitting a second scaler on the test set would standardise it
# with different statistics from the ones the models were trained on.
transformers = fit_preprocessor(train_df[columns])
X_test  = preprocess_X(test_df[columns],  transformers=transformers)
X       = preprocess_X(train_df[columns], transformers=transformers)
Y       = train_df['target']
X_train, X_valid, Y_train, Y_valid = sklearn.model_selection.train_test_split(X, Y, test_size=0.05, random_state=42)

print('X_train.shape', X_train.shape)
print('Y_train.shape', Y_train.shape)

# Linear Models

Let's try out each of the Linear Models from scikit-learn in a loop
- https://scikit-learn.org/0.20/modules/classes.html#module-sklearn.linear_model

In [None]:
# Lazy Load models to avoid out of memory errors
# Each entry is an (estimator class, constructor kwargs) pair. Only the class is
# stored here; the estimator is instantiated when it is fitted (lazy loading, to
# avoid out-of-memory errors).
models = [
    # Bayesian ARD regression.
    (sklearn.linear_model.ARDRegression, dict()),

    # Bayesian ridge regression.
    (sklearn.linear_model.BayesianRidge, dict()),

    # Linear regression with combined L1 and L2 priors as regularizer.
    (sklearn.linear_model.ElasticNet, dict()),

    # Linear regression model that is robust to outliers.
    (sklearn.linear_model.HuberRegressor, dict(max_iter=1000)),

    # Least Angle Regression model.
    (sklearn.linear_model.Lars, dict()),

    # Cross-validated Least Angle Regression model.
    (sklearn.linear_model.LarsCV, dict()),

    # Linear Model trained with L1 prior as regularizer (aka the Lasso).
    (sklearn.linear_model.Lasso, dict()),

    # Lasso linear model with iterative fitting along a regularization path.
    (sklearn.linear_model.LassoCV, dict(max_iter=10_000)),

    # Lasso model fit with Least Angle Regression.
    (sklearn.linear_model.LassoLars, dict()),

    # Cross-validated Lasso, using the LARS algorithm.
    (sklearn.linear_model.LassoLarsCV, dict()),

    # Lasso model fit with Lars using BIC or AIC for model selection.
    (sklearn.linear_model.LassoLarsIC, dict()),

    # Ordinary least squares Linear Regression.
    (sklearn.linear_model.LinearRegression, dict()),

    # Orthogonal Matching Pursuit model (OMP).
    (sklearn.linear_model.OrthogonalMatchingPursuit, dict()),

    # Cross-validated Orthogonal Matching Pursuit model (OMP).
    (sklearn.linear_model.OrthogonalMatchingPursuitCV, dict()),

    # Passive Aggressive Regressor.  # Poor Score
    (sklearn.linear_model.PassiveAggressiveRegressor, dict()),

    # RANSAC (RANdom SAmple Consensus) algorithm.  # Poor Score
    (sklearn.linear_model.RANSACRegressor, dict()),

    # Linear least squares with l2 regularization.
    (sklearn.linear_model.Ridge, dict()),

    # Ridge regression with built-in cross-validation.
    (sklearn.linear_model.RidgeCV, dict()),

    # Linear model fitted by minimizing a regularized empirical loss with SGD.
    (sklearn.linear_model.SGDRegressor, dict()),

    # Deliberately left out of the sweep:
    # - ElasticNetCV (was tried with max_iter=10_000): Elastic Net model with
    #   iterative fitting along a regularization path.
    # - Classifiers: LogisticRegression, LogisticRegressionCV,
    #   PassiveAggressiveClassifier, Perceptron, RidgeClassifier,
    #   RidgeClassifierCV, SGDClassifier.
    # - Multi-task models: MultiTaskLasso, MultiTaskElasticNet,
    #   MultiTaskLassoCV, MultiTaskElasticNetCV.
    # - TheilSenRegressor: robust multivariate regression model.  # Cause OOM Exception
    # - Path / solver functions, which take (X, y) directly instead of being
    #   estimator classes: enet_path, lars_path, lasso_path,
    #   logistic_regression_path, orthogonal_mp, orthogonal_m_gram [sic],
    #   ridge_regression.
]

In [None]:
%%time
def fit_predict(model_class, kwargs, verbose=True):
    """Instantiate, fit and evaluate a single estimator.

    Reads the notebook globals ``X_train``/``Y_train`` (fitting),
    ``X_valid``/``Y_valid`` (scoring) and ``X_test`` (final prediction).

    Args:
        model_class: estimator class, instantiated as ``model_class(**kwargs)``.
        kwargs: dict of constructor keyword arguments.
        verbose: when true, print the estimator name before fitting.

    Returns:
        A ``(name, rmse, prediction)`` tuple: the class name, the RMSE on the
        validation split, and the predictions for ``X_test``.
    """
    name = model_class.__name__
    if verbose:
        print(name)
    model = model_class(**kwargs)
    model.fit(X_train, Y_train)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root of the MSE gives the same RMSE
    # on every version.
    mse        = sklearn.metrics.mean_squared_error(Y_valid, model.predict(X_valid))
    rmse       = np.sqrt(mse)
    prediction = model.predict(X_test)
    return name, rmse, prediction


# Fit every candidate model, collecting its validation RMSE and its test-set
# predictions keyed by the estimator class name.
scores      = {}
predictions = {}
for model_class, kwargs in models:
    try:
        name, rmse, prediction = fit_predict(model_class, kwargs)
    except Exception as error:
        # Best effort: one failing estimator must not abort the whole sweep, but
        # say why it failed. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt / SystemExit stop the cell.
        print('ERROR', model_class.__name__, f'{type(error).__name__}: {error}')
    else:
        scores[name]      = rmse
        predictions[name] = prediction

In [None]:
# Re-key the scores in ascending order of validation RMSE (best model first),
# then show the ranking as the cell output.
scores = {name: scores[name] for name in sorted(scores, key=scores.get)}
scores

# Ensemble

In [None]:
# Only models whose validation RMSE is at most this value join the ensemble.
max_rmse = 0.8
selected = [name for name, score in scores.items() if score <= max_rmse]
if not selected:
    # np.mean over an empty list would quietly produce NaN, and that NaN would
    # then be written to the submission file.
    raise ValueError(f'No model reached a validation RMSE <= {max_rmse}; nothing to ensemble')

# Unweighted average of the selected models' test-set predictions.
Y_test = np.mean([predictions[name] for name in selected], axis=0)
print('Y_test.shape', Y_test.shape)

# Submission

In [None]:
# Start from the sample submission (indexed by `id`), overwrite its `target`
# column with the ensemble prediction, and write the result to disk.
submission_df = pd.read_csv(
    '../input/tabular-playground-series-jan-2021/sample_submission.csv',
    index_col='id',
).assign(target=Y_test)
submission_df.to_csv('submission.csv')
!head submission.csv

# Further Reading

This notebook is part of a series exploring the [Tabular Playground](https://www.kaggle.com/c/tabular-playground-series-jan-2021)
- 0.72935 - [scikit-learn Ensemble](https://www.kaggle.com/jamesmcguigan/tabular-playground-scikit-learn-ensemble)
- 0.71423 - [Fast.ai Tabular Solver](https://www.kaggle.com/jamesmcguigan/fast-ai-tabular-solver)
- 0.70426 - [XGBoost](https://www.kaggle.com/jamesmcguigan/tabular-playground-xgboost)