In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

%pip install mlxtend

In [2]:
df = pd.read_csv("Life Expectancy Data.csv")

In [3]:
y = df["Life expectancy "]
X =  df.drop(["Life expectancy "], axis=1)

In [4]:
y.fillna(y.median(), inplace=True)

In [5]:
X.Year = pd.to_datetime(X.Year).dt.year

In [6]:
bin_enc = ce.BinaryEncoder(drop_invariant=True)
X = bin_enc.fit_transform(X) 

In [7]:
X.fillna(X.mean(), inplace=True)

In [8]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.30, random_state=9)

## Random fores (bagging group)

Several independent models are created based on the same algorithm, but on different datasets, using a random subsample from a common dataset. Then the predictions of all these models are combined into one common prediction by averaging or voting.

In [9]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
prediction = rfr.predict(X_test)

In [10]:
params = { 
 'max_depth' : [10, 15, 20],
}

grid = GridSearchCV(estimator=RandomForestRegressor(), param_grid=params, cv= 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [10, 15, 20]})

In [11]:
grid.best_score_, grid.best_params_

(0.9560259403650033, {'max_depth': 20})

In [12]:
rfr = RandomForestRegressor(max_depth=15, random_state=0)
rfr.fit(X_train, y_train)
prediction = rfr.predict(X_test)
mean_squared_error(y_test, prediction)

3.780427824778443

## Boosting group

Sequential training of weak models to create a strong model. In boosting, each model learns from a data set that has been adjusted based on the errors of the previous model. Thus, each subsequent model focuses on those objects on which the previous model made a mistake, and tries to correct the errors. As a result, we get a strong model that can give more accurate forecasts than each of the weak models individually.

In [13]:
params = {
    'n_estimators': [500, 800],
    'max_depth': [5, 8],
    'min_samples_split': [2, 5],
    'learning_rate': [0.01, 0.1]
}
grid = GridSearchCV(GradientBoostingRegressor(), param_grid=params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1], 'max_depth': [5, 8],
                         'min_samples_split': [2, 5],
                         'n_estimators': [500, 800]})

In [14]:
grid.best_score_, grid.best_params_

(0.9572816094106351,
 {'learning_rate': 0.1,
  'max_depth': 5,
  'min_samples_split': 5,
  'n_estimators': 800})

In [15]:
gbr = GradientBoostingRegressor(**grid.best_params_)
gbr.fit(X_train, y_train)
prediction = gbr.predict(X_test)

In [16]:
mean_squared_error(y_test, prediction)

3.428038590524013

## Stacking group

We train several models on a training dataset, then use them to create predictions on a test dataset. Then we collect predictions from the underlying models and use them as input to the meta-model. The meta-model is trained on this data to improve the performance of the model.

In [17]:
reg1 = RandomForestRegressor(random_state=42)
reg2 = GradientBoostingRegressor(random_state=42)
reg3 = LinearRegression()
meta_learner = LinearRegression()
sr = StackingCVRegressor(regressors=[reg1, reg2, reg3], meta_regressor=meta_learner)
params = {
    'randomforestregressor__max_depth': [5, 8]
}

grid = GridSearchCV(estimator=sr, param_grid=params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=StackingCVRegressor(meta_regressor=LinearRegression(),
                                           regressors=[RandomForestRegressor(random_state=42),
                                                       GradientBoostingRegressor(random_state=42),
                                                       LinearRegression()]),
             n_jobs=-1,
             param_grid={'randomforestregressor__max_depth': [5, 8]})

In [18]:
sr.get_params().keys()

dict_keys(['cv', 'meta_regressor__copy_X', 'meta_regressor__fit_intercept', 'meta_regressor__n_jobs', 'meta_regressor__normalize', 'meta_regressor__positive', 'meta_regressor', 'multi_output', 'n_jobs', 'pre_dispatch', 'random_state', 'refit', 'regressors', 'shuffle', 'store_train_meta_features', 'use_features_in_secondary', 'verbose', 'randomforestregressor', 'gradientboostingregressor', 'linearregression', 'randomforestregressor__bootstrap', 'randomforestregressor__ccp_alpha', 'randomforestregressor__criterion', 'randomforestregressor__max_depth', 'randomforestregressor__max_features', 'randomforestregressor__max_leaf_nodes', 'randomforestregressor__max_samples', 'randomforestregressor__min_impurity_decrease', 'randomforestregressor__min_samples_leaf', 'randomforestregressor__min_samples_split', 'randomforestregressor__min_weight_fraction_leaf', 'randomforestregressor__n_estimators', 'randomforestregressor__n_jobs', 'randomforestregressor__oob_score', 'randomforestregressor__random_s

In [19]:
grid.best_score_, grid.best_params_

(0.9501129846734061, {'randomforestregressor__max_depth': 8})

In [20]:
reg1 = RandomForestRegressor(random_state=42, max_depth=8)

In [21]:
sr = StackingCVRegressor(regressors=[reg1, reg2, reg3], meta_regressor=meta_learner)
sr.fit(X_train, y_train)
prediction = sr.predict(X_test)



In [22]:
mean_squared_error(y_test, prediction)

4.43231527722557