In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
        
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# 1. loading data

In [None]:
# Read the battery RUL dataset, report its (rows, columns) shape and preview the first rows.
csv_path = "/kaggle/input/battery-remaining-useful-life-rul/Battery_RUL.csv"
battery_df = pd.read_csv(csv_path)
print(battery_df.shape)
battery_df.head()


# 2. data description
- 14 NMC-LCO 18650 batteries, 2.8 Ah each
- each battery was cycled more than 1000 times at 25°C, with a 0.5C charge rate and a 1.5C discharge rate
- Cycle Index: number of cycle
- Discharge Time (s)
- Decrement 3.6-3.4V (s)
- Max. Voltage Discharge (V)
- Min. Voltage Charge (V)
- Time at 4.15V (s)
- Time Constant Current (s)
- Charging Time (s)
- RUL: Remaining Useful Life (the prediction target)

Reference: https://github.com/ignavinuales/Battery_RUL_Prediction?tab=readme-ov-file

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
battery_df.info()
# Author's observation from that output: no missing data

In [None]:
# Summary statistics of every numeric column (displayed as the cell output).
battery_df.describe()

# 3. EDA

In [None]:
# Histogram of the target column (RUL) with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(4, 3))
sns.histplot(battery_df['RUL'], kde=True, ax=ax)
ax.set_title('RUL, Remaining Useful Time Histogram')
plt.show()
# it is not a normal distribution.

In [None]:
# data distribution
# Histogram + KDE of the target and of every feature, one panel per column.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is the
# replacement seaborn documents for the same histogram-with-density-curve view.

dist_columns = [
    'RUL',  # target
    'Discharge Time (s)',
    'Decrement 3.6-3.4V (s)',
    'Max. Voltage Dischar. (V)',
    'Min. Voltage Charg. (V)',
    'Time at 4.15V (s)',
    'Time constant current (s)',
    'Charging time (s)',
]

fig, ax = plt.subplots(ncols=3, nrows=3, figsize=(12, 18))

for panel, column in zip(ax.flat, dist_columns):
    sns.histplot(battery_df[column], kde=True, stat='density', ax=panel)

# The 3x3 grid has nine panels but only eight columns are plotted: hide the spare one
# instead of showing an empty set of axes.
for panel in ax.flat[len(dist_columns):]:
    panel.set_visible(False)

plt.show()

In [None]:
# correlation

corrmat = battery_df.corr()
plt.figure(figsize=(6, 6))

# Keep the k columns most positively correlated with RUL (RUL itself ranks first).
k = 9
cols = corrmat.nlargest(k, 'RUL')['RUL'].index

# Correlation matrix of just those columns, in ranked order (columns are the variables).
cm = np.corrcoef(battery_df[cols].to_numpy(), rowvar=False)

sns.set(font_scale=1.4)
heatmap_kwargs = dict(
    cbar=False, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
    yticklabels=cols.values, xticklabels=cols.values, cmap="rainbow",
)
hm = sns.heatmap(cm, **heatmap_kwargs)

# Column labels go on top, rotated so the long names do not overlap.
hm.xaxis.tick_top()
plt.xticks(rotation=45, ha='left')
plt.show()

In [None]:
# Pairwise scatter plots with regression lines (KDE on the diagonal) for the target
# and all features.
# `sns.pairplot` builds its own figure, so the former `plt.figure(figsize=(30, 30))`
# call only produced an extra empty figure and has been dropped.
sns.set(font_scale=1.0)
pair_columns = ['RUL', 'Max. Voltage Dischar. (V)', 'Time at 4.15V (s)',
                'Time constant current (s)', 'Charging time (s)', 'Decrement 3.6-3.4V (s)',
                'Discharge Time (s)', 'Min. Voltage Charg. (V)']
sns.pairplot(battery_df[pair_columns], diag_kind='kde', kind='reg')
plt.show()

In [None]:
# interactive graph
# requires plotly: run `pip install plotly` if it is not already installed

In [None]:
# import plotly, interactive graph
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [None]:
# interactive histogram of RUL (Plotly, 240 bins)

import plotly.express as px
fig = px.histogram(data_frame = battery_df, x="RUL", nbins=240)
fig.show()
# Author's reading of the histogram: "nodes" (presumably the breaks between groups)
# around RUL 220~224 and 735~739; 222 and 737 are the cut points used in the next cell.
# 1st node 220~224 -> 222, 2nd node 735~739 -> 737

In [None]:
# Split the data into three RUL bands at the two cut points (222 and 737),
# aiming for a more normal-looking target distribution inside each band.
rul = battery_df['RUL']
battery_1st = battery_df[rul <= 222]
battery_2nd = battery_df[(rul > 222) & (rul <= 737)]
battery_3rd = battery_df[rul > 737]
print(battery_1st.shape, battery_2nd.shape, battery_3rd.shape)
battery_1st.describe()

In [None]:
# RUL histogram (30 bins, KDE overlay) for each of the three bands, side by side.
fig, ax = plt.subplots(ncols=3, figsize=(16, 3))

for panel, band in zip(ax, (battery_1st, battery_2nd, battery_3rd)):
    sns.histplot(band['RUL'], kde=True, bins=30, ax=panel)

plt.show()

In [None]:
# Max. discharge voltage against RUL, with a fitted regression line, for each band.
# `x` and `y` are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the former positional call raises a TypeError there.
fig, ax = plt.subplots(ncols=3, figsize=(16, 3))

for panel, band in zip(ax, (battery_1st, battery_2nd, battery_3rd)):
    sns.regplot(x=band['RUL'], y=band['Max. Voltage Dischar. (V)'], ax=panel)

plt.show()

# 4. modeling with whole data

In [None]:
# RUL is the target; the features are all remaining columns except the cycle index.
target = battery_df['RUL']
feature = battery_df.drop(columns=['RUL', 'Cycle_Index'])

target.shape, feature.shape

In [None]:
# standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# NOTE(review): the scaler is fitted on ALL rows here, before the train/test split in
# the next cell, so the test rows contribute to the scaling statistics. Fitting on
# the training rows only (then transforming the test rows) would avoid that leakage.
feature_std = scaler.fit_transform(feature)

# fit_transform returns a NumPy array; wrap it in a DataFrame to restore the column names.
feature_std = pd.DataFrame(feature_std, columns = feature.columns)
feature_std.head()

In [None]:
# split into train and test data
from sklearn.model_selection import (train_test_split, StratifiedKFold)

# 80/20 split of the standardized features and the target, with a fixed seed.
split_parts = train_test_split(feature_std, target, test_size=0.2, random_state=2404)
X_train, X_test, y_train, y_test = split_parts
tuple(part.shape for part in split_parts)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Preprocessing shared by all models: fill missing values with the column mean.
preprocessing_steps = [('impute', SimpleImputer(strategy='mean'))]
pipeline = Pipeline(steps=preprocessing_steps)
pipeline

In [None]:
def prepare_model(algorithm, X_train, y_train):
    """Fit a preprocessing + estimator pipeline and return it.

    Parameters
    ----------
    algorithm : scikit-learn estimator
        Estimator used as the final step. An unfitted copy is trained, so the
        object passed in is left untouched.
    X_train, y_train
        Training features and target, passed straight to ``Pipeline.fit``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Fitted pipeline with the steps ``'preprocessing'`` and ``'algorithm'``.
    """
    from sklearn.base import clone

    # Clone both steps: without this every returned model would share the single
    # module-level `pipeline` object (and the caller's estimator instance), so a
    # later call would silently refit the steps of an earlier returned model.
    model = Pipeline(steps=[('preprocessing', clone(pipeline)),
                            ('algorithm', clone(algorithm))])
    model.fit(X_train, y_train)
    return model

In [None]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
import time
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Candidate regressors, all with default hyper-parameters. Every estimator that uses
# random numbers gets the same fixed seed so the benchmark results are reproducible
# from run to run; SVR, LinearRegression and KNeighborsRegressor take no seed.
ALGORITHM_SEED = 2404
algorithms = [RandomForestRegressor(random_state=ALGORITHM_SEED),
              AdaBoostRegressor(random_state=ALGORITHM_SEED),
              GradientBoostingRegressor(random_state=ALGORITHM_SEED),
              BaggingRegressor(random_state=ALGORITHM_SEED),
              SVR(),
              DecisionTreeRegressor(random_state=ALGORITHM_SEED),
              ExtraTreeRegressor(random_state=ALGORITHM_SEED),
              LinearRegression(),
              SGDRegressor(random_state=ALGORITHM_SEED),
              KNeighborsRegressor()]

In [None]:
# Benchmark every candidate on the whole-data split: elapsed time for fit + predict,
# plus MSE and RMSE on the test rows.
names, times, mse, rmse = [], [], [], []

for algorithm in algorithms:
    names.append(type(algorithm).__name__)

    start_time = time.time()
    model = prepare_model(algorithm, X_train, y_train)
    pred = model.predict(X_test)
    times.append(time.time() - start_time)

    test_mse = mean_squared_error(y_test, pred)
    mse.append(test_mse)
    rmse.append(np.sqrt(test_mse))

print('Regression Results in Algorithms')
results_dict = {'Algorithm': names, 'MSE': mse, 'RMSE': rmse, 'Time': times}
# Best (lowest RMSE) first.
pd.DataFrame(results_dict).sort_values(by='RMSE', ascending=True)

# Regression Results in Algorithms
#   Algorithm	                 MSE	    RMSE	    Time
# 0	RandomForestRegressor	     538.368288	23.202765	6.218842
# 3	BaggingRegressor	         611.349974	24.725492	0.624315
# 5	DecisionTreeRegressor	     955.865002	30.917067	0.103306
# 6	ExtraTreeRegressor	         960.425241	30.990728	0.033131
# 9	KNeighborsRegressor	        1287.032924	35.875241	0.110578
# 2	GradientBoostingRegressor	2036.129743	45.123494	2.246836
# 1	AdaBoostRegressor	        5109.004892	71.477303	0.959565
# 4	SVR	                       10621.652715 103.061403	11.036764
# 7	LinearRegression	       19248.315344	138.738298	0.023962
# 8	SGDRegressor	           20069.864748	141.668150	0.051851

# 5. modeling with divided data

In [None]:
# Target / feature separation for each RUL band, dropping the same two columns
# as for the whole data.
drop_cols = ['RUL', 'Cycle_Index']

target_1st = battery_1st['RUL']
feature_1st = battery_1st.drop(columns=drop_cols)

target_2nd = battery_2nd['RUL']
feature_2nd = battery_2nd.drop(columns=drop_cols)

target_3rd = battery_3rd['RUL']
feature_3rd = battery_3rd.drop(columns=drop_cols)

(target_1st.shape, feature_1st.shape, target_2nd.shape, feature_2nd.shape,
 target_3rd.shape, feature_3rd.shape)

In [None]:
# standardization
# The single `scaler` object is refitted on each band in turn, so after this cell it
# holds the statistics of the third band only.
# NOTE(review): each fit uses all rows of the band, before the band is split into
# train and test, so the test rows contribute to the scaling statistics.
# The DataFrames below are built without an index, so they get a fresh 0..n-1 index
# while target_1st/2nd/3rd keep the row labels of battery_df; train_test_split pairs
# rows by position, so features and targets should still line up - confirm if the
# indexes are ever used to join the two.

feature_1st_std = scaler.fit_transform(feature_1st)
feature_1st_std = pd.DataFrame(feature_1st_std, columns = feature_1st.columns)

feature_2nd_std = scaler.fit_transform(feature_2nd)
feature_2nd_std = pd.DataFrame(feature_2nd_std, columns = feature_2nd.columns)

feature_3rd_std = scaler.fit_transform(feature_3rd)
feature_3rd_std = pd.DataFrame(feature_3rd_std, columns = feature_3rd.columns)

feature_1st_std.head()

In [None]:
# 1. train_1st and test_1st data

# 80/20 split of the first band, same seed as the whole-data split.
split_parts_1st = train_test_split(feature_1st_std, target_1st,
                                   test_size=0.2, random_state=2404)
X_train_1st, X_test_1st, y_train_1st, y_test_1st = split_parts_1st
tuple(part.shape for part in split_parts_1st)

In [None]:
# Benchmark every candidate on the first band: elapsed time for fit + predict,
# plus MSE and RMSE on that band's test rows.
names, times, mse, rmse = [], [], [], []

for algorithm in algorithms:
    names.append(type(algorithm).__name__)

    start_time = time.time()
    model = prepare_model(algorithm, X_train_1st, y_train_1st)
    pred_1st = model.predict(X_test_1st)
    times.append(time.time() - start_time)

    test_mse = mean_squared_error(y_test_1st, pred_1st)
    mse.append(test_mse)
    rmse.append(np.sqrt(test_mse))

print('Regression Results in Algorithms')
results_dict = {'Algorithm': names, 'MSE': mse, 'RMSE': rmse, 'Time': times}
# Best (lowest RMSE) first.
pd.DataFrame(results_dict).sort_values(by='RMSE', ascending=True)

# Regression Results in Algorithms
#   Algorithm	                MSE	        RMSE	    Time
# 0	RandomForestRegressor	    345.004081	18.574285	1.172915
# 3	BaggingRegressor	        414.997253	20.371481	0.124150
# 5	DecisionTreeRegressor	    586.092105	24.209339	0.021806
# 9	KNeighborsRegressor	        728.754868	26.995460	0.013940
# 6	ExtraTreeRegressor	        739.424342	27.192358	0.011303
# 2	GradientBoostingRegressor	803.817260	28.351671	0.451615
# 1	AdaBoostRegressor	       1539.738153	39.239497	0.081682
# 4	SVR	                       2282.026203	47.770558	0.436830
# 7	LinearRegression	       3031.695185	55.060832	0.009026
# 8	SGDRegressor	           3169.724326	56.300305	0.013640

In [None]:
# 2. train_2nd and test_2nd data

# 80/20 split of the second band, same seed as the whole-data split.
split_parts_2nd = train_test_split(feature_2nd_std, target_2nd,
                                   test_size=0.2, random_state=2404)
X_train_2nd, X_test_2nd, y_train_2nd, y_test_2nd = split_parts_2nd
tuple(part.shape for part in split_parts_2nd)

In [None]:
# Benchmark every candidate on the second band: elapsed time for fit + predict,
# plus MSE and RMSE on that band's test rows.
names, times, mse, rmse = [], [], [], []

for algorithm in algorithms:
    names.append(type(algorithm).__name__)

    start_time = time.time()
    model = prepare_model(algorithm, X_train_2nd, y_train_2nd)
    pred_2nd = model.predict(X_test_2nd)
    times.append(time.time() - start_time)

    test_mse = mean_squared_error(y_test_2nd, pred_2nd)
    mse.append(test_mse)
    rmse.append(np.sqrt(test_mse))

print('Regression Results in Algorithms')
results_dict = {'Algorithm': names, 'MSE': mse, 'RMSE': rmse, 'Time': times}
# Best (lowest RMSE) first.
pd.DataFrame(results_dict).sort_values(by='RMSE', ascending=True)

# Regression Results in Algorithms
# 	Algorithm	                 MSE	    RMSE	    Time
# 0	RandomForestRegressor	     390.894638	19.771056	2.675439
# 3	BaggingRegressor	         458.986482	21.423970	0.282624
# 9	KNeighborsRegressor	         633.641583	25.172238	0.066502
# 5	DecisionTreeRegressor	     645.886331	25.414294	0.048098
# 6	ExtraTreeRegressor	         723.349640	26.895160	0.018760
# 2	GradientBoostingRegressor	1175.601208	34.287041	1.022697
# 1	AdaBoostRegressor	        2460.396731	49.602386	0.533053
# 4	SVR	                        4643.163842	68.140765	2.286361
# 7	LinearRegression	        4821.340717	69.435875	0.007912
# 8	SGDRegressor	            5021.980598	70.865934	0.154626

In [None]:
# 3. train_3rd and test_3rd data

# 80/20 split of the third band, same seed as the whole-data split.
split_parts_3rd = train_test_split(feature_3rd_std, target_3rd,
                                   test_size=0.2, random_state=2404)
X_train_3rd, X_test_3rd, y_train_3rd, y_test_3rd = split_parts_3rd
tuple(part.shape for part in split_parts_3rd)

In [None]:
# Benchmark every candidate on the third band: elapsed time for fit + predict,
# plus MSE and RMSE on that band's test rows.
names, times, mse, rmse = [], [], [], []

for algorithm in algorithms:
    names.append(type(algorithm).__name__)

    start_time = time.time()
    model = prepare_model(algorithm, X_train_3rd, y_train_3rd)
    pred_3rd = model.predict(X_test_3rd)
    times.append(time.time() - start_time)

    test_mse = mean_squared_error(y_test_3rd, pred_3rd)
    mse.append(test_mse)
    rmse.append(np.sqrt(test_mse))

print('Regression Results in Algorithms')
results_dict = {'Algorithm': names, 'MSE': mse, 'RMSE': rmse, 'Time': times}
# Best (lowest RMSE) first.
pd.DataFrame(results_dict).sort_values(by='RMSE', ascending=True)

# Regression Results in Algorithms
# 	Algorithm	                 MSE	    RMSE	    Time
# 0	RandomForestRegressor	     57.666043	7.593816	1.861941
# 3	BaggingRegressor	         62.622716	7.913452	0.196703
# 5	DecisionTreeRegressor	    114.413471	10.696423	0.032839
# 6	ExtraTreeRegressor	        193.860865	13.923393	0.014690
# 2	GradientBoostingRegressor	202.137766	14.217516	0.748458
# 9	KNeighborsRegressor	        259.193392	16.099484	0.020833
# 1	AdaBoostRegressor	        483.955516	21.998989	0.388765
# 4	SVR	                       3494.438425	59.113775	1.233212
# 7	LinearRegression	       7432.243440	86.210460	0.007544
# 8	SGDRegressor	           7540.065267	86.833549	0.017617

# 6. model-1, RandomForestRegressor Comparison

In [None]:
# whole data
# Seeded 100-tree random forest; prints the train score, the test score and the test RMSE.
rfr = RandomForestRegressor(random_state=2404, n_estimators=100)
rfr.fit(X_train, y_train)

rfr_pred = rfr.predict(X_test)
rfr_rmse = np.sqrt(mean_squared_error(y_test, rfr_pred))

for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(rfr.score(X_part, y_part))
print(rfr_rmse)

In [None]:
# 1. divided data - feature_1st
# Refit the same forest object on the first band; prints train score, test score, test RMSE.

rfr.fit(X_train_1st, y_train_1st)

rfr_pred_1st = rfr.predict(X_test_1st)
rfr_rmse_1st = np.sqrt(mean_squared_error(y_test_1st, rfr_pred_1st))

for X_part, y_part in ((X_train_1st, y_train_1st), (X_test_1st, y_test_1st)):
    print(rfr.score(X_part, y_part))
print(rfr_rmse_1st)

In [None]:
# 2. divided data - feature_2nd
# Refit the same forest object on the second band; prints train score, test score, test RMSE.

rfr.fit(X_train_2nd, y_train_2nd)

rfr_pred_2nd = rfr.predict(X_test_2nd)
rfr_rmse_2nd = np.sqrt(mean_squared_error(y_test_2nd, rfr_pred_2nd))

for X_part, y_part in ((X_train_2nd, y_train_2nd), (X_test_2nd, y_test_2nd)):
    print(rfr.score(X_part, y_part))
print(rfr_rmse_2nd)

In [None]:
# 3. divided data - feature_3rd
# Refit the same forest object on the third band; prints train score, test score, test RMSE.

rfr.fit(X_train_3rd, y_train_3rd)

rfr_pred_3rd = rfr.predict(X_test_3rd)
rfr_rmse_3rd = np.sqrt(mean_squared_error(y_test_3rd, rfr_pred_3rd))

for X_part, y_part in ((X_train_3rd, y_train_3rd), (X_test_3rd, y_test_3rd)):
    print(rfr.score(X_part, y_part))
print(rfr_rmse_3rd)

In [None]:
# Random forest: actual RUL (y) against predicted RUL (x) for the whole data and
# for bands 1-3, left to right.
fig, ax = plt.subplots(ncols=4, figsize=(18, 3))

rfr_panels = [(rfr_pred, y_test), (rfr_pred_1st, y_test_1st),
              (rfr_pred_2nd, y_test_2nd), (rfr_pred_3rd, y_test_3rd)]
for panel, (predicted, actual) in zip(ax, rfr_panels):
    sns.regplot(x=predicted, y=actual, ax=panel)

plt.show()

# 7. feature importance

In [None]:
# `rfr` was last refitted on the third RUL band, so reading its importances here
# would describe that band only. Fit a separate forest (same settings) on the
# whole-data split instead, and leave `rfr` untouched for the cells that follow.
rfr_whole = RandomForestRegressor(random_state=2404, n_estimators=100)
rfr_whole.fit(X_train, y_train)

# Horizontal bars, most important feature at the top (barh draws bottom-up).
importances = rfr_whole.feature_importances_
sorted_idx = importances.argsort()
plt.barh(X_train.columns[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
plt.show()

# 8. model-2, BaggingRegressor comparison

In [None]:
# whole data
# Seeded bagging regressor; prints the train score, the test score and the test RMSE.
br = BaggingRegressor(random_state=2404)
br.fit(X_train, y_train)

br_pred = br.predict(X_test)
br_rmse = np.sqrt(mean_squared_error(y_test, br_pred))

print(br.score(X_train, y_train), br.score(X_test, y_test), br_rmse, sep='\n')

In [None]:
# 1. divided data - feature_1st
# Refit the same bagging object on the first band; prints train score, test score, test RMSE.

br.fit(X_train_1st, y_train_1st)

br_pred_1st = br.predict(X_test_1st)
br_rmse_1st = np.sqrt(mean_squared_error(y_test_1st, br_pred_1st))

print(br.score(X_train_1st, y_train_1st), br.score(X_test_1st, y_test_1st),
      br_rmse_1st, sep='\n')

In [None]:
# 2. divided data - feature_2nd
# Refit the same bagging object on the second band; prints train score, test score, test RMSE.

br.fit(X_train_2nd, y_train_2nd)

br_pred_2nd = br.predict(X_test_2nd)
br_rmse_2nd = np.sqrt(mean_squared_error(y_test_2nd, br_pred_2nd))

print(br.score(X_train_2nd, y_train_2nd), br.score(X_test_2nd, y_test_2nd),
      br_rmse_2nd, sep='\n')

In [None]:
# 3. divided data - feature_3rd
# Refit the same bagging object on the third band; prints train score, test score, test RMSE.

br.fit(X_train_3rd, y_train_3rd)

br_pred_3rd = br.predict(X_test_3rd)
br_rmse_3rd = np.sqrt(mean_squared_error(y_test_3rd, br_pred_3rd))

print(br.score(X_train_3rd, y_train_3rd), br.score(X_test_3rd, y_test_3rd),
      br_rmse_3rd, sep='\n')

In [None]:
# Bagging regressor: actual RUL (y) against predicted RUL (x) for the whole data
# and for bands 1-3, left to right.
fig, ax = plt.subplots(ncols=4, figsize=(18, 3))

br_panels = [(br_pred, y_test), (br_pred_1st, y_test_1st),
             (br_pred_2nd, y_test_2nd), (br_pred_3rd, y_test_3rd)]
for panel, (predicted, actual) in zip(ax, br_panels):
    sns.regplot(x=predicted, y=actual, ax=panel)

plt.show()

# 9. model-3, DecisionTreeRegressor Comparison

In [None]:
# whole data
# Seeded decision tree; prints the train score, the test score and the test RMSE.
dtr = DecisionTreeRegressor(random_state=2404)
# Fitted once: the original repeated this exact call on the next line, which only
# retrained the same seeded tree on the same data.
dtr.fit(X_train, y_train)
print(dtr.score(X_train, y_train))
print(dtr.score(X_test, y_test))

dtr_pred = dtr.predict(X_test)
dtr_rmse = np.sqrt(mean_squared_error(y_test, dtr_pred))
print(dtr_rmse)

In [None]:
# 1. divided data - feature_1st
# Refit the same tree object on the first band; prints train score, test score, test RMSE.

dtr.fit(X_train_1st, y_train_1st)
print(dtr.score(X_train_1st, y_train_1st))
print(dtr.score(X_test_1st, y_test_1st))

dtr_pred_1st = dtr.predict(X_test_1st)
# Score the decision tree's own predictions: the original passed `br_pred_1st` here,
# so the printed value was the bagging model's RMSE, not the tree's.
dtr_rmse_1st = np.sqrt(mean_squared_error(y_test_1st, dtr_pred_1st))
print(dtr_rmse_1st)

In [None]:
# 2. divided data - feature_2nd
# Refit the same tree object on the second band; prints train score, test score, test RMSE.

dtr.fit(X_train_2nd, y_train_2nd)

dtr_pred_2nd = dtr.predict(X_test_2nd)
dtr_rmse_2nd = np.sqrt(mean_squared_error(y_test_2nd, dtr_pred_2nd))

for X_part, y_part in ((X_train_2nd, y_train_2nd), (X_test_2nd, y_test_2nd)):
    print(dtr.score(X_part, y_part))
print(dtr_rmse_2nd)

In [None]:
# 3. divided data - feature_3rd
# Refit the same tree object on the third band; prints train score, test score, test RMSE.

dtr.fit(X_train_3rd, y_train_3rd)

dtr_pred_3rd = dtr.predict(X_test_3rd)
dtr_rmse_3rd = np.sqrt(mean_squared_error(y_test_3rd, dtr_pred_3rd))

for X_part, y_part in ((X_train_3rd, y_train_3rd), (X_test_3rd, y_test_3rd)):
    print(dtr.score(X_part, y_part))
print(dtr_rmse_3rd)

In [None]:
# Decision tree: actual RUL (y) against predicted RUL (x) for the whole data and
# for bands 1-3, left to right.
fig, ax = plt.subplots(ncols=4, figsize=(18, 3))

dtr_panels = [(dtr_pred, y_test), (dtr_pred_1st, y_test_1st),
              (dtr_pred_2nd, y_test_2nd), (dtr_pred_3rd, y_test_3rd)]
for panel, (predicted, actual) in zip(ax, dtr_panels):
    sns.regplot(x=predicted, y=actual, ax=panel)

plt.show()

# 10. weight balanced Model

In [None]:
# Fixed-weight blend of the three whole-data models: forest 6, bagging 3, tree 1 (out of 10).
w_rfr, w_br, w_dtr = 6, 3, 1
whole_preds = rfr_pred * w_rfr / 10 + br_pred * w_br / 10 + dtr_pred * w_dtr / 10
whole_rmse = np.sqrt(mean_squared_error(y_test, whole_preds))
whole_rmse
# 22.64079703573876

In [None]:
# The same 6:3:1 blend (forest : bagging : tree) applied within each RUL band,
# followed by the test RMSE of each blended prediction.
w_rfr, w_br, w_dtr = 6, 3, 1

preds_1st = rfr_pred_1st * w_rfr / 10 + br_pred_1st * w_br / 10 + dtr_pred_1st * w_dtr / 10
preds_2nd = rfr_pred_2nd * w_rfr / 10 + br_pred_2nd * w_br / 10 + dtr_pred_2nd * w_dtr / 10
preds_3rd = rfr_pred_3rd * w_rfr / 10 + br_pred_3rd * w_br / 10 + dtr_pred_3rd * w_dtr / 10

rmse_1st = np.sqrt(mean_squared_error(y_test_1st, preds_1st))
rmse_2nd = np.sqrt(mean_squared_error(y_test_2nd, preds_2nd))
rmse_3rd = np.sqrt(mean_squared_error(y_test_3rd, preds_3rd))

print(rmse_1st, rmse_2nd, rmse_3rd)

In [None]:
# One frame per band holding the blended prediction next to the true RUL; each frame
# takes its row labels from the corresponding y_test series.
column_order = ['preds', 'y_test']
preds_1st_df = y_test_1st.to_frame('y_test').assign(preds=preds_1st)[column_order]
preds_2nd_df = y_test_2nd.to_frame('y_test').assign(preds=preds_2nd)[column_order]
preds_3rd_df = y_test_3rd.to_frame('y_test').assign(preds=preds_3rd)[column_order]
print(preds_1st_df.shape, preds_2nd_df.shape, preds_3rd_df.shape)
preds_1st_df.head()

In [None]:
# Stack the three per-band frames into one and order the rows by their index label.
band_frames = [preds_1st_df, preds_2nd_df, preds_3rd_df]
preds_123_df = pd.concat(band_frames).sort_index()
print(preds_123_df.shape)
preds_123_df.head()

In [None]:
# Actual against blended prediction: whole-data blend (left) and the three per-band
# blends combined (right).
fig, ax = plt.subplots(ncols=2, figsize=(12, 4))
left_panel, right_panel = ax

sns.regplot(x=whole_preds, y=y_test, ax=left_panel)
sns.regplot(x=preds_123_df['preds'], y=preds_123_df['y_test'], ax=right_panel)

plt.show()

# 11. Stacking of whole data models

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds, random_state=2404):
    """Build the stacking inputs produced by one base model.

    The function was previously defined twice in a row; only the second definition
    (KFold seed 2404) was ever in effect, so the two are merged here and that seed
    is kept as the default.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict``
        Base model. It is refitted in place on every fold, so after the call it is
        left fitted on the training part of the last fold.
    X_train_n, y_train_n : array-like
        Training features and target. DataFrames / Series are accepted and are
        converted to NumPy arrays.
    X_test_n : array-like
        Test features.
    n_folds : int
        Number of shuffled K-fold splits.
    random_state : int, default 2404
        Seed for the K-fold shuffle.

    Returns
    -------
    train_fold_pred : ndarray of shape (n_train, 1)
        Out-of-fold predictions: each training row is predicted by the model
        fitted on the folds that do not contain it.
    test_pred_mean : ndarray of shape (n_test, 1)
        Test predictions averaged over the ``n_folds`` fitted models.
    """
    # The fold indices from KFold are positional, so index plain arrays rather than
    # label-indexed pandas objects.
    X_train_n = np.asarray(X_train_n)
    y_train_n = np.asarray(y_train_n)
    X_test_n = np.asarray(X_test_n)

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, 'model start')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):

        print('\t fold set: ', folder_counter, 'start')
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]

        model.fit(X_tr, y_tr)

        # Out-of-fold prediction for the held-out rows of this fold.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)

        # One column of test predictions per fold; averaged after the loop.
        test_pred[:, folder_counter] = model.predict(X_test_n)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [None]:
# NumPy versions of the whole-data split, as used by the fold-indexing helper.
X_train_n, X_test_n, y_train_n = X_train.to_numpy(), X_test.to_numpy(), y_train.to_numpy()

# 10-fold out-of-fold train predictions and fold-averaged test predictions per base model.
rfr_train, rfr_test = get_stacking_base_datasets(rfr, X_train_n, y_train_n, X_test_n, n_folds=10)
br_train, br_test = get_stacking_base_datasets(br, X_train_n, y_train_n, X_test_n, n_folds=10)
dtr_train, dtr_test = get_stacking_base_datasets(dtr, X_train_n, y_train_n, X_test_n, n_folds=10)

In [None]:
# stacking
# One column per base model: out-of-fold predictions for training the meta model,
# fold-averaged predictions for testing it.
Stack_final_X_train = np.hstack((rfr_train, br_train, dtr_train))
Stack_final_X_test = np.hstack((rfr_test, br_test, dtr_test))

# final meta model
from sklearn.linear_model import Lasso
meta_model_lasso = Lasso(alpha=0.0005, random_state=2404)

# final stacking model: fit on the stacked train predictions, score on the stacked test ones
final = meta_model_lasso.fit(Stack_final_X_train, y_train).predict(Stack_final_X_test)
rmse = np.sqrt(mean_squared_error(y_test, final))

print('RMSE of the final stacking model:', rmse)

# stacking RMSE 22.816223769870685 vs. weight-balanced model (WBM) RMSE 22.64079703573876
# to get higher accuracy (a lower RMSE), add more base models to the stacking process

> Please let me know your comments. Feedback is welcome and highly appreciated.