# Introduction
***
Notebook **#2-ensemble_modelling_daun_singkong_eksperimen_1** mengimplementasikan tahapan-tahapan yang diusulkan pada paper acuan.

# Import Libraries

In [1]:
import pandas as pd
import numpy as np

# sklearn for utilization
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold

# modelling for ensemble method
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# visualization purpose
import matplotlib.pyplot as plt

# utils
import pickle

#
from models.model_ensembles import EnsembleStacking

# Load Data After Preprocessing

In [2]:
# Load the preprocessed dataset and unpack its training / testing splits.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
# The context manager closes the file handle as soon as the payload has been read.
with open('../dataset/data_daun_singkong_after_preprocessing.pkl', 'rb') as dataset_file:
    dataset_daun_singkong_after_preprocessing = pickle.load(dataset_file)

X_train = dataset_daun_singkong_after_preprocessing['training']['X']
y_train = dataset_daun_singkong_after_preprocessing['training']['y']
X_test = dataset_daun_singkong_after_preprocessing['testing']['X']
y_test = dataset_daun_singkong_after_preprocessing['testing']['y']

In [3]:
# sampling data
# X_train = X_train[:1000]
# X_test = X_test[:1000]
# y_train = y_train[:1000]
# y_test = y_test[:1000]

# Defining Global Variables 

In [4]:
# Shuffled, stratified 5-fold splitter; the fixed seed keeps the fold assignment repeatable.
kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=45)

# Hyperparameters passed to the MLP meta-learner further down.
max_iter = 400
batch_size = 32
n_iter_no_change = 20
verbose = 1

# Defining Global Functions

In [5]:
def create_new_input_features(ensemble_classifiers, X, y):
    """Build the stacked feature table that is fed to the meta-learner.

    Every sub-model of every classifier group predicts on ``X``; each prediction
    vector becomes one column named ``"<group>_<sub_model>"``. No voting or
    aggregation is applied: the raw per-model predictions are kept side by side.

    Parameters
    ----------
    ensemble_classifiers : mapping
        ``{group_name: {sub_model_name: {'train': fitted_estimator, ...}}}``.
        Groups with no sub-models are skipped (their name is still printed).
    X : array-like
        Samples handed unchanged to each estimator's ``predict``.
    y : array-like
        Ground-truth labels in the same row order as ``X``.

    Returns
    -------
    pandas.DataFrame
        One column per sub-model plus a trailing ``'ground_truth'`` column.

    Raises
    ------
    ValueError
        If no group contains any sub-model, or if ``y`` does not have one label
        per predicted row.
    """
    prediction_frames = []

    for model_name, models in ensemble_classifiers.items():
        print("\t\t\t", model_name.upper())

        if not models:
            continue

        # One column per sub-model, named after its group and its own name.
        predictions = {
            model_name + "_" + sub_model_name: dict_models['train'].predict(X)
            for sub_model_name, dict_models in models.items()
        }
        prediction_frames.append(pd.DataFrame(predictions))

    if not prediction_frames:
        raise ValueError("ensemble_classifiers contains no trained sub-models to predict with")

    new_input_features = pd.concat(prediction_frames, axis=1)

    # Attach labels by position. Assigning a pandas Series directly would align on
    # its index, which silently scrambles (or NaN-fills) the labels whenever `y`
    # carries a non-default index, e.g. after a shuffled split. np.array also copies,
    # so the caller's `y` is never shared with the returned frame.
    new_input_features['ground_truth'] = np.array(y)

    return new_input_features

# Modelling

## Create Stacking ML Ensemble from 5-fold cross-validation

In [6]:
# Build the stacking ensemble with the project-local EnsembleStacking helper, passing the
# train/test splits and the shared 5-fold splitter. The result is indexed by classifier name below.
ensemble_classifiers = EnsembleStacking(X_train, y_train, X_test, y_test, kfold).train_ensemble()

In [7]:
# Inspect the entry stored under the 'svm' key of the ensemble result.
ensemble_classifiers['svm']

{'model-1': {'train': SVC(),
  'training': 0.6242383774309324,
  'validation': 0.6224966622162884,
  'testing': 0.623208722741433},
 'model-2': {'train': SVC(),
  'training': 0.6232367915866789,
  'validation': 0.6244993324432577,
  'testing': 0.6228971962616823},
 'model-3': {'train': SVC(),
  'training': 0.6251043231513937,
  'validation': 0.6227045075125208,
  'testing': 0.6244548286604361},
 'model-4': {'train': SVC(),
  'training': 0.6260223668836589,
  'validation': 0.6243739565943238,
  'testing': 0.6250778816199377},
 'model-5': {'train': SVC(),
  'training': 0.6259389083625438,
  'validation': 0.6223706176961603,
  'testing': 0.6253894080996885}}

## Create new input Training from the stacking ensemble ML

In [8]:
# Collect every base model's predictions on the training split into a single feature table.
new_input_training_features = create_new_input_features(ensemble_classifiers, X_train, y_train)

			 SVM
			 LOGREG
			 NAIVE_BAYES
			 DECISION_TREE


In [9]:
# Display the stacked feature table (one column per base sub-model plus 'ground_truth').
new_input_training_features

Unnamed: 0,svm_model-1,svm_model-2,svm_model-3,svm_model-4,svm_model-5,logreg_model-1,logreg_model-2,logreg_model-3,logreg_model-4,logreg_model-5,...,naive_bayes_model-2,naive_bayes_model-3,naive_bayes_model-4,naive_bayes_model-5,decision_tree_model-1,decision_tree_model-2,decision_tree_model-3,decision_tree_model-4,decision_tree_model-5,ground_truth
0,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,4,3,4,4,4,4
1,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,1,1,1,1,1,1
2,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,0,1,1,1,1,1
3,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,4,1,4,4,4,4
4,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14972,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,4,3,3,3
14973,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
14974,3,3,3,3,3,4,4,4,4,4,...,3,3,3,3,1,1,1,3,1,1
14975,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3


## Feed New Input features into ANN

In [10]:
# Separate the stacked base-model predictions (features) from the ground-truth labels (target).
new_X_train = new_input_training_features.drop(columns=['ground_truth'])
new_y_train = new_input_training_features['ground_truth']

# Fit the ANN meta-learner on the stacked predictions.
# random_state pins weight initialisation and mini-batch shuffling so that a
# Restart & Run All reproduces the same model; 45 matches the seed used for `kfold`.
ann_model = MLPClassifier(
    max_iter=max_iter,
    batch_size=batch_size,
    verbose=verbose,
    n_iter_no_change=n_iter_no_change,
    random_state=45,
)
ann_model.fit(new_X_train, new_y_train)

# Predictions on the very rows the ANN was fitted on: the score printed below is
# training accuracy and therefore an optimistic estimate.
predicted_ann_train = ann_model.predict(new_X_train)

print()
print()
print("Final Accuracy Score [Mls Ensembles + ANN] on Training Data = ", accuracy_score(new_y_train, predicted_ann_train))

Iteration 1, loss = 0.49697949
Iteration 2, loss = 0.16508487
Iteration 3, loss = 0.10228276
Iteration 4, loss = 0.07358428
Iteration 5, loss = 0.05437304
Iteration 6, loss = 0.04204114
Iteration 7, loss = 0.03268194
Iteration 8, loss = 0.02596511
Iteration 9, loss = 0.02101593
Iteration 10, loss = 0.01666782
Iteration 11, loss = 0.01353900
Iteration 12, loss = 0.01112027
Iteration 13, loss = 0.00961820
Iteration 14, loss = 0.00778918
Iteration 15, loss = 0.00655323
Iteration 16, loss = 0.00612966
Iteration 17, loss = 0.00507928
Iteration 18, loss = 0.00493980
Iteration 19, loss = 0.00362086
Iteration 20, loss = 0.00462912
Iteration 21, loss = 0.00296682
Iteration 22, loss = 0.00251467
Iteration 23, loss = 0.00281206
Iteration 24, loss = 0.00239468
Iteration 25, loss = 0.00196013
Iteration 26, loss = 0.00187516
Iteration 27, loss = 0.00151923
Iteration 28, loss = 0.00121013
Iteration 29, loss = 0.00112605
Iteration 30, loss = 0.00360007
Iteration 31, loss = 0.00085908
Iteration 32, los

# Save Stacking Model Pretrained (MLs + ANN)

In [11]:
# Bundle the trained base classifiers, the ANN meta-learner and the data splits they were built from.
# NOTE(review): the notebook title says "eksperimen_1" while this variable and the output file are
# labelled experiment 3 — confirm the target so another experiment's artefact is not overwritten.
model_ensemble_experiment_3 = {
    'stacking_ensembles': ensemble_classifiers,
    'ann': ann_model,
    'data': {
        'training': {
            'X': X_train,
            'y': y_train
        },
        'testing': {
            'X': X_test,
            'y': y_test
        }
    }
}

# The context manager flushes and closes the file even if pickling raises part-way through.
with open("model_ensemble_experiment_results/model_ensemble_eksperimen_3.pkl", 'wb') as model_file:
    pickle.dump(model_ensemble_experiment_3, model_file)