# Introduction
***
Notebook **#2-ensemble_modelling_daun_singkong_eksperimen_1** mengimplementasikan tahapan yang diusulkan pada paper acuan.

# Import Libraries

In [1]:
import pandas as pd
import numpy as np

# sklearn for utilization
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold

# modelling for ensemble method
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# visualization purpose
import matplotlib.pyplot as plt

# utils
import pickle

#
from models.model_ensembles import EnsembleStacking

# Load Data After Preprocessing

In [2]:
# Load the preprocessed train/test split.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
# The context manager closes the file handle deterministically instead of leaking it.
with open('../dataset/data_daun_singkong_after_preprocessing.pkl', 'rb') as dataset_file:
    dataset_daun_singkong_after_preprocessing = pickle.load(dataset_file)

# Unpack features (X) and labels (y) for each split.
X_train = dataset_daun_singkong_after_preprocessing['training']['X']
y_train = dataset_daun_singkong_after_preprocessing['training']['y']
X_test = dataset_daun_singkong_after_preprocessing['testing']['X']
y_test = dataset_daun_singkong_after_preprocessing['testing']['y']

In [3]:
# sampling data
# X_train = X_train[:1000]
# X_test = X_test[:1000]
# y_train = y_train[:1000]
# y_test = y_test[:1000]

# Defining Global Variables 

In [4]:
# Stratified 5-fold splitter handed to EnsembleStacking; shuffling uses a fixed seed
kfold = StratifiedKFold(n_splits=5, random_state=45, shuffle=True)

# global variables for MLP
# (all four are forwarded unchanged to MLPClassifier when the ANN meta-model is built)
max_iter=400 
batch_size=32 
verbose=0
n_iter_no_change=20

# Defining Global Functions

In [5]:
def create_new_input_training_features(ensemble_classifiers, X, y):
    """Build the meta-model (ANN) input table from the base models' majority votes.

    For every model family in ``ensemble_classifiers``, each sub-model predicts
    on ``X``; the row-wise mode of those predictions becomes one feature column
    named ``majority_vote_from_<model_name>``.

    Parameters
    ----------
    ensemble_classifiers : dict
        ``{model_name: {sub_model_name: {'train': estimator, ...}}}`` where each
        estimator exposes ``predict``. Families with no sub-models are skipped.
    X : array-like
        Features passed to each estimator's ``predict``.
    y : array-like
        Labels for the rows of ``X``. They are attached by position, so a
        pandas Series with a non-default index is handled correctly.

    Returns
    -------
    pandas.DataFrame
        One majority-vote column per non-empty model family plus a
        ``ground_truth`` column.

    Raises
    ------
    ValueError
        If no family contains a model, or if ``y`` does not have one label per
        predicted row.
    """
    list_of_majority_voting_each_models = list()
    list_of_models = list()

    for model_name, models in ensemble_classifiers.items():
        print("\t\t\t", model_name.upper())

        # voting scenario for new data input preparation for ANN model
        if len(models) == 0:
            continue

        sub_model_predictions = {
            sub_model_name: dict_models['train'].predict(X)
            for sub_model_name, dict_models in models.items()
        }

        # One column per sub-model; mode(axis=1) can return several columns when
        # votes tie, so column 0 is kept as the single voted label per row.
        model_df_voting = pd.DataFrame(sub_model_predictions).mode(axis=1)[0]
        list_of_models.append('majority_vote_from_' + model_name)
        list_of_majority_voting_each_models.append(model_df_voting)

    if not list_of_majority_voting_each_models:
        raise ValueError("ensemble_classifiers contains no trained models to vote with")

    new_input_features = pd.concat(list_of_majority_voting_each_models, axis=1)
    new_input_features.columns = list_of_models

    # Attach labels positionally: assigning a Series would align on its index and
    # silently produce NaN / shuffled labels when that index is not 0..n-1.
    ground_truth = np.array(y).ravel()
    if len(ground_truth) != len(new_input_features):
        raise ValueError(
            "y has %d labels but %d rows were predicted"
            % (len(ground_truth), len(new_input_features))
        )
    new_input_features['ground_truth'] = ground_truth

    return new_input_features

# Modelling

## Create Stacking ML Ensemble from 5-fold cross-validation

In [6]:
# Build the stacking ensemble from the train/test split and the CV splitter, then train it.
# Later cells index the result as ensemble_classifiers[model_name][sub_model_name]['train'].
# NOTE(review): EnsembleStacking comes from models.model_ensembles and is not shown here — confirm the exact return schema.
ensemble_classifiers = EnsembleStacking(X_train, y_train, X_test, y_test, kfold).train_ensemble()

In [7]:
# Summarise the scores of every base-model family in one table.
# Each family's dict is transposed so that sub-models become rows, then ranked
# by testing score (best first).
per_family_results = [
    pd.DataFrame(sub_models).transpose().sort_values(by=['testing'], ascending=False)
    for sub_models in ensemble_classifiers.values()
    if len(sub_models) != 0  # an empty family has no 'testing' column to sort by
]

# concat all dataframe results
ensemble_classifiers_results = pd.concat(per_family_results, axis=0)
ensemble_classifiers_results

Unnamed: 0,train,training,validation,testing
model-5,SVC(),0.625939,0.622371,0.625389
model-4,SVC(),0.626022,0.624374,0.625078
model-3,SVC(),0.625104,0.622705,0.624455
model-1,SVC(),0.624238,0.622497,0.623209
model-2,SVC(),0.623237,0.624499,0.622897
model-4,LogisticRegression(solver='newton-cg'),0.625438,0.626377,0.625545
model-3,LogisticRegression(solver='newton-cg'),0.625772,0.62571,0.624766
model-5,LogisticRegression(solver='newton-cg'),0.626273,0.624708,0.624455
model-1,LogisticRegression(solver='newton-cg'),0.626826,0.623164,0.623832
model-2,LogisticRegression(solver='newton-cg'),0.624405,0.626502,0.623832


## Create new input Training from the stacking ensemble ML

In [8]:
# Turn the base models' majority votes on the training set into the ANN's input table
new_input_training_features = create_new_input_training_features(
    ensemble_classifiers=ensemble_classifiers,
    X=X_train,
    y=y_train,
)

			 SVM
			 LOGREG
			 NAIVE_BAYES
			 DECISION_TREE


In [9]:
# Display the stacked feature table: one majority-vote column per base-model family plus ground_truth
new_input_training_features

Unnamed: 0,majority_vote_from_svm,majority_vote_from_logreg,majority_vote_from_naive_bayes,majority_vote_from_decision_tree,ground_truth
0,3.0,3.0,3,4,4
1,3.0,3.0,3,1,1
2,3.0,3.0,3,1,1
3,3.0,3.0,3,4,4
4,3.0,3.0,3,3,3
...,...,...,...,...,...
14972,3.0,3.0,3,3,3
14973,3.0,3.0,3,3,3
14974,3.0,4.0,3,1,1
14975,3.0,3.0,3,3,3


## Feed New Input features into ANN

In [10]:
# split X and y from new_input_features before feeding to ANN
new_X_train = new_input_training_features.drop(['ground_truth'], axis=1)
new_y_train = new_input_training_features['ground_truth']

# feed new X and new y into ANN
# random_state reuses the CV splitter's seed so that weight initialisation and
# batch shuffling are reproducible under Restart & Run All.
ann_model = MLPClassifier(
    max_iter=max_iter,
    batch_size=batch_size,
    verbose=verbose,
    n_iter_no_change=n_iter_no_change,
    random_state=kfold.random_state,
)
ann_model.fit(new_X_train, new_y_train)

# NOTE(review): this score is computed on the same rows the ANN was fitted on, and the
# vote features come from base models predicting on X_train, which was also passed to
# EnsembleStacking — treat it as an optimistic sanity check, not a generalisation estimate.
predicted_ann_train = ann_model.predict(new_X_train)

print()
print()
print("Final Accuracy Score [Mls Ensembles + ANN] on Training Data = ", accuracy_score(new_y_train, predicted_ann_train))



Final Accuracy Score [Mls Ensembles + ANN] on Training Data =  1.0


# Save Stacking Model Pretrained (MLs + ANN)

In [11]:
# save trained ensemble stacking classifiers
# Bundle the base models, the ANN meta-model and the exact data split they were
# trained/evaluated on, so the experiment can be reloaded as a single artifact.
model_ensemble_experiment_1 = {
    'stacking_ensembles': ensemble_classifiers,
    'ann': ann_model,
    'data': {
        'training': {
            'X': X_train,
            'y': y_train
        },
        'testing': {
            'X': X_test,
            'y': y_test
        }
    }
}

# The context manager guarantees the file is flushed and closed once the dump finishes.
with open("model_ensemble_experiment_results/model_ensemble_eksperimen_1.pkl", 'wb') as model_file:
    pickle.dump(model_ensemble_experiment_1, model_file)