## Import Libraries

In [18]:
import pandas as pd
import numpy as np

# sklearn for utilization
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# modelling for ensemble method
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# visualization purpose
import matplotlib.pyplot as plt

# utils
import pickle

## Data Preparation

In [19]:
daun_singkong_lbp_df = pd.read_csv('../../dataset/Fitur_LBPuniform_Cassava Leaf.csv', header=None, na_values=np.nan)
daun_singkong_lbp_df = daun_singkong_lbp_df.copy().sample(5000)

In [20]:
daun_singkong_lbp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
12701,8122,4655,2891,1965,1361,1148,934,871,915,950,...,1001,1049,1252,1457,1984,2885,4111,7988,64966,3
12917,7309,4240,2406,1624,1444,1309,1202,1264,1243,1462,...,1528,1491,1728,1838,2137,2805,3608,7018,56743,4
7818,7181,4000,2823,2050,1630,1369,1325,1390,1362,1462,...,1518,1614,1577,1873,2156,2823,3594,7569,58342,1
5537,8347,4875,2657,1491,1122,822,717,804,727,846,...,889,860,1046,1248,1801,2720,4532,8523,66538,3
17128,8304,4950,2778,1725,1184,871,796,778,778,853,...,953,950,1187,1369,1868,2829,4292,7898,67263,3


In [21]:
# dataset splitting 
X, y = daun_singkong_lbp_df.drop([26], axis=1), daun_singkong_lbp_df[26]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
12701,8122,4655,2891,1965,1361,1148,934,871,915,950,...,944,1001,1049,1252,1457,1984,2885,4111,7988,64966
12917,7309,4240,2406,1624,1444,1309,1202,1264,1243,1462,...,1438,1528,1491,1728,1838,2137,2805,3608,7018,56743
7818,7181,4000,2823,2050,1630,1369,1325,1390,1362,1462,...,1536,1518,1614,1577,1873,2156,2823,3594,7569,58342
5537,8347,4875,2657,1491,1122,822,717,804,727,846,...,840,889,860,1046,1248,1801,2720,4532,8523,66538
17128,8304,4950,2778,1725,1184,871,796,778,778,853,...,983,953,950,1187,1369,1868,2829,4292,7898,67263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8548,8314,4765,2814,1676,1192,927,775,827,865,931,...,959,951,1013,1112,1383,1882,2673,4448,8332,67189
15260,9556,5638,2608,1342,858,641,548,469,492,458,...,505,543,583,781,999,1497,2724,4974,8727,72437
7468,7304,3807,2714,2012,1759,1614,1528,1488,1418,1619,...,1538,1523,1646,1875,2166,2439,2836,3552,7351,55628
20977,7505,4321,2709,1879,1452,1157,1049,1045,1100,1163,...,1163,1175,1175,1385,1589,2115,2969,3997,7215,63273


In [23]:
y

12701    3
12917    4
7818     1
5537     3
17128    3
        ..
8548     3
15260    3
7468     3
20977    3
628      3
Name: 26, Length: 5000, dtype: int64

## Defining Global Variables

In [25]:
kfold = StratifiedKFold(n_splits=5, random_state=45, shuffle=True)

# Data Preprocesssing

## Data Normalization

In [27]:
#using min-max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
X_train

array([[0.73190943, 0.67247525, 0.56345776, ..., 0.62600253, 0.24345372,
        0.80260028],
       [0.54516807, 0.44277228, 0.57328094, ..., 0.51730688, 0.21915016,
        0.60391131],
       [0.62710084, 0.48554455, 0.74774067, ..., 0.48860279, 0.21434171,
        0.61133809],
       ...,
       [0.66328198, 0.61722772, 0.78624754, ..., 0.45272267, 0.2425652 ,
        0.66370685],
       [0.63340336, 0.59524752, 0.64715128, ..., 0.63655551, 0.23284378,
        0.72664379],
       [0.64927638, 0.63069307, 0.64047151, ..., 0.61756015, 0.26216485,
        0.76566614]])

### Label Encoding Target

In [29]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Modelling

## Create Stacking ML Ensemble from 5-fold cross-validation

In [30]:
ensemble_classifiers = {
    'svm' : dict(),
    'logreg': dict(),
    'naive_bayes': dict(),
    'decision_tree': dict()
}

for idx, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
    
    # split training set into train and val set
    X_latih, X_validasi = X_train[train_index], X_train[val_index]
    y_latih, y_validasi = y_train[train_index], y_train[val_index]
    
    # train 5 model of SVM
    svm = SVC()
    svm.fit(X_latih, y_latih)
    predicted_svm = svm.predict(X_validasi)
    
    ensemble_classifiers['svm']['model-'+str(idx+1)] = {
        'train':svm,
        'validation': accuracy_score(y_validasi, predicted_svm)
    }
    
    # train 5 model of Naive Bayes
    naive_bayes = MultinomialNB()
    naive_bayes.fit(X_latih, y_latih)
    predicted_naive_bayes = naive_bayes.predict(X_validasi)
    
    ensemble_classifiers['naive_bayes']['model-'+str(idx+1)] = {
        'train':naive_bayes,
        'validation': accuracy_score(y_validasi, predicted_naive_bayes)
    }
    
    # train 5 model of Decision Tree
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_latih, y_latih)
    predicted_decision_tree = decision_tree.predict(X_validasi)
    
    ensemble_classifiers['decision_tree']['model-'+str(idx+1)] = {
        'train':decision_tree,
        'validation': accuracy_score(y_validasi, predicted_decision_tree)
    }
    
    # train 5 model of logReg
    log_reg = LogisticRegression(solver='newton-cg')
    log_reg.fit(X_latih, y_latih)
    predicted_log_reg = log_reg.predict(X_validasi)
    
    ensemble_classifiers['logreg']['model-'+str(idx+1)] = {
        'train':log_reg,
        'validation': accuracy_score(y_validasi, predicted_log_reg)
    }

In [31]:
ensemble_classifiers

{'svm': {'model-1': {'train': SVC(), 'validation': 0.6228571428571429},
  'model-2': {'train': SVC(), 'validation': 0.6285714285714286},
  'model-3': {'train': SVC(), 'validation': 0.62},
  'model-4': {'train': SVC(), 'validation': 0.6157142857142858},
  'model-5': {'train': SVC(), 'validation': 0.6214285714285714}},
 'logreg': {'model-1': {'train': LogisticRegression(solver='newton-cg'),
   'validation': 0.6285714285714286},
  'model-2': {'train': LogisticRegression(solver='newton-cg'),
   'validation': 0.6242857142857143},
  'model-3': {'train': LogisticRegression(solver='newton-cg'),
   'validation': 0.6271428571428571},
  'model-4': {'train': LogisticRegression(solver='newton-cg'),
   'validation': 0.6185714285714285},
  'model-5': {'train': LogisticRegression(solver='newton-cg'),
   'validation': 0.6185714285714285}},
 'naive_bayes': {'model-1': {'train': MultinomialNB(),
   'validation': 0.6114285714285714},
  'model-2': {'train': MultinomialNB(), 'validation': 0.6114285714285714

## Training the stacking ensemble ML

In [32]:
all_predicted_results = dict()
list_of_majority_voting_each_models = list()

for model_name, models in ensemble_classifiers.items():
    print("\t\t\t", model_name.upper())
    
    # voting scenario for data training input prepration for ANN model
    if len(models)!=0:
        all_predicted_results[model_name] = dict()
        for sub_model_name, dict_models in models.items():
            print('\t\t Training Model {} using {}'.format(model_name.upper(), sub_model_name))
            all_predicted_results[model_name][sub_model_name] = dict_models['train'].predict(X_train)
        
        # make dataframe for 5 model prediction results on X_train and get the mode label for that 5 prediction
        model_df_voting = pd.DataFrame(all_predicted_results[model_name]).mode(axis=1)[0]
        model_df_voting.columns = ['majority_vote_from_'+model_name]
        list_of_majority_voting_each_models.append(model_df_voting)

			 SVM
		 Training Model SVM using model-1
		 Training Model SVM using model-2
		 Training Model SVM using model-3
		 Training Model SVM using model-4
		 Training Model SVM using model-5
			 LOGREG
		 Training Model LOGREG using model-1
		 Training Model LOGREG using model-2
		 Training Model LOGREG using model-3
		 Training Model LOGREG using model-4
		 Training Model LOGREG using model-5
			 NAIVE_BAYES
		 Training Model NAIVE_BAYES using model-1
		 Training Model NAIVE_BAYES using model-2
		 Training Model NAIVE_BAYES using model-3
		 Training Model NAIVE_BAYES using model-4
		 Training Model NAIVE_BAYES using model-5
			 DECISION_TREE
		 Training Model DECISION_TREE using model-1
		 Training Model DECISION_TREE using model-2
		 Training Model DECISION_TREE using model-3
		 Training Model DECISION_TREE using model-4
		 Training Model DECISION_TREE using model-5


In [33]:
new_input_training_features = pd.concat(list_of_majority_voting_each_models, axis=1)
new_input_training_features['ground_truth'] = y_train.copy()

In [39]:
new_input_training_features

Unnamed: 0,0,0.1,0.2,0.3,ground_truth
0,3,3.0,3,3,3
1,3,3.0,3,2,2
2,3,3.0,3,3,3
3,3,3.0,3,2,2
4,3,3.0,3,3,3
...,...,...,...,...,...
3495,3,3.0,3,3,3
3496,3,3.0,3,3,3
3497,3,3.0,3,3,3
3498,3,3.0,3,3,3


## Feature Selection

In [47]:
data_new = new_input_training_features.loc[:, new_input_training_features.std(axis=0) > 0.75]

In [48]:
data_new

Unnamed: 0,0,ground_truth
0,3,3
1,2,2
2,3,3
3,2,2
4,3,3
...,...,...
3495,3,3
3496,3,3
3497,3,3
3498,3,3


## Feed New Input features into ANN

In [49]:
# split X and y from new_input_features before feeding to ANN
new_X_train, new_y_train = data_new.drop(['ground_truth'],axis=1), data_new['ground_truth']
# new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=45)

# feed new X and new y into ANN
ann_model = MLPClassifier(max_iter=400)
ann_model.fit(new_X_train, new_y_train)

predicted_ann_train = ann_model.predict(new_X_train)

In [50]:
accuracy_score(new_y_train, predicted_ann_train)

1.0

# Testing Model

In [51]:
all_predicted_results = dict()
list_of_majority_voting_each_models = list()

for model_name, models in ensemble_classifiers.items():
    print("\t\t\t", model_name.upper())
    
    # voting scenario for data testing input prepration for ANN model
    if len(models)!=0:
        all_predicted_results[model_name] = dict()
        for sub_model_name, dict_models in models.items():
            print('\t\t Testing Model {} using {}'.format(model_name.upper(), sub_model_name))
            
            all_predicted_results[model_name][sub_model_name] = dict_models['train'].predict(X_test)
        
        # make dataframe for 5 model prediction results on X_test and get the mode label for that 5 prediction
        model_df_voting = pd.DataFrame(all_predicted_results[model_name]).mode(axis=1)[0]
        model_df_voting.columns = ['majority_vote_from_'+model_name]
        list_of_majority_voting_each_models.append(model_df_voting)

			 SVM
		 Testing Model SVM using model-1
		 Testing Model SVM using model-2
		 Testing Model SVM using model-3
		 Testing Model SVM using model-4
		 Testing Model SVM using model-5
			 LOGREG
		 Testing Model LOGREG using model-1
		 Testing Model LOGREG using model-2
		 Testing Model LOGREG using model-3
		 Testing Model LOGREG using model-4
		 Testing Model LOGREG using model-5
			 NAIVE_BAYES
		 Testing Model NAIVE_BAYES using model-1
		 Testing Model NAIVE_BAYES using model-2
		 Testing Model NAIVE_BAYES using model-3
		 Testing Model NAIVE_BAYES using model-4
		 Testing Model NAIVE_BAYES using model-5
			 DECISION_TREE
		 Testing Model DECISION_TREE using model-1
		 Testing Model DECISION_TREE using model-2
		 Testing Model DECISION_TREE using model-3
		 Testing Model DECISION_TREE using model-4
		 Testing Model DECISION_TREE using model-5


In [52]:
new_input_testing_features = pd.concat(list_of_majority_voting_each_models, axis=1)
new_input_testing_features['ground_truth'] = y_test.copy()

In [53]:
new_input_testing_features

Unnamed: 0,0,0.1,0.2,0.3,ground_truth
0,3,3.0,3.0,3.0,2
1,3,3.0,3.0,3.0,4
2,3,3.0,3.0,1.0,2
3,3,3.0,3.0,3.0,2
4,3,3.0,3.0,0.0,4
...,...,...,...,...,...
1495,3,3.0,3.0,4.0,4
1496,3,3.0,3.0,3.0,1
1497,3,3.0,3.0,3.0,3
1498,3,3.0,3.0,2.0,2


## Feature Selection

### Variance threshold

In [112]:
def variance_threshold_select(new_input_testing_features, thresh=0.0, na_replacement=-999):
    df1 = new_input_testing_features.copy(deep=True) # Make a deep copy of the dataframe
    selector = VarianceThreshold(thresh)
    selector.fit(new_input_testing_features.fillna(na_replacement)) # Fill NA values as VarianceThreshold cannot deal with those
    df2 = new_input_testing_features.loc[:,selector.get_support(indices=False)] # Get new dataframe with columns deleted that have NA values

    return df2

In [117]:
data_testing = variance_threshold_select(data_testing, 0.75)
data_testing



Unnamed: 0,0,ground_truth
0,3.0,2
1,3.0,4
2,1.0,2
3,3.0,2
4,0.0,4
...,...,...
1495,4.0,4
1496,3.0,1
1497,3.0,3
1498,2.0,2


In [118]:
# split X and y from data_testing before feeding to ANN
new_X_test, new_y_test = data_testing.drop(['ground_truth'],axis=1), data_testing['ground_truth']

# predict new X test using pre-trained ANN before
predicted_ann_testing = ann_model.predict(new_X_test)

In [119]:
accuracy_score(new_y_test, predicted_ann_testing)

0.558

In [120]:
print(classification_report(new_y_test, predicted_ann_testing))

              precision    recall  f1-score   support

           0       0.11      0.09      0.10        93
           1       0.23      0.18      0.20       154
           2       0.18      0.14      0.16       165
           3       0.69      0.81      0.75       923
           4       0.32      0.21      0.26       165

    accuracy                           0.56      1500
   macro avg       0.31      0.28      0.29      1500
weighted avg       0.51      0.56      0.53      1500

