In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler, binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import shap
shap.initjs()

# Read Dataset

In [None]:
# Notebook configuration: which file/sheet to load and which optional steps to run.
dataset = 'Dataset.xlsx'
sheet_name = 'GoodDataset'
export_good_bad_dataset = False
outlier_removal = False

dataset_path = f'Dataset/{dataset}'

if dataset != 'Dataset.xlsx':
    # Any other file is read as CSV; its name is split as '<file>-<num>.<ext>'.
    name_parts = dataset.split('-')
    file = name_parts[0]
    num = name_parts[1].split('.')[0]
    df = pd.read_csv(dataset_path)
else:
    # The Excel workbook is the starting point, so its iteration number is 0.
    num = 0
    df = pd.read_excel(dataset_path, sheet_name=sheet_name)

display(df)

print(df.shape)

# Class balance: label 0 is plotted as low sweetness, label 1 as high sweetness.
class_count = df['Class'].value_counts()
print(class_count)

bar_heights = [class_count[0], class_count[1]]
plt.bar(['Low Sweetness', 'High Sweetness'], bar_heights)
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

# Fill NaN with Mean

In [None]:
# Impute missing 'Roundness' and 'Max Frequency (Hz)' values with the column mean
# (Series.mean() skips NaN, so the mean is taken over the observed values only).
roundness_mean = df['Roundness'].mean()
max_frequency_mean = df['Max Frequency (Hz)'].mean()

print(f'Roundness Mean: {roundness_mean}')
print(f'Max Frequency (Hz) Mean: {max_frequency_mean}')

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form emits a FutureWarning on pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is enabled.
df['Roundness'] = df['Roundness'].fillna(roundness_mean)
df['Max Frequency (Hz)'] = df['Max Frequency (Hz)'].fillna(max_frequency_mean)

# Features and Target Selection

In [None]:
# Predictor columns used by every model; the 'Class' column is the target.
feature = [
    'Weight (g)',
    'Roundness',
    'Lower Petal',
    'Max Frequency (Hz)',
    'Max Magnitude',
]
X, y = df[feature], df['Class']

# Dataset Visualization

## Pearson Correlation

In [None]:
# Pairwise correlation (DataFrame.corr default) between the features and the measured sweetness.
correlation_columns = feature + ['Sweetness (%Brix)']
correlation_matrix = df[correlation_columns].corr()

plt.figure(figsize=(10, 10))
sns.heatmap(correlation_matrix, cmap="Blues", square=True, annot=True)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Save the same figure in both formats before showing it.
for extension in ('pdf', 'png'):
    plt.savefig(f'pearson_correlation.{extension}', dpi=600, bbox_inches='tight')

plt.show()

## T-SNE

In [None]:
# Standardise the features, then project them to two dimensions with t-SNE.
X_normalized = StandardScaler().fit_transform(X)

tsne_settings = dict(n_components=2,
                     learning_rate='auto',
                     init='random',
                     verbose=1,
                     perplexity=3,
                     random_state=42)
X_embedded = TSNE(**tsne_settings).fit_transform(X_normalized)

df_tsne = pd.DataFrame({'Dimension 1': X_embedded[:, 0],
                        'Dimension 2': X_embedded[:, 1],
                        'Class': y})

# Swap the numeric label for a readable legend entry (1 -> high, anything else -> low).
df_tsne['Class'] = ['High Sweetness' if label == 1 else 'Low Sweetness'
                    for label in df_tsne['Class']]

# Two contrasting colours taken from the RdBu palette.
rdbu = sns.color_palette('RdBu')
palette = [rdbu[1], rdbu[-1]]

plt.figure(figsize=(16, 10))
sns.scatterplot(data=df_tsne,
                x='Dimension 1',
                y='Dimension 2',
                hue='Class',
                palette=palette,
                s=200,
                legend="full")

plt.xlabel('Dimension 1', fontsize=20, labelpad=20)
plt.ylabel('Dimension 2', fontsize=20, labelpad=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.legend(loc='best', fontsize=20)

for extension in ('pdf', 'png'):
    plt.savefig(f't_sne.{extension}', dpi=600, bbox_inches='tight')

plt.show()

## Boxplot

In [None]:
# Box plot of every raw feature; outliers are drawn as hollow black circles.
outlier_marker = {
    'marker': 'o',
    'markersize': 10,
    'markerfacecolor': 'None',
    'markeredgecolor': 'black',
}

plt.figure(figsize=(16, 10))
sns.boxplot(data=X,
            palette=sns.color_palette('Blues'),
            width=0.4,
            flierprops=outlier_marker)

plt.xticks(fontsize=20, rotation=45)
plt.yticks(fontsize=20)

for extension in ('pdf', 'png'):
    plt.savefig(f'box_plot.{extension}', dpi=600, bbox_inches='tight')

plt.show()

# IQR

In [None]:
def remove_outlier(_df, column, whisker=1.5):
    """Return the rows of ``_df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``. Values
    lying exactly on a fence are kept, so a constant column (IQR == 0) keeps
    all of its rows instead of being emptied by a strict comparison.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used to detect outliers.
    whisker : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``_df`` within the fences, with their original index.
        Rows where ``column`` is NaN fail both comparisons and are dropped.
    """
    Q1, Q3 = _df[column].quantile(0.25), _df[column].quantile(0.75)

    IQR = Q3 - Q1

    lower_limit = Q1 - whisker * IQR
    upper_limit = Q3 + whisker * IQR

    # Inclusive bounds: a point sitting on the fence is not an outlier.
    within_fences = (_df[column] >= lower_limit) & (_df[column] <= upper_limit)

    return _df[within_fences]

In [None]:
# Filter one column at a time; each pass recomputes its fences on the rows that
# survived the previous pass. 'Lower Petal' is not filtered.
df_no_outlier = df
for outlier_column in ('Weight (g)', 'Roundness', 'Max Frequency (Hz)', 'Max Magnitude'):
    df_no_outlier = remove_outlier(df_no_outlier, outlier_column)

display(df_no_outlier)

In [None]:
# Same box plot as before, drawn on the IQR-filtered rows for comparison.
outlier_marker = {
    'marker': 'o',
    'markersize': 10,
    'markerfacecolor': 'None',
    'markeredgecolor': 'black',
}
filtered_feature = df_no_outlier[feature]

plt.figure(figsize=(16, 10))
sns.boxplot(data=filtered_feature,
            palette=sns.color_palette('Blues'),
            width=0.4,
            flierprops=outlier_marker)

plt.xticks(fontsize=20, rotation=45)
plt.yticks(fontsize=20)

plt.show()

In [None]:
# Optionally train on the IQR-filtered rows instead of the full dataset.
if outlier_removal:
    X, y = df_no_outlier[feature], df_no_outlier['Class']

# Hyperparameter

In [None]:
# Evaluation protocol reused by every grid search below and by the training loop:
# a seeded, shuffled 10-fold split plus the two scorers reported per candidate.
cross_validation = KFold(n_splits=10, shuffle=True, random_state=42)

score = dict(auc='roc_auc', accuracy='accuracy')

## KNN

In [None]:
# Search space for KNN; the 'model__' prefix routes each key to the pipeline's 'model' step.
KNN_parameter = {
    'model__leaf_size': [*range(1, 15)],
    'model__n_neighbors': [1, 3, 5, 7, 9],
    'model__p': np.arange(1.0, 3.0, 0.4),
    'model__weights': ['uniform', 'distance'],
    'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# Scaling lives inside the pipeline so it is re-fitted on each training fold.
standard_scaler = StandardScaler()
KNN = KNeighborsClassifier(n_jobs=-1)
pipeline = Pipeline(steps=[('standard_scaler', standard_scaler), ('model', KNN)])

# Both scorers are recorded; the candidate with the best accuracy is refitted on all data.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=KNN_parameter,
    scoring=score,
    refit='accuracy',
    cv=cross_validation,
    return_train_score=True,
    n_jobs=-1,
    verbose=0,
).fit(X, y)

print('KNN Best Estimator: ', grid_search.best_estimator_)
print('KNN Best Parameter: ', grid_search.best_params_)
print('KNN Best Score: ', grid_search.best_score_)

In [None]:
# Keep only the tuned classifier step; `grid_search` is overwritten by the next search.
best_pipeline = grid_search.best_estimator_
KNN = best_pipeline['model']
KNN

## SVM

In [None]:
# Tune an SVM (linear and RBF kernels) inside a scaling pipeline.
standard_scaler = StandardScaler()

# probability=True makes SVC fit an internal cross-validated calibration whose data
# shuffling is random; seeding it keeps predict_proba (used later for the ROC
# curves and for soft voting) reproducible between runs.
SVM = SVC(probability=True, random_state=42)
SVM_parameter = [
    {'model__kernel': ['linear'], 
     'model__C': np.arange(1.0, 3.0, 0.2)},
    {'model__kernel': ['rbf'], 
     'model__C': np.arange(1.0, 3.0, 0.2), 
     'model__gamma': list(range(1, 10))}
]

pipeline = Pipeline([('standard_scaler', standard_scaler), ('model', SVM)])

# Both scorers are recorded; the candidate with the best accuracy is refitted on all data.
grid_search = GridSearchCV(estimator=pipeline, 
                           param_grid=SVM_parameter, 
                           cv=cross_validation, 
                           scoring=score, 
                           refit='accuracy',
                           n_jobs=-1, 
                           verbose=0)
grid_search.fit(X, y)

print('SVM Best Estimator: ', grid_search.best_estimator_)
print('SVM Best Parameter: ', grid_search.best_params_)
print('SVM Best Score: ', grid_search.best_score_)

In [None]:
# Keep only the tuned classifier step; `grid_search` is overwritten by the next search.
best_pipeline = grid_search.best_estimator_
SVM = best_pipeline['model']
SVM

## Logistic Regression

In [None]:
# Tune logistic regression inside a scaling pipeline.
standard_scaler = StandardScaler()

# 'sag' and 'liblinear' shuffle the data internally; a fixed seed makes the
# search (and the extracted best model) reproducible between runs.
LR = LogisticRegression(random_state=42)
# NOTE(review): each dict is searched independently, so C is only varied with the
# default solver/penalty and the solvers are only compared at the default C.
# Confirm this is intended rather than a full solver x C grid.
LR_parameter = [
    {'model__solver': ['newton-cg', 'lbfgs', 'sag'], 
     'model__penalty': ['l2']},
    {'model__solver': ['liblinear'], 
     'model__penalty': ['l1', 'l2']},
    {'model__C': [0.001, 0.01, 0.1, 1, 10, 100]}
]

pipeline = Pipeline([('standard_scaler', standard_scaler), ('model', LR)])

# Both scorers are recorded; the candidate with the best accuracy is refitted on all data.
grid_search = GridSearchCV(estimator=pipeline, 
                           param_grid=LR_parameter, 
                           cv=cross_validation, 
                           scoring=score, 
                           refit='accuracy',
                           n_jobs=-1, 
                           verbose=0)
grid_search.fit(X, y)

print('Logistic Regression Best Estimator: ', grid_search.best_estimator_)
print('Logistic Regression Best Parameter: ', grid_search.best_params_)
print('Logistic Regression Best Score: ', grid_search.best_score_)

In [None]:
# Keep only the tuned classifier step; `grid_search` is overwritten by the next search.
best_pipeline = grid_search.best_estimator_
LR = best_pipeline['model']
LR

## Decision Tree

In [None]:
# Search space for the entropy-based decision tree: depth and leaf-count limits.
DT_parameter = {
    'model__max_depth': [*range(1, 15)],
    'model__max_leaf_nodes': [*range(2, 15)],
}

standard_scaler = StandardScaler()
DT = DecisionTreeClassifier(criterion='entropy', random_state=42)
pipeline = Pipeline(steps=[('standard_scaler', standard_scaler), ('model', DT)])

# Both scorers are recorded; the candidate with the best accuracy is refitted on all data.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=DT_parameter,
    scoring=score,
    refit='accuracy',
    cv=cross_validation,
    n_jobs=-1,
    verbose=0,
).fit(X, y)

print('Decision Tree Best Estimator: ', grid_search.best_estimator_)
print('Decision Tree Best Parameter: ', grid_search.best_params_)
print('Decision Tree Best Score: ', grid_search.best_score_)

In [None]:
# Keep only the tuned classifier step; `grid_search` is overwritten by the next search.
best_pipeline = grid_search.best_estimator_
DT = best_pipeline['model']
DT

## XGBoost

In [None]:
# Search space for XGBoost (learning rate and objective are fixed on the estimator).
XGB_parameter = {
    'model__min_child_weight': [1, 2, 3, 4],
    'model__gamma': [1, 2],
    'model__subsample': [0.2, 0.4, 0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__n_estimators': [500, 1000],
    'model__max_depth': [1, 2, 3, 4],
}

standard_scaler = StandardScaler()
# Each booster runs single-threaded; parallelism comes from the grid search's n_jobs.
XGB = XGBClassifier(learning_rate=0.02, objective='binary:logistic', nthread=1)
pipeline = Pipeline(steps=[('standard_scaler', standard_scaler), ('model', XGB)])

# Both scorers are recorded; the candidate with the best accuracy is refitted on all data.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=XGB_parameter,
    scoring=score,
    refit='accuracy',
    cv=cross_validation,
    n_jobs=-1,
    verbose=0,
).fit(X, y)

print('XGBoost Best Estimator: ', grid_search.best_estimator_)
print('XGBoost Best Parameter: ', grid_search.best_params_)
print('XGBoost Best Score: ', grid_search.best_score_)

In [None]:
# Keep only the tuned classifier step; `grid_search` is overwritten by the next search.
best_pipeline = grid_search.best_estimator_
XGB = best_pipeline['model']
XGB

## LightGBM

In [None]:
# Search space for LightGBM (learning rate and objective are fixed on the estimator).
LGBM_parameter = {
    'model__num_leaves': [2, 3, 4],
    'model__max_depth': [1, 2, 3],
    'model__n_estimators': [500, 1000],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
}

standard_scaler = StandardScaler()
LGBM = LGBMClassifier(learning_rate=0.02, objective='binary', n_jobs=-1, random_state=42)
pipeline = Pipeline(steps=[('standard_scaler', standard_scaler), ('model', LGBM)])

# Both scorers are recorded; the candidate with the best accuracy is refitted on all data.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=LGBM_parameter,
    scoring=score,
    refit='accuracy',
    cv=cross_validation,
    n_jobs=-1,
    verbose=0,
).fit(X, y)

print('LightGBM Best Estimator: ', grid_search.best_estimator_)
print('LightGBM Best Parameter: ', grid_search.best_params_)
print('LightGBM Best Score: ', grid_search.best_score_)

In [None]:
# Keep only the tuned classifier step; `grid_search` is overwritten by the next search.
best_pipeline = grid_search.best_estimator_
LGBM = best_pipeline['model']
LGBM

## Artificial Neural Network

In [None]:
# Search space for the MLP: two-hidden-layer shapes, activations and solvers.
ANN_parameter = {
    'model__hidden_layer_sizes': [(2, 2), (3, 4), (4, 2), (4, 4)],
    'model__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'model__solver': ['lbfgs', 'sgd', 'adam'],
}

standard_scaler = StandardScaler()
ANN = MLPClassifier(max_iter=50000, random_state=42)
pipeline = Pipeline(steps=[('standard_scaler', standard_scaler), ('model', ANN)])

# Both scorers are recorded; the candidate with the best accuracy is refitted on all data.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=ANN_parameter,
    scoring=score,
    refit='accuracy',
    cv=cross_validation,
    n_jobs=-1,
    verbose=0,
).fit(X, y)

print('Artificial Neural Network Best Estimator: ', grid_search.best_estimator_)
print('Artificial Neural Network Best Parameter: ', grid_search.best_params_)
print('Artificial Neural Network Best Score: ', grid_search.best_score_)

In [None]:
# Keep only the tuned classifier step; `grid_search` is overwritten by the next search.
best_pipeline = grid_search.best_estimator_
ANN = best_pipeline['model']
ANN

## Random Forest

In [None]:
# Search space for the random forest: number of trees and maximum depth.
RF_parameter = {
    'model__n_estimators': [*range(1, 30)],
    'model__max_depth': [*range(1, 15)],
}

standard_scaler = StandardScaler()
RF = RandomForestClassifier(criterion='gini', random_state=42)
pipeline = Pipeline(steps=[('standard_scaler', standard_scaler), ('model', RF)])

# Both scorers are recorded; the candidate with the best accuracy is refitted on all data.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=RF_parameter,
    scoring=score,
    refit='accuracy',
    cv=cross_validation,
    n_jobs=-1,
    verbose=0,
).fit(X, y)

print('Random Forest Best Estimator: ', grid_search.best_estimator_)
print('Random Forest Best Parameter: ', grid_search.best_params_)
print('Random Forest Best Score: ', grid_search.best_score_)

In [None]:
# Keep only the tuned classifier step from the last grid search.
best_pipeline = grid_search.best_estimator_
RF = best_pipeline['model']
RF

## Voting Classifier

In [None]:
# Soft-voting ensemble over the tuned models: (name, model, vote weight) per member.
weighted_member = [
    ('KNN', KNN, 0.15),
    ('SVM', SVM, 0.1),
    ('LR', LR, 0.05),
    ('DT', DT, 0.1),
    ('XGB', XGB, 0.3),
    ('LGBM', LGBM, 0.1),
    ('ANN', ANN, 0.1),
    ('RF', RF, 0.1),
]
estimator = [(member_name, member_model) for member_name, member_model, _ in weighted_member]
member_weight = [weight for _, _, weight in weighted_member]

VC = VotingClassifier(estimators=estimator,
                      weights=member_weight,
                      voting='soft',
                      n_jobs=-1)
VC

# Train the Model

In [None]:
# Short model key -> display name used in the report table and plot titles.
initial_model_name = dict(
    KNN='KNN',
    SVM='SVM',
    LR='Logistic Regression',
    DT='Decision Tree',
    XGB='XGBoost',
    LGBM='LightGBM',
    ANN='ANN',
    RF='Random Forest',
    VC='Voting Classifier',
)

In [None]:
# Collect out-of-fold predictions for every model using the same 10-fold split
# that was used for tuning.
classifier = [
    ('KNN', KNN),
    ('SVM', SVM),
    ('LR', LR),
    ('DT', DT),
    ('XGB', XGB),
    ('LGBM', LGBM),
    ('ANN', ANN),
    ('RF', RF),
    ('VC', VC)
]

# Ground truth per held-out sample: its row label in `df`, the value of df's
# first column for that row, and its class.
model_y_true = {
    'index': [],
    'no': [],
    'value': []
}

# Predicted labels and positive-class probabilities, keyed by model initial.
model_y_predict = {initial: [] for initial, _ in classifier}
model_y_predict_score = {initial: [] for initial, _ in classifier}

for train_index, test_index in cross_validation.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # KFold yields positions within X. When outlier_removal is enabled X has
    # fewer rows than df, so positions no longer line up with df; translate them
    # to index labels before looking anything up in df. With the full dataset
    # and a default RangeIndex the labels equal the positions.
    test_label = X.index[test_index]

    model_y_true['index'] = np.append(model_y_true['index'], test_label)
    model_y_true['no'] = np.append(model_y_true['no'], df.loc[test_label].iloc[:, 0])
    model_y_true['value'] = np.append(model_y_true['value'], y_test)

    # Fit the scaler on the training fold only so the test fold stays unseen.
    standard_scaler = StandardScaler()
    X_train = standard_scaler.fit_transform(X_train)
    X_test = standard_scaler.transform(X_test)

    for initial, model in classifier:
        model.fit(X_train, y_train)
        y_predict = model.predict(X_test)
        y_predict_score = model.predict_proba(X_test)
        model_y_predict[initial] = np.append(model_y_predict[initial], y_predict)
        # Column 1 of predict_proba is the probability of the second class (label 1 here).
        model_y_predict_score[initial] = np.append(model_y_predict_score[initial], y_predict_score[:, 1])

# Evaluate the Model

## Accuracy, Precision, Recall, and F1 Score

In [None]:
# Score every model on its pooled out-of-fold predictions (values in percent).
# Rows are collected first and the frame is built once: growing a DataFrame with
# pd.concat from an empty frame is deprecated, repeats index 0 on every row and
# is quadratic in the number of rows.
report_column = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1']
report_row = []

for initial in model_y_predict:
    true_value = model_y_true['value']
    predicted_value = model_y_predict[initial]

    report_row.append({
        'Model': initial_model_name[initial],
        'Accuracy': accuracy_score(true_value, predicted_value) * 100,
        'Precision': precision_score(true_value, predicted_value) * 100,
        'Recall': recall_score(true_value, predicted_value) * 100,
        'F1': f1_score(true_value, predicted_value) * 100,
    })

model_report = pd.DataFrame(report_row, columns=report_column)

display(model_report)

model_report.to_csv('Model-Report.csv', index=False)

# Long format (one row per model/metric pair) for the grouped bar chart.
model_report = pd.melt(model_report, id_vars=['Model'], var_name='Metric', value_name='Value')

plt.figure(figsize=(28, 12))
# Pass the frame by keyword so the call does not depend on positional order.
axis = sns.barplot(data=model_report, 
            x='Model', 
            y='Value', 
            hue='Metric', 
            palette=sns.color_palette('Blues', 4))

axis.set(xlabel=None)
plt.xticks(fontsize=20)
axis.set(ylabel=None)
plt.yticks(fontsize=20)

plt.legend(loc='lower left', fontsize=20)

plt.savefig('model_report.pdf', dpi=600, bbox_inches='tight') 
plt.savefig('model_report.png', dpi=600, bbox_inches='tight')

plt.show()

## Confusion Matrix

In [None]:
# One confusion matrix per model, built from the pooled out-of-fold predictions.
row, column = 2, 5
figure, axis = plt.subplots(nrows=row, ncols=column, figsize=(16, 10))

for i, initial in enumerate(model_y_predict):
    sns.heatmap(confusion_matrix(model_y_true['value'], 
                                 model_y_predict[initial]), 
                ax=axis.flat[i],
                cmap='Blues',
                square=True, 
                annot=True,
                # Integer format: the default annotation format switches to
                # scientific notation for counts of 100 or more.
                fmt='d')
    axis.flat[i].set_title(initial_model_name[initial])
    axis.flat[i].set_xlabel('Predicted label')
    axis.flat[i].set_ylabel('True label')

# The grid has more cells than there are models; hide the unused axes so the
# figure does not show an empty frame.
for unused_axis in axis.ravel()[len(model_y_predict):]:
    unused_axis.set_visible(False)

figure.tight_layout()

plt.savefig('confusion_matrix.pdf', dpi=600, bbox_inches='tight') 
plt.savefig('confusion_matrix.png', dpi=600, bbox_inches='tight')

plt.show()

In [None]:
from matplotlib.colors import ListedColormap

# ROC curves of all models on one set of axes, from the pooled out-of-fold
# positive-class probabilities; one 'Paired' colour per model.
color = sns.color_palette('Paired', 9)

figure, axis = plt.subplots(nrows=1, ncols=1, figsize=(16, 10))

for curve_color, initial in zip(color, model_y_predict):
    RocCurveDisplay.from_predictions(model_y_true['value'],
                                     model_y_predict_score[initial],
                                     name=initial_model_name[initial],
                                     color=curve_color,
                                     linewidth=3,
                                     ax=axis)

plt.xlabel('False Positive Rate (Positive label: 1)', fontsize=20, labelpad=20)
plt.ylabel('True Positive Rate (Positive label: 1)', fontsize=20, labelpad=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.legend(loc='best', fontsize=20)

for extension in ('pdf', 'png'):
    plt.savefig(f'roc.{extension}', dpi=600, bbox_inches='tight')

plt.show()

## SHAP Explanation

In [None]:
# Explain the tuned ANN on a single seeded hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scale with statistics from the training split only.
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

# Back to DataFrames so the summary plot can label the features.
X_train = pd.DataFrame(X_train, columns=feature)
X_test = pd.DataFrame(X_test, columns=feature)

for initial, model in classifier:
    model.fit(X_train, y_train)

# Model-agnostic explainer using the scaled training split as background data.
explainer = shap.KernelExplainer(ANN.predict_proba, X_train)
shap_value = explainer.shap_values(X_test)

# Select the SHAP values of the second predict_proba output (class 1).
# Older SHAP releases return a list with one (samples, features) array per
# output; newer ones return a single (samples, features, outputs) array, for
# which `shap_value[1]` would pick the second sample rather than the class.
if isinstance(shap_value, list):
    positive_class_shap = shap_value[1]
else:
    positive_class_shap = shap_value[:, :, 1]

shap.summary_plot(positive_class_shap, X_test, show=False)

plt.savefig('shap.pdf', dpi=600, bbox_inches='tight') 
plt.savefig('shap.png', dpi=600, bbox_inches='tight')

# Dependence Dataset Prediction

In [None]:
# One row per held-out sample: its identifiers and true class, followed by one
# column of predicted labels per model.
count_prediction = pd.DataFrame({key: model_y_true[key] for key in ('index', 'no', 'value')})

for initial, predicted_label in model_y_predict.items():
    count_prediction[initial] = predicted_label

# Share of models that predicted class 1 (sum of the per-model prediction columns).
model_vote = count_prediction.iloc[:, 3:]
count_prediction['Prediction Rate'] = model_vote.sum(axis=1) / len(classifier)

# display(count_prediction)

## True Prediction

In [None]:
# A sample counts as predicted positive when at least half of the models chose class 1.
prediction_rate = count_prediction['Prediction Rate']

positive_prediction = count_prediction.loc[prediction_rate >= 0.5]
true_positive_prediction = positive_prediction.loc[positive_prediction['value'] == 1]

negative_prediction = count_prediction.loc[prediction_rate < 0.5]
true_negative_prediction = negative_prediction.loc[negative_prediction['value'] == 0]

# Samples where the majority vote agrees with the true class.
true_prediction = pd.concat([true_positive_prediction, true_negative_prediction])

display(true_prediction)

print('Shape: ', true_prediction.shape)

In [None]:
# Rows of the source data whose class the majority vote predicted correctly.
good_row_label = true_prediction['index'].tolist()
good_dataset = df.loc[good_row_label]

display(good_dataset)

# Exported under the next iteration number when the export flag is set.
if export_good_bad_dataset:
    good_dataset.to_csv(f'GoodDataset-{int(num) + 1}.csv', index=False)

## False Prediction

In [None]:
# Same majority-vote split as for the true predictions, keeping the disagreements.
prediction_rate = count_prediction['Prediction Rate']

positive_prediction = count_prediction.loc[prediction_rate >= 0.5]
false_positive_prediction = positive_prediction.loc[positive_prediction['value'] == 0]

negative_prediction = count_prediction.loc[prediction_rate < 0.5]
false_negative_prediction = negative_prediction.loc[negative_prediction['value'] == 1]

# Samples where the majority vote contradicts the true class.
false_prediction = pd.concat([false_positive_prediction, false_negative_prediction])

display(false_prediction)

print('Shape: ', false_prediction.shape)

In [None]:
# Rows of the source data whose class the majority vote predicted incorrectly.
bad_row_label = false_prediction['index'].tolist()
bad_dataset = df.loc[bad_row_label]

display(bad_dataset)

# Exported under the next iteration number when the export flag is set.
if export_good_bad_dataset:
    bad_dataset.to_csv(f'BadDataset-{int(num) + 1}.csv', index=False)