# Dry Beans
Regarding Pre-Processing, it is necessary to select:
 - the best scaling strategy (scaling is a necessary pre-processing step for the SVM classifier);
 - the best feature selection/feature extraction method. Try with:
    - PCA (feature extraction for dimensionality reduction);
    - Filter methods such as SelectKBest and SelectPercentile. The scoring function to use should be `f_classif` (the ANOVA F-test, appropriate because the attributes are numerical and the target is categorical).
- the best way to balance out the classes:
    - Oversampling;
    - Undersampling;
    - built-in methods used by SVC class.
- the best way to remove the outliers.
    
Regarding the classification, it is necessary to select:
 - the kernel function of the SVM (linear, polynomial, rbf, sigmoid) and the respective parameters (e.g. gamma for the rbf);
 - the regularization parameter;
 - the multi-class classification strategy (one-vs-one, `ovo`, or one-vs-all, `ovr`);
 
 Regarding the training procedure, it is necessary to select:
  - the best train/test split

In [None]:
# LIBRARIES

import numpy as np
import pandas as pd

from sklearn.feature_selection import mutual_info_classif

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

import warnings

from mpl_toolkits.mplot3d import Axes3D
from pandas._libs import json
import sklearn.decomposition as dec
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.exceptions import UndefinedMetricWarning
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, accuracy_score, plot_confusion_matrix

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
import sklearn.metrics as met
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline 

%matplotlib inline

## Functions

In [None]:
def class_balancing(X_train, y_train, method):
    """Resample a training set with one of the supported imbalanced-learn samplers.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to the sampler's ``fit_resample``.
    y_train : array-like
        Training labels, passed unchanged to the sampler's ``fit_resample``.
    method : str
        Name of the resampling strategy: ``'smote'``, ``'adasyn'``,
        ``'smote_tomek'`` or ``'smote_enn'``.

    Returns
    -------
    tuple
        ``(X_train_resampled, y_train_resampled)`` as returned by the sampler.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    supported_methods = ('smote', 'adasyn', 'smote_tomek', 'smote_enn')

    # Validate first: an unknown name used to leave `sampler` unbound and only
    # failed later with an unrelated-looking UnboundLocalError.
    if method not in supported_methods:
        raise ValueError(
            "Unknown class balancing method %r; expected one of %s"
            % (method, ", ".join(supported_methods))
        )

    if method == 'smote':
        sampler = SMOTE()
    elif method == 'adasyn':
        sampler = ADASYN()
    elif method == 'smote_tomek':
        sampler = SMOTETomek()
    else:  # 'smote_enn'
        sampler = SMOTEENN()

    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

    return X_train_resampled, y_train_resampled

In [None]:
def naive_classification(df):
    """Benchmark untuned baseline classifiers and plot train vs. test scores.

    The last column of ``df`` is taken as the label and every other column as
    a feature. The rows are split 70/30 with ``random_state=0``. Every
    classifier is created with its default hyper-parameters and scored twice:

    * "Train": mean score of a 5-fold cross-validation on the training part;
    * "Test": accuracy on the held-out part after fitting on the training part.

    The two series are drawn as a grouped bar chart (green = train, red = test).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with the features in all columns but the last, and the label
        in the last column.

    Returns
    -------
    None
        The function only draws the figure.
    """
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=0)
    df_traindata, df_trainlabel = df_train.iloc[:, :-1], df_train.iloc[:, -1]
    df_testdata, df_testlabel = df_test.iloc[:, :-1], df_test.iloc[:, -1]

    # Baseline - comparing model accuracy using all features across classifiers.
    # Labels and estimators are kept together so they cannot drift apart.
    baselines = [
        ('DTC', DecisionTreeClassifier()),
        ('RFC', RandomForestClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('SVC', SVC()),
        ('GNB', GaussianNB()),
        ('LR', LogisticRegression()),
        ('MLP', MLPClassifier()),
    ]
    algo = [label for label, _ in baselines]

    train_scores = []
    test_scores = []

    # Silence warnings only while the baselines are evaluated. The previous
    # global filterwarnings('ignore') calls kept every later warning in the
    # kernel hidden as well.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # Naive Train Accuracy
        for _, clf in baselines:
            train_scores.append(cross_val_score(clf, df_traindata, df_trainlabel, cv=5).mean())

        # Naive Test Accuracy
        for _, clf in baselines:
            clf = clf.fit(df_traindata, df_trainlabel)
            y_pred = clf.predict(df_testdata)
            test_scores.append(accuracy_score(df_testlabel, y_pred))

    Naivescore_df_Train = pd.DataFrame({'Algorithm': algo, 'Score': train_scores}).set_index('Algorithm')
    Naivescore_df_Test = pd.DataFrame({'Algorithm': algo, 'Score': test_scores}).set_index('Algorithm')

    # Bar plot between Train and Test Accuracy
    fig = plt.figure(figsize=(5, 5))

    ax = fig.add_subplot(111)
    ax2 = ax.twinx()  # second y-axis sharing the same x-axis as `ax`
    width = .3

    # NOTE(review): the two y-axes autoscale independently, so green and red
    # bar heights are not directly comparable unless both limits are aligned.
    Naivescore_df_Train.Score.plot(kind='bar', color='green', ax=ax, width=width, position=0)
    Naivescore_df_Test.Score.plot(kind='bar', color='red', ax=ax2, width=width, position=1)

    ax.grid(None, axis='y')
    ax2.grid(None)

    ax.set_ylabel('Train')
    ax2.set_ylabel('Test')

    ax.set_xlim(-1, 6)

In [None]:
def model_grid_search(X_train, X_test, y_train, y_test, scores, model_name, model, model_parameters_grid, n_cv = 5):
    """Run a cross-validated grid search for ``model`` once per scoring metric.

    The model is wrapped in a two-step pipeline: ``'over'`` (a ``SMOTE``
    sampler) followed by a step named ``model_name`` holding ``model``. For
    every entry of ``scores`` a ``GridSearchCV`` is fitted on the training
    data, its per-configuration results and a classification report on the
    test data are printed, and a row-normalised confusion matrix is drawn in
    its own panel of a shared figure.

    Parameters
    ----------
    X_train, y_train
        Development data used for the cross-validated search.
    X_test, y_test
        Evaluation data used for the classification report and the confusion
        matrix.
    scores : iterable of str
        Scoring names passed one at a time as ``GridSearchCV(scoring=...)``.
    model_name : str
        Name of the model step in the pipeline; parameter grid keys must be
        prefixed with ``"<model_name>__"``.
    model : estimator
        The classifier to tune.
    model_parameters_grid : dict or list of dict
        Grid passed as ``GridSearchCV(param_grid=...)``.
    n_cv : int, default 5
        Value passed as ``GridSearchCV(cv=...)``.

    Returns
    -------
    dict
        Maps every score to the ``best_params_`` found when optimising for it.
    """
    # Creates a pipeline with the selected method
    pipe = Pipeline([['over', SMOTE()], [model_name, model]])  # [None, SMOTE(), ADASYN(), SMOTETomek(), SMOTEENN()]

    # Materialise once: the scores are iterated more than once below.
    scores = list(scores)

    # Creates a dictionary with best parameters for every scoring parameter considered
    best_parameters = {key: None for key in scores}

    # One panel per score. The former fixed 2x2 grid silently skipped every
    # score after the fourth (zip stopped early and its entry stayed None).
    n_panels = max(len(scores), 1)
    n_cols = 2 if n_panels > 1 else 1
    n_rows = (n_panels + n_cols - 1) // n_cols
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12 * n_cols, 10 * n_rows), dpi=100, squeeze=False)
    fig.suptitle("Hyperparameters selection results: " + model_name.upper(), fontsize=28)

    panels = axes.reshape(-1)
    # Hide panels that have no score to show (odd number of scores, or none).
    for unused_panel in panels[len(scores):]:
        unused_panel.set_visible(False)

    print("\n================================================================================================================================================================\n")
    print("GRID SEARCH FOR " + model_name.upper() + ':')

    # Cross-validation **for every score!**
    for score, panel in zip(scores, panels):

        print("\n ---> Tuning " + model_name.upper() + " hyper-parameters for %s" % score.upper() + ": ")
        print()

        clf = GridSearchCV(
            estimator = pipe,
            param_grid = model_parameters_grid,
            scoring = score,
            n_jobs = -1,
            verbose = 4,
            return_train_score = True,
            cv = n_cv
            )

        # Undefined-metric (divide-by-zero) warnings are expected for some
        # parameter combinations during the search; hide only those.
        with warnings.catch_warnings():
            warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
            clf.fit(X_train, y_train)

        # CROSS VALIDATION RESULTS

        print("\n ---> Grid scores on development set:\n")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("     ---> %r: %0.3f (+/-%0.03f) for %r" % (score, mean, std * 2, params))

        # FINAL CLASSIFICATION ON TEST SET WITH BEST PARAMETERS (THAT MAXIMISE THE CURRENT SCORE)

        print("\n ---> CLASSIFICATION ON THE TEST SET - Detailed classification report:")
        print("     --- The model is trained on the full development set.")
        print("     --- The scores are computed on the full evaluation set.\n")

        with warnings.catch_warnings():
            warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
            y_true, y_pred = y_test, clf.predict(X_test)
            print(classification_report(y_true, y_pred))
        print()

        print("\n ---> BEST PARAMETERS SET found on development set:\n")
        print(clf.best_params_)
        best_parameters[score] = clf.best_params_

        print("\n------------------------------------------------------------------------------------------------------------------------------------------------------------------\n")

        # PLOT CONFUSION MATRIX

        # Use the classes learned by the refitted estimator both to order the
        # matrix and to label the ticks, so labels always match rows/columns.
        class_labels = clf.classes_
        cm = confusion_matrix(y_test, y_pred, labels=class_labels, normalize='true')
        sns.heatmap(cm, annot=True, fmt='.2%', cmap='icefire', ax=panel)
        panel.set_title('Grid search results obtained trying to maximise for: ' + score.upper())

        # set x-axis label and ticks
        panel.set_xlabel("Predicted Class", fontsize=14)
        panel.xaxis.set_ticklabels(class_labels)

        # set y-axis label and ticks
        panel.set_ylabel("Actual Class", fontsize=14)
        panel.yaxis.set_ticklabels(class_labels)

    return best_parameters

In [None]:
def pipeline_final_traintest(pipe, X_train, X_test, y_train, y_test, best_parameters, selected_score):
    """Fit a copy of ``pipe`` with the chosen best parameters and evaluate it.

    Parameters
    ----------
    pipe : estimator
        Pipeline (or any scikit-learn compatible estimator) to configure. It
        is cloned, so the caller's object is left untouched.
    X_train, y_train
        Data used to fit the configured estimator.
    X_test, y_test
        Data used for the confusion matrix and the classification report.
    best_parameters : dict
        Maps a scoring name to the parameter dict to apply with ``set_params``.
    selected_score : str
        Key of ``best_parameters`` selecting which parameter set to use.

    Returns
    -------
    estimator
        The fitted clone of ``pipe``.

    Raises
    ------
    KeyError
        If ``selected_score`` has no entry in ``best_parameters``.
    """
    # Fail fast with a descriptive message, before anything is printed or trained.
    if selected_score not in best_parameters:
        raise KeyError(
            "No parameter set stored for scoring parameter %r; available: %s"
            % (selected_score, list(best_parameters))
        )

    # Local imports: the standard-library json replaces the private
    # pandas._libs.json binding for this function, and clone is needed below.
    import json
    from sklearn.base import clone

    # Print the best parameters configurations that maximise specific metric/scores
    print("List of the parameters used according to a specific scoring parameter:")
    # default=str keeps the dump working for values json cannot serialise natively.
    print(json.dumps(best_parameters, indent=4, default=str))
    print()

    # Final classification with optimized parameters:
    print("Classification:")
    print("Parameters set in order to maximise the %s scoring parameter" % selected_score.upper())

    # Work on an unfitted copy so the pipeline passed by the caller is not
    # reconfigured or refitted as a side effect.
    clf = clone(pipe)
    clf.set_params(**best_parameters[selected_score])

    # Model training:
    print("Model training:")
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
        clf.fit(X_train, y_train)

    # Classes learned during fitting; used to order and label the matrices.
    class_labels = clf.classes_

    # Model testing:
    print("\n\nMaking predictions:")
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
        y_predicted = clf.predict(X_test)

        # Performance evaluation
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_predicted, labels=class_labels))
        print()

        print(classification_report(y_test, y_predicted))
        print()

    # PLOT CONFUSION MATRIX

    fig = plt.figure(figsize=(12, 10))  # figure size is given as a (width, height) tuple
    ax1 = fig.add_subplot(111)
    cm = confusion_matrix(y_test, y_predicted, labels=class_labels, normalize='true')
    sns.heatmap(cm, annot=True, fmt='.2%', cmap='icefire', ax=ax1)
    ax1.set_title('Result obtained with parameters maxmimizing: ' + selected_score.upper())

    # set x-axis label and ticks
    ax1.set_xlabel("Predicted Class", fontsize=14)
    ax1.xaxis.set_ticklabels(class_labels)

    # set y-axis label and ticks
    ax1.set_ylabel("Actual Class", fontsize=14)
    ax1.yaxis.set_ticklabels(class_labels)

    return clf

## Main

### Importing the dataset

In [None]:
# Load the Dry Bean dataset from its Excel workbook into a DataFrame
df = pd.read_excel(io="DryBeanDataset/Dry_Bean_Dataset.xlsx")

## Pre-processing

In [None]:
# Separate features from the target. Both are selected through the "Class"
# column name, so the target can never end up among the features even if it
# is not the last column of the sheet.
X = df.drop(columns="Class")
y = df["Class"]

### Scaling

In [None]:
# Standardize the features with StandardScaler.
# Alternative kept for reference: X = MinMaxScaler().fit_transform(X)
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split performed later in this notebook, so statistics of the
# test rows influence the scaling - consider fitting on the training part only.
X = StandardScaler().fit(X).transform(X)

### PCA

In [None]:
# Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space.
# The input data is centered but not scaled for each feature before applying the SVD.

pca = PCA()

principalComponents = pca.fit_transform(X)

# Name the components after the actual number of columns returned by PCA
# instead of a fixed list of 16 names, which fails for any other feature count.
pc_columns = ['PC%d' % index for index in range(1, principalComponents.shape[1] + 1)]

pca_df = pd.DataFrame(data=principalComponents, columns=pc_columns)

# Append the class label next to the components (rows are matched on the index).
pca_df = pd.concat([pca_df, df[['Class']]], axis=1)

## Classification

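**ATTENTION**: Class balancing cannot be done a priori on the whole training set, because we perform K-Fold Cross Validation: the resampling must be applied inside each training fold (this is why the sampler is a step of the pipeline), otherwise resampled data would leak into the validation folds!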

### Train test split

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

### Naive classification

In [None]:
# naive_classification(df)

### Hyperparameter selection (grid search and k-fold cross-validation)

In [None]:
# Hyper-parameter optimisation with k-fold cross-validation: for every model
# listed here, estimate the parameter values that maximise specific metrics/scores.

# Candidate estimators, keyed by the name given to their pipeline step.
# Disabled alternatives (enable together with the matching parameter grid):
#   "Random_Forest": RandomForestClassifier(
#       min_samples_leaf=1, max_depth=None, bootstrap=True,
#       oob_score=False, min_samples_split=2,
#   ),
#   "Extra_Trees": ExtraTreesClassifier(
#       min_samples_leaf=1, max_depth=None, bootstrap=False,
#       oob_score=False, min_samples_split=2,
#   ),
#   "KNN": KNeighborsClassifier(),
#   "SVM": SVC(),
models = {
    "MLP": MLPClassifier(
        max_iter=1000,
        tol=1e-4,
        alpha=1e-5,
        learning_rate='constant',
        momentum=0.9,
        early_stopping=False,
        verbose=False,
    ),
}

# Hyper-parameter grids explored by cross-validation, one entry per model.
# Every key follows the "<pipeline step name>__<parameter>" convention.
#
# Disabled grids (enable together with the matching model):
#   "Random_Forest": {
#       "Random_Forest__n_estimators": [256],
#       "Random_Forest__criterion": ["gini"],       # {"gini", "entropy", "log_loss"}
#       "Random_Forest__max_features": ["sqrt"],    # {"sqrt", "log2", None}
#   },
#   "Extra_Trees": {
#       "Extra_Trees__n_estimators": [32],
#       "Extra_Trees__criterion": ["gini"],         # {"gini", "entropy", "log_loss"}
#       "Extra_Trees__max_features": ["sqrt"],      # {"sqrt", "log2", None}
#   },
#   "KNN": {
#       'KNN__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#       'KNN__n_neighbors': [5, 10, 15],
#   },
#   "SVM": [
#       {
#           'SVM__kernel': ['rbf'],
#           'SVM__gamma': [0.05, 0.1],
#           'SVM__C': [100],
#           'SVM__decision_function_shape': ['ovo'],  # by default this is set to ovr
#       },
#       {
#           'SVM__kernel': ['linear'],
#           'SVM__C': [1, 10, 100, 1000],
#           'SVM__decision_function_shape': ['ovo'],
#       },
#   ],
models_parameters_grids = {
    "MLP": {
        # candidates: 'identity', 'logistic', 'tanh', 'relu'
        "MLP__activation": ["identity", "logistic", "tanh", "relu"],
        # candidates: 'lbfgs', 'sgd', 'adam'
        "MLP__solver": ["adam"],
        "MLP__hidden_layer_sizes": [
            (12, 3),
            (16, 16),
            (256, 256),
            (16, 16, 16, 16),
        ],
        "MLP__learning_rate_init": [0.3, 0.1, 0.01],
    },
}

# Metrics to optimise for; one grid search is run per entry
scores = ['accuracy']

# Best parameters per model and per score. Filled from `models` itself, so no
# separate, hand-maintained list of model names can drift out of sync.
model_results = {}

for model_name in models:
    # Index the grid directly: a model without a parameter grid now fails here
    # with a KeyError instead of handing None to the grid search.
    model_best_parameters = model_grid_search(
        X_train, X_test, y_train, y_test,
        scores, model_name, models[model_name], models_parameters_grids[model_name],
        n_cv=5,
    )
    model_results[model_name] = model_best_parameters

In [55]:
# Display the best hyper-parameters collected for each model and scoring metric
model_results

{'MLP': {'accuracy': {'MLP__activation': 'logistic',
   'MLP__hidden_layer_sizes': (16, 16),
   'MLP__learning_rate_init': 0.01,
   'MLP__solver': 'adam'}}}

In [None]:
# Final classification

# Model to refit, and the metric whose best parameter set is applied to it
selected_model = "MLP"
selected_score = "accuracy"

# Best parameters per score for the selected model, as collected by the grid search
best_parameters = model_results[selected_model]

# Rebuild the pipeline with the same step names used during the grid search
# ('over' + the model name) so the "<model>__<parameter>" keys resolve.
pipe = Pipeline([['over', SMOTE()], [selected_model, models[selected_model]]])

# The variable keeps its original name; it holds the fitted pipeline of `selected_model`.
svm_clf = pipeline_final_traintest(pipe, X_train, X_test, y_train, y_test, best_parameters, selected_score)

## Small tests

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from sklearn.svm import SVC

# # Create a feature-selection transform, a scaler and an instance of SVM that we
# # combine together to have a full-blown estimator

# clf = Pipeline(
#     [
#         ("anova", SelectPercentile(f_classif)), # Metric is ANOVA
#         ("scaler", MinMaxScaler()),
#         ("svc", SVC(gamma="auto")),
#     ]
# )

In [None]:
# # This takes 2 minutes

# import matplotlib.pyplot as plt

# from sklearn.model_selection import cross_val_score

# score_means = list()
# score_stds = list()
# percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

# for percentile in percentiles:
#     clf.set_params(anova__percentile=percentile)
#     this_scores = cross_val_score(clf, X, y)
#     score_means.append(this_scores.mean())
#     score_stds.append(this_scores.std())

# plt.errorbar(percentiles, score_means, np.array(score_stds))
# plt.title("Performance of the SVM-Anova varying the percentile of features selected")
# plt.xticks(np.linspace(0, 100, 11, endpoint=True))
# plt.xlabel("Percentile")
# plt.ylabel("Accuracy Score")
# plt.axis("tight")