In [None]:
from utils import *
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import scipy


# Download and load Dataset

In [None]:
# Download the Dry Bean archive only when its folder is not present yet.
# The path is built with os.path.join: the original f-string hard-coded a
# Windows "\" separator (and "\D" is an invalid escape sequence), so on other
# platforms the existence check never matched and the archive was fetched on
# every run.
dataset_dir = os.path.join(os.getcwd(), 'DryBeanDataset')
if not os.path.exists(dataset_dir):
    # presumably extracts the archive into ./DryBeanDataset — confirm in utils
    download_dataset('https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip', '.')
df = load_dataset("DryBeanDataset/Dry_Bean_Dataset.xlsx")
# The loaded object is indexed by sheet name; keep only the data sheet.
df = df['Dry_Beans_Dataset']

# Visualize Dataset

In [None]:
# Preview the leading rows of the loaded sheet (at most 1000 of them).
df.head(1000)

In [None]:
# Quick overview: frame dimensions, column index and per-class sample counts.
class_counts = df["Class"].value_counts()
print(df.shape, "\n\n", df.columns, "\n\n", class_counts)

# Setting The Data

In [None]:
# Display the target column as loaded.
df['Class']

In [None]:
# Map every class name to an integer code, numbered in order of first
# appearance in the "Class" column.
enconding = {label: code for code, label in enumerate(df["Class"].unique())}

print(enconding)
# Assign the encoded column back explicitly. The previous
# `df.Class.replace(enconding, inplace=True)` was a chained in-place call on
# an intermediate Series, which pandas with copy-on-write does not propagate
# back to `df` (the labels would silently stay as strings).
df["Class"] = df["Class"].map(enconding)
df['Class']

O dataset possui classes desbalanceadas; é esperado que o modelo performe bem para a classe DERMASON e não tão bem para a classe BOMBAY.

In [None]:
# The 16 columns used as model inputs.
feature_columns = [
    'Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
    'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter',
    'Extent', 'Solidity', 'roundness', 'Compactness',
    'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4',
]
X = df[feature_columns]
# Target column.
y = df['Class']
# Class names in the order of their keys in `enconding`,
# e.g. ['DERMASON','SIRA','SEKER','HOROZ','CALI','BARBUNYA','BOMBAY'] (actual order depends on the data).
y_labels = list(enconding)

# Define MLP

In [None]:
def MLP(layers, dropout=False, n_classes=7):
    """Build and compile a fully connected tanh classifier.

    Parameters
    ----------
    layers : sequence of int
        ``layers[0]`` is the number of input features; every following entry
        is the width of one hidden ``Dense`` layer (at least one is required).
    dropout : bool, default False
        When True, a ``Dropout(.2)`` layer is added after the last hidden
        layer, just before the output layer.
    n_classes : int, default 7
        Number of units of the softmax output layer. The default keeps the
        previously hard-coded value.

    Returns
    -------
    The ``Sequential`` model, compiled with Adam (learning rate 0.001),
    sparse categorical cross-entropy loss and the accuracy metric.

    Raises
    ------
    ValueError
        If ``layers`` does not contain the input size plus at least one
        hidden layer width.
    """
    if len(layers) < 2:
        raise ValueError(
            "layers must hold the input size followed by at least one hidden layer width")

    n_inputs, *hidden_units = layers

    model = Sequential()
    # The input shape is declared once, on the first layer; repeating it on
    # the first Dense layer (as before) was redundant.
    model.add(Flatten(input_shape=(n_inputs,)))
    for units in hidden_units:
        model.add(Dense(units=units, activation='tanh'))
    if dropout:
        model.add(Dropout(.2))
    model.add(Dense(units=n_classes, activation='softmax'))

    # Integer class labels are expected, hence the sparse loss.
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Inferência

In [None]:
def inferencia_MLP(X, y, clf, scaler):
    """Evaluate a fitted network on ``(X, y)``.

    ``X`` is transformed with the already-fitted ``scaler`` and passed to
    ``clf.predict``; the predicted class of each sample is the argmax of its
    output row.

    Returns
    -------
    (accuracy in percent, confusion matrix)
    """
    scaled_inputs = scaler.transform(X)
    class_scores = clf.predict(scaled_inputs)
    predicted_classes = np.argmax(class_scores, axis=1).reshape(-1, 1)
    cm = confusion_matrix(y_true=y, y_pred=predicted_classes)
    # Correct predictions sit on the diagonal of the confusion matrix.
    accuracy_pct = np.trace(cm) / np.sum(cm) * 100

    return accuracy_pct, cm

In [None]:
def inferencia_Linear(X, y, clf):
    """Evaluate a fitted classifier whose ``predict`` returns class labels.

    ``X`` is used as given (no scaling is applied here).

    Returns
    -------
    (accuracy in percent, confusion matrix)
    """
    predicted_classes = clf.predict(X).reshape(-1, 1)
    cm = confusion_matrix(y_true=y, y_pred=predicted_classes)
    # Correct predictions sit on the diagonal of the confusion matrix.
    accuracy_pct = np.trace(cm) / np.sum(cm) * 100

    return accuracy_pct, cm

# TRAIN MODELS #

In [None]:
def train(X_train, y_train, X_test, y_test, mlp_size, epochs=1, batch_size=16):
    """Fit an MLP and an LDA model on one split and score both on the test part.

    Parameters
    ----------
    X_train, X_test : DataFrame or array-like
        Feature matrices of the split.
    y_train, y_test : array-like
        Integer class labels of the split.
    mlp_size : sequence of int
        Layer specification forwarded to ``MLP`` (input size, then hidden
        layer widths).
    epochs : int, default 1
        Training epochs of the MLP (default keeps the previous fixed value).
    batch_size : int, default 16
        Mini-batch size of the MLP (default keeps the previous fixed value).

    Returns
    -------
    (acc_dict, cm_dict)
        Two dicts keyed by ``"MLP"`` and ``"LDA"`` holding, respectively, the
        test accuracy in percent and the test confusion matrix of each model.
    """
    classifiers = {
        "MLP": MLP(mlp_size),
        "LDA": LDA(),
    }
    acc_dict, cm_dict = {}, {}

    # Optional dimensionality reduction, currently disabled:
    # X_train, X_test = LDA_reduction(X_train, y_train, X_test, 3)

    # Min-max scaling fitted on the training part only. Both parts are
    # converted to plain arrays first, so the scaler is never fitted without
    # feature names and later asked to transform a DataFrame (which makes
    # scikit-learn emit a feature-name warning on every call).
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train_scaled = scaler.fit_transform(np.asarray(X_train))
    X_test_raw = np.asarray(X_test)

    for name, clf in classifiers.items():
        if name == "MLP":
            clf.fit(
                x=X_train_scaled,
                y=y_train,
                validation_split=0,
                batch_size=batch_size,
                epochs=epochs,
                verbose=0)
            # inferencia_MLP applies the scaler itself, so it gets raw features.
            val_acc, cm = inferencia_MLP(X_test_raw, y_test, clf, scaler)
        else:
            clf.fit(X_train_scaled, y_train)
            val_acc, cm = inferencia_Linear(scaler.transform(X_test_raw), y_test, clf)
        acc_dict[name] = val_acc
        cm_dict[name] = cm

    return acc_dict, cm_dict

### accs_dict e cms_dict são dicionários cujas listas guardam, respectivamente, a acurácia e a matriz de confusão de cada teste rodado. Cada lista tem tamanho N_test.

# Test Best Configuration MLP

In [None]:
# Sweep over two-hidden-layer MLP configurations (1 to 9 units per layer).
# Every configuration is evaluated on N_test random 75/25 splits; the
# (mean, std) of the test accuracy is stored per configuration.
# NOTE: split seeds come from `random.randint`, and no seed is set in the
# visible code, so the numbers change between runs.
MLP_multlayer_acc = []
LDA_multlayer_acc = []
N_test = 20  # number of splits per configuration; ideally at least 20 samples
for layer1 in range(1, 10):
    for layer2 in range(1, 10):
        mlp_size = [16, layer1, layer2]
        # Per-configuration results: one entry per split for each model.
        accs_dict = {"MLP": [], "LDA": []}
        cms_dict = {"MLP": [], "LDA": []}

        for i in range(N_test):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random.randint(0,1000))
            acc, cm = train(X_train, y_train, X_test, y_test, mlp_size)
            for model_name, model_acc in acc.items():
                accs_dict[model_name].append(model_acc)
            for model_name, model_cm in cm.items():
                cms_dict[model_name].append(model_cm)

        # After the sweep these names hold the results of the LAST configuration.
        MLP_acc = accs_dict['MLP']
        LDA_acc = accs_dict['LDA']
        mlp_mean, mlp_std = np.round(np.mean(MLP_acc), 2), np.round(np.std(MLP_acc), 2)
        lda_mean, lda_std = np.round(np.mean(LDA_acc), 2), np.round(np.std(LDA_acc), 2)
        MLP_multlayer_acc.append((mlp_mean, mlp_std))
        # Previously declared but never filled; kept in step with the MLP list.
        LDA_multlayer_acc.append((lda_mean, lda_std))
        print(f'Layer conf: {mlp_size}')
        print(f'MLP: {mlp_mean} ± {mlp_std}')
        print(f'LDA: {lda_mean} ± {lda_std}')
        print()

## Acurácia média e Desvio padrão

In [None]:
# Per-split accuracies currently stored in accs_dict, one list per model.
MLP_acc, LDA_acc = accs_dict['MLP'], accs_dict['LDA']

# Report mean ± standard deviation for each model, rounded to two decimals.
for model_label, model_scores in (('MLP', MLP_acc), ('LDA', LDA_acc)):
    print(f'{model_label}: {np.round(np.mean(model_scores), 2)} ± {np.round(np.std(model_scores), 2)}')

## Matriz de Confusão resultante

In [86]:
# Element-wise sum of the MLP confusion matrices stored in cms_dict.
name = 'MLP'
MLP_cm = np.sum(cms_dict['MLP'], axis=0).astype(int)
Plot_confusion_matrix(
    cm=MLP_cm,
    classes=y_labels,
    title=f'Matriz de Confusão {name}',
    normalize=False,
    save_file=(True, name),
)

Confusion matrix, without normalization
[[4601    4    0    0    3  132  343]
 [ 134 1231   43 1198  294  345    9]
 [   0  154  720  419    0    0    0]
 [   8  177    1 3328  464   73    0]
 [   0    2    0   82 4655   97   63]
 [ 111    2    0    7  381 4099 1992]
 [ 125    0    0    0    7  166 8560]]


In [87]:
# Element-wise sum of the LDA confusion matrices stored in cms_dict.
name = 'LDA'
LDA_cm = np.sum(cms_dict['LDA'], axis=0).astype(int)
Plot_confusion_matrix(
    cm=LDA_cm,
    classes=y_labels,
    title=f'Matriz de Confusão {name}',
    normalize=False,
    save_file=(True, name),
)

Confusion matrix, without normalization
[[4649   28    0    0    0  321   85]
 [  23 2710    0  283   14  224    0]
 [   0    0 1291    2    0    0    0]
 [   6   27    0 3859   49  110    0]
 [   0   11    0  111 4571  193   13]
 [  11   15    0    7   50 6181  328]
 [ 123    5    0    0   12 1166 7552]]


# Testes de Significancia #

## Checando a Distribuição

Válido para N_test execuções sobre uma mesma amostra

In [None]:
# Significance level: the tests below reject their null hypothesis when p < alpha.
alpha = 0.05

In [None]:
# Normality check of the MLP accuracies.
# Null hypothesis: the sample comes from a normal distribution.
stat, p = scipy.stats.normaltest(MLP_acc)
print('stat=%.3f, p=%.3f' % (stat, p))
# Reject the null hypothesis when p falls below alpha.
print("MLP: Distribuição não normal" if p < alpha else "MLP: Distribuição normal")

In [None]:
# Normality check of the LDA accuracies.
# Null hypothesis: the sample comes from a normal distribution.
stat, p = scipy.stats.normaltest(LDA_acc)
print('stat=%.3f, p=%.3f' % (stat, p))
# Reject the null hypothesis when p falls below alpha.
print("LDA: Distribuição não normal" if p < alpha else "LDA: Distribuição normal")

## Testando

### Parametricos (NORMAL)

#### T-test

In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test between the MLP and LDA accuracy samples.
stat, p = ttest_ind(MLP_acc, LDA_acc)
print('stat=%.3f, p=%.5f' % (stat, p))
# The null hypothesis is rejected when p < alpha.
verdict = 'Probably different distributions' if p < alpha else 'Probably the same distribution'
print(verdict)
	

#### ANOVA 

#### Diria que esse é o teste correto para essa situação

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA between the MLP and LDA accuracy samples.
# Null hypothesis: both samples belong to the same distribution.
stat, p = f_oneway(MLP_acc, LDA_acc)
print('stat=%.3f, p=%.5f' % (stat, p))
# The null hypothesis is rejected when p < alpha.
verdict = 'Probably different distributions' if p < alpha else 'Probably the same distribution'
print(verdict)

### Não parametricos (não NORMAL)

In [None]:
from scipy.stats import mannwhitneyu

# Mann-Whitney U test between the MLP and LDA accuracy samples
# (non-parametric alternative for when normality does not hold).
# `alternative` is passed explicitly: the default changed across SciPy
# releases (older ones returned a one-sided p-value), and the decision
# below assumes a two-sided test.
stat, p = mannwhitneyu(MLP_acc, LDA_acc, alternative='two-sided')
print('stat=%.3f, p=%.5f' % (stat, p))
# Null hypothesis: both samples belong to the same distribution.
if p < alpha:
    print('Probably different distributions')  # null hypothesis rejected
else:
    print('Probably the same distribution')