In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import confusion_matrix

In [2]:
def splitDataset(dataset, trainRatio):
    trainSize = int(len(dataset) * trainRatio)
    trainSet = []
    copy = list(dataset.iloc[:,:].values)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [np.array(trainSet), np.array(copy)]

def calc_centroids(dataset, labels):
    dataset=pd.DataFrame(dataset)
    centroids=[]
    for label in labels:
        centroids.append((list((dataset[dataset['Target']==label].iloc[:,:-1]).sum()/len(dataset[dataset['Target']==label])),label))
    return(centroids)

def euc_distance(lista, x):
    if(len(lista) == len(x)):
        n=len(lista)-1
    else:
        return('Os conjuntos devem possuir a mesma dimensão!')
    dist = 0
    for i in range(n):
        dist += (lista[i]-x[i])**2
    return(np.sqrt(dist))

def calc_distances(centroids, amostra):
    distances=[]
    for i in range(len(centroids)):
        dist = euc_distance(centroids[i][0],amostra)
        distances.append((dist,centroids[i][-1]))
    return(distances)

def att_class(centroids,amostra):
    distances=calc_distances(centroids,amostra)
    distances.sort()
    return(distances[0][1])

def make_pred(dataset,amostras):
    labels = list(dataset.iloc[:,-1].unique())
    centroids=calc_centroids(dataset, labels)
    preds=[]
    for amostra in amostras:
        preds.append(att_class(centroids, amostra))
    return(preds)

In [3]:
def prepare_to_remove(dataset):
    """
    Calcula algumas estatísticas para a remoção de outliers do dataset.
    
    Input
    ----------
    dataset: 
    Dataset a ser preparado para remoção de outliers.
    
    Output
    ----------
    cols:
    Lista contendo os nomes das colunas dos datasets.
  
    means:
    Lista contendo as médias das colunas dos datasets.
  
    stds:
    Lista contendo os desios padrões das colunas dos datasets.
  
    """
    cols, means, stds = [],[],[]
    for col in dataset.columns:
        try:
            cols.append(col)
            means.append(dataset[col].mean())
            stds.append(dataset[col].std())
        except TypeError:
            means.append(np.nan)
            stds.append(np.nan)
            print(f'Coluna {col} possui valores em formato não númerico!')
    return(cols, means, stds)

def remove_outliers(dataset):
    """
    Soma todos os elementos de um vetor, exceto o que está na posição n.
    
    
    Input
    ----------
    dataset: 
    Dataset para remoção de outliers.
    
    Output
    ----------
    dataset:
    O dataset da entrada, agora com os outliers removidos
    
    
    Obs.: São consideradas outliers da variável x, observações além do intervalo: mean(x) ± 2*std(x).
    """
    cols, means, stds = prepare_to_remove(dataset)
    k=0
    for col in cols:
        dataset = dataset.loc[(dataset[col] > means[k]-2*stds[k]) & (dataset[col] < means[k]+2*stds[k])]
        k=k+1
    return(dataset)


In [4]:
data=pd.read_csv('https://raw.githubusercontent.com/rhanielmx/RecPad/master/messidor_features.csv')
data=remove_outliers(data)
feature_importances=[]
cols_names=[]
for i in range(50):
    train_set, test_set = splitDataset(dataset=data,trainRatio=0.7)
    X_train,y_train=[train_set[i][:-1] for i in range(train_set.shape[0])],[train_set[i][-1] for i in range(train_set.shape[0])]
    X_test,y_test=[test_set[i][:-1] for i in range(test_set.shape[0])],[test_set[i][-1] for i in range(test_set.shape[0])]

    from sklearn.ensemble import RandomForestClassifier 
    rf = RandomForestClassifier() 
    rf.fit(X_train, y_train) 

    feature_importances.append(rf.feature_importances_)

avg_feature_importances=sum(feature_importances)/len(feature_importances)
avg_feature_importances=pd.DataFrame(avg_feature_importances*100,columns=['Average Importance(%)'],index=list(data.columns[:-1]))

print(avg_feature_importances.sort_values('Average Importance(%)',ascending=False))
cols_to_use=[avg_feature_importances.index[i] for i in range(len(avg_feature_importances.index)) if (avg_feature_importances.iloc[i,0]>=(avg_feature_importances).mean()[0])==True]
cols_to_use.append('Target')

                            Average Importance(%)
Exudates_Detection_1                     9.110714
MA_Detection_alpha-0.5                   8.223888
Exudates_Detection_2                     7.987080
OpticDisc_Diameter                       7.615056
Exudates_Detection_3                     7.408684
Macula_OpticDisc_Distance                7.391314
Exudates_Detection_4                     6.877441
MA_Detection_alpha-0.6                   6.244501
MA_Detection_alpha-0.7                   5.792654
MA_Detection_alpha-0.9                   5.583021
MA_Detection_alpha-1.0                   5.501992
MA_Detection_alpha-0.8                   5.181421
Exudates_Detection_5                     4.787482
Exudates_Detection_7                     4.523325
Exudates_Detection_6                     3.853383
Exudates_Detection_8                     2.958127
AM/FM-based classification               0.959918
Pre-Screening                            0.000000
Quality_Assessment                       0.000000


In [5]:
def main():    
    for _ in range(n_rounds):
        data=pd.read_csv(filepath_or_buffer=path,usecols=cols_to_use)
        data=remove_outliers(data)
        
        train_set, test_set = splitDataset(dataset=data,trainRatio=trainRatio)
        X_test=[test_set[i][:-1] for i in range(test_set.shape[0])]
        y_test=[test_set[i][-1] for i in range(test_set.shape[0])]

        preds=make_pred(data,np.array(X_test))
        
        cm = confusion_matrix(y_test, preds)
        sr = 100*(cm.diagonal().sum()/cm.sum())
        confusion_matrixes.append(cm)
        accuracys.append(sr)        
        
        class0_success=100*cm[0][0]/(cm[0][0]+cm[0][1])
        class1_success=100*cm[1][1]/(cm[1][0]+cm[1][1])
         
        class0_successes.append(class0_success)
        class1_successes.append(class1_success)
    
        if ((_+1)%(int(n_rounds/10))==0):
            print(f'{(_+1)*(100/n_rounds):05.2f}% concluído!')

In [10]:
accuracys = []
confusion_matrixes = []
class0_successes,class1_successes=[],[]
trainRatio=0.7
n_rounds=100
path='https://raw.githubusercontent.com/rhanielmx/RecPad/master/messidor_features.csv'

%time main()

10.00% concluído!
20.00% concluído!
30.00% concluído!
40.00% concluído!
50.00% concluído!
60.00% concluído!
70.00% concluído!
80.00% concluído!
90.00% concluído!
100.00% concluído!
Wall time: 1min 7s


In [12]:
Accuracys=pd.DataFrame(data={'Accuracys':accuracys})
Accuracys.describe()

Unnamed: 0,Accuracys
count,100.0
mean,55.286232
std,2.245972
min,49.275362
25%,53.985507
50%,55.434783
75%,56.884058
max,61.594203


In [13]:
Class_Sucesses=pd.DataFrame(data={'Succces 0':class0_successes,'Succces 1':class1_successes})
Class_Sucesses.describe()

Unnamed: 0,Succces 0,Succces 1
count,100.0,100.0
mean,56.988057,53.532724
std,3.188954,3.36886
min,48.507463,46.323529
25%,55.007562,51.384132
50%,56.89688,53.504749
75%,59.027778,56.101948
max,66.165414,62.015504
