# Libaries

In [276]:
#Data Structures and Utilities
import numpy as np
import pandas as pd
import time
import os


# Learning evaluation
from  sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

#Algorithms
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope


# Definitions

In [83]:
np.seed = 42 #seed for random selections

#mapping for algorithm name and the class of the algorithm
algorithms = {
    'LocalOutlierFactor': LocalOutlierFactor, 
    'OneClassSVM': OneClassSVM, 
    'IsolationForest': IsolationForest,
    'EllipticEnvelope': EllipticEnvelope
}

# Functions

In [259]:
#Split the dataset into train and test data

# split_type: type os splitting of the examples from the interest class: "cross-validation" or "random"
# number: number of folds in case of type == "cross-validation", or number or examples in case of type == "random"
def get_indexes(data, split_type, number_trials, number_examples): 
    indexes = []
    if split_type == 'cross-validation': 
        kf = KFold(n_splits=number_trials, shuffle=True)
        for ids_train, ids_test in kf.split(data):
            indexes_train = data[ids_train]
            indexes.append(indexes_train)
    elif split_type == 'random':
        for it in range(number_trials):
            indexes.append(np.random.choice(data, size=number_examples, replace=False))
    else:
        raise ValueError('Unsuported split type. Please, use split_type = {"cross-validation","random"}.')
    return indexes

In [260]:
def get_train_test_data(X, all_indexes, indexes_train):
    indexes_test = list(set(all_indexes) - set(indexes_train))
    return X[indexes_train], X[indexes_test]

In [261]:
def get_classes_test(y, classe, all_indexes, indexes_train): 
    indexes_test = list(set(all_indexes) - set(indexes_train))
    y_test = np.ones(len(indexes_test), dtype=np.int)
    for i, element in enumerate(y[indexes_test]): 
        if element != classe: 
            y_test[i] = -1
    return y_test

In [262]:
def get_evaluation_metrics(classifier, X_test, y_test, classe, it_number, model_building_time=0): 
  
  evaluation = {} 
  start_time_classification = time.time()
  predictions = classifier.predict(X_test)
  elapsed_time_classification = (time.time() - start_time_classification) / 1000
  evaluation['Algorithm'] = classifier.__class__.__name__
  evaluation['Parameters'] = classifier.get_params()
  evaluation['Class'] = classe
  evaluation['It_Number'] = it_number
  evaluation['Accuracy'] = accuracy_score(y_test,predictions)
  evaluation['Precision'] = precision_score(y_test,predictions)
  evaluation['Recall'] = recall_score(y_test,predictions)
  evaluation['F1'] = f1_score(y_test,predictions)
  evaluation['ROC_AUC'] = roc_auc_score(y_test,predictions,average=None)
  evaluation['Building_Time'] = model_building_time
  evaluation['Confusion_Matrix'] = confusion_matrix(y_test,predictions).tolist()
  evaluation['Classification_Time'] = elapsed_time_classification
  evaluation['Memory'] = sys.getsizeof(classifier) / 1024
  
  return evaluation 

In [263]:
#X: dada
#y: classes
# split_type: type os splitting of the examples from the interest class: "cross-validation" or "random"
#classifier: OCL algorithm
# number_trials: number of folds in case of split_type == "cross-validation", or number or repetitions in case of split_type == "random"
# number_examples: number of labeled_examples if split_type == "random"
def one_class_learning(X, y, classifier, split_type="cross-validation", number_trials=10, number_examples=10): 
    results = []
    all_indexes = set(range(len(X)))
    classes = np.unique(y)
    for classe in classes: 
        classe_indexes = np.argwhere(y == classe).reshape(-1)
        for it, indexes_train in enumerate(get_indexes(classe_indexes, split_type, number_trials, number_examples)):
            X_train, X_test = get_train_test_data(X, all_indexes, indexes_train)
            y_test = get_classes_test(y, classe, all_indexes, indexes_train)
            classifier.fit(X_train)
            results.append(get_evaluation_metrics(classifier, X_test, y_test, classe, it, model_building_time=0))
    return results


In [282]:
def get_data_frame(path_results): 
    results = None 
    if (os.path.exists(path_results)):
        results = pd.read_csv(path_results)
    else: 
        results = pd.DataFrame(columns=['Algorithm',
            'Parameters',
            'Class',
            'It_Number',
            'Accuracy',
            'Precision',
            'Recall',
            'F1',
            'ROC_AUC',  
            'Building_Time',
            'Confusion_Matrix',
            'Classification_Time',
            'Memory'
        ])
    return results

In [291]:
def execute_exp(X, y, classifier, config): 
    if 'path_results' not in config: 
        raise ValueError('Config file must be a "path_result" entry')
    if 'path_dataset' not in config: 
        raise ValueError('Config file must be a "path_dataset" entry')
    if 'algorithms' not in config: 
        raise ValueError('Config file must be a "algorithm" entry')
    if len(config['algorithms']) == 0:
        raise ValueError('At least one algorhtm should be specified')
    
    results = get_data_frame(config['path_results'])

    current_results = one_class_learning(X, y, classifier, split_type="cross-validation", number_trials=10,      number_examples=10)
    print(current_results)
    results = results.append(current_results, ignore_index=True)
    results.to_csv(config['path_results'], index=False)


# Área de Testes

In [285]:
config = {
    'path_dataset': '/home/rafael/Downloads/iris.csv',
    'path_results': '/home/rafael/Área de Trabalho/Projetos/TextCategorizationToolPython/saida/resultados_teste.csv',
    'split_type': 'random',
    'algorithms': [
        {
            'name': 'LocalOutlierFactor',
            'parameters': {}
        }
    ]
}

In [292]:
execute_exp(X,y,classifier,config)

[{'Algorithm': 'LocalOutlierFactor', 'Parameters': {'algorithm': 'auto', 'contamination': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 20, 'novelty': True, 'p': 2}, 'Class': 'Setosa', 'It_Number': 0, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'ROC_AUC': 1.0, 'Building_Time': 0, 'Confusion_Matrix': [[100, 0], [0, 5]], 'Classification_Time': 9.675025939941406e-07, 'Memory': 0.0546875}, {'Algorithm': 'LocalOutlierFactor', 'Parameters': {'algorithm': 'auto', 'contamination': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 20, 'novelty': True, 'p': 2}, 'Class': 'Setosa', 'It_Number': 1, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'ROC_AUC': 1.0, 'Building_Time': 0, 'Confusion_Matrix': [[100, 0], [0, 5]], 'Classification_Time': 8.189678192138672e-07, 'Memory': 0.0546875}, {'Algorithm': 'LocalOutlierFactor', 'Parameters': {'algorithm': 'auto', 'cont

In [None]:
df = pd.read_csv('/home/rafael/Downloads/iris.csv')
data = df.to_numpy()


In [244]:
classifier.__class__.__name__

'LocalOutlierFactor'

In [242]:
teste= str(classifier.__class__)

In [243]:
teste

"<class 'sklearn.neighbors._lof.LocalOutlierFactor'>"

In [8]:
X = data[:,:-1]
y = data[:,-1]

In [268]:
algorithm = 'LocalOutlierFactor'
classifier = algorithms[algorithm]( novelty=True)
classifier.name = algorithm
#classifier = OneClassSVM()

In [16]:
kf = KFold(n_splits=3)

In [76]:
params['metric'] = 'euclidean'
params

{'algorithm': 'auto',
 'contamination': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 20,
 'novelty': True,
 'p': 2}

In [79]:
classifier.set_params(**params)

LocalOutlierFactor(algorithm='auto', contamination='auto', leaf_size=30,
                   metric='euclidean', metric_params=None, n_jobs=None,
                   n_neighbors=20, novelty=True, p=2)

In [80]:
classifier.get_params()

{'algorithm': 'auto',
 'contamination': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 20,
 'novelty': True,
 'p': 2}

In [117]:
a = np.array([1,2,3,4,5,6,7,8,9,10])
np.random.choice(a, size=9, replace=False)

array([ 9, 10,  6,  2,  8,  1,  5,  3,  4])

In [None]:
roc_auc_score(y_test, predictions, average=None)