In [1]:
import pandas as pd
import numpy as np
import time
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from data_reduction.ranking import getPHOutlierScores_restrictedDim as phl_selection
from data_reduction.representativeness import find_epsilon

import seaborn as sns
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
from my_dataset_reduction.phl import getPHOutlierScores_restrictedDim as my_phl_selection, estimate_delta

dbpath = '../datasets/dry+bean+dataset/DryBeanDataset/Dry_Bean_Dataset.xlsx'
target = 'Class'

SEED = 2025
np.random.seed(SEED)

PROFILING = False # False para tomar todo el dataset
results_folder = '../results/compare'

# Check if the results folder exists, if not create it
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

TOL = 1e-2 # Tolerancia para ver si los scores son iguales

In [2]:
df = pd.read_excel(dbpath, )

pd.set_option('display.max_columns', None)
print("Tamaño del dataset: ", df.shape)
df.describe()

Tamaño del dataset:  (13611, 17)


Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
count,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0,13611.0
mean,53048.284549,855.283459,320.141867,202.270714,1.583242,0.750895,53768.200206,253.06422,0.749733,0.987143,0.873282,0.799864,0.006564,0.001716,0.64359,0.995063
std,29324.095717,214.289696,85.694186,44.970091,0.246678,0.092002,29774.915817,59.17712,0.049086,0.00466,0.05952,0.061713,0.001128,0.000596,0.098996,0.004366
min,20420.0,524.736,183.601165,122.512653,1.024868,0.218951,20684.0,161.243764,0.555315,0.919246,0.489618,0.640577,0.002778,0.000564,0.410339,0.947687
25%,36328.0,703.5235,253.303633,175.84817,1.432307,0.715928,36714.5,215.068003,0.718634,0.98567,0.832096,0.762469,0.0059,0.001154,0.581359,0.993703
50%,44652.0,794.941,296.883367,192.431733,1.551124,0.764441,45178.0,238.438026,0.759859,0.988283,0.883157,0.801277,0.006645,0.001694,0.642044,0.996386
75%,61332.0,977.213,376.495012,217.031741,1.707109,0.810466,62294.0,279.446467,0.786851,0.990013,0.916869,0.83427,0.007271,0.00217,0.696006,0.997883
max,254616.0,1985.37,738.860153,460.198497,2.430306,0.911423,263261.0,569.374358,0.866195,0.994677,0.990685,0.987303,0.010451,0.003665,0.974767,0.999733


In [3]:
if PROFILING:
    df = df.sample(1000, random_state=SEED)

In [4]:
# Train test split
X = df.drop(columns=[target])
X = np.array(X)

# Convertir la columna target a entero
if not pd.api.types.is_integer_dtype(df[target]):
    le = LabelEncoder()
    y = le.fit_transform(df[target])
else:
    y = df[target].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2025)

In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Get number of super-outliers and mean neighbors with k=3 and with delta=0.05
from my_dataset_reduction.phl import get_mean_neighbors, estimate_delta, get_super_outliers
print(f"Dry Bean dataset size: {len(X)}")
print(f"Train dataset size: {len(X_train_scaled)}")
k = 3
delta = estimate_delta(X_train_scaled, y_train, k)
print(f"Delta estimated: {delta}")
# Get the number of super-outliers
mean_neighbors = get_mean_neighbors(X_train_scaled, y_train, delta)
print(f"Mean neighbors with k={k} and delta={delta}: {mean_neighbors}") 
# Get the number of super-outliers
num_super_outliers = get_super_outliers(X_train_scaled, y_train, delta)
print(f"Number of super-outliers with k={k} and delta={delta}: {num_super_outliers}")

k = 0
delta = 0.05
# minmax scaling
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
X_train_scaled_2 = minmax_scaler.fit_transform(X_train)

# Get the number of super-outliers
mean_neighbors = get_mean_neighbors(X_train_scaled_2, y_train, delta)
print(f"Mean neighbors with the approach of SGAI article: {mean_neighbors}") 
# Get the number of super-outliers
num_super_outliers = get_super_outliers(X_train_scaled_2, y_train, delta)
print(f"Number of super-outliers with the approach of SGAI article: {num_super_outliers} out of {len(X)}")

Dry Bean dataset size: 13611
Train dataset size: 10888
Delta estimated: 0.5243319586361499
Mean neighbors with k=3 and delta=0.5243319586361499: 8.996877296105804
Number of super-outliers with k=3 and delta=0.5243319586361499: 4688
Mean neighbors with the approach of SGAI article: 2.565025716385011
Number of super-outliers with the approach of SGAI article: 6723 out of 13611


In [7]:
reduction_methods = ['PHL', 'MY_PHL']
reps = 1 #2
deltas = [0.05, 0.1, 0.25]
metrics = ['time']

df_compare_phl = pd.DataFrame(columns=['delta', 'k'] + ['time_' + m for m in reduction_methods])

if os.path.exists(results_folder + '/compare_phl.csv'):
    df_compare_phl = pd.read_csv(results_folder + '/compare_phl.csv')
else:
    for delta in deltas:
        for n in range(reps):
            print(f"Reducing dataset delta: {delta}, iteration: {n}")
            # Reduce the dataset using PHL method
            t0 = time.time()
            outlier_scores_point_cloud_original_order,set_of_super_outliers, super_outlier_indices = phl_selection(X_train_scaled, delta, 1)
            reduction_time = time.time() - t0
            print(f"PHL: {reduction_time:.2f} seconds")

            # Reduce the dataset using my PHL method
            t0 = time.time()
            my_outlier_scores_point_cloud_original_order,my_set_of_super_outliers, my_super_outlier_indices = my_phl_selection(X_train_scaled, delta, 1)
            reduction_time_my = time.time() - t0

            print(f"MY_PHL: {reduction_time_my:.2f} s")

            # Store the results
            df_compare_phl = df_compare_phl.append({
                        'delta': delta,
                        'k': 'N/A',
                        'time_PHL' : reduction_time,
                        'time_MY_PHL' : reduction_time_my,
                        'equal' : np.allclose(outlier_scores_point_cloud_original_order, my_outlier_scores_point_cloud_original_order, atol=TOL),
                    }, ignore_index=True)
    # Save the results
    df_compare_phl.to_csv(f'{results_folder}/phl_results.csv', index=False)

Reducing dataset delta: 0.05, iteration: 0
PHL: 40.06 seconds
MY_PHL: 3.35 s
Reducing dataset delta: 0.1, iteration: 0




PHL: 39.85 seconds
MY_PHL: 2.73 s
Reducing dataset delta: 0.25, iteration: 0




PHL: 40.79 seconds
MY_PHL: 3.03 s


In [8]:
df_compare_phl

Unnamed: 0,delta,k,time_PHL,time_MY_PHL,equal
0,0.05,,40.056248,3.351258,1.0
1,0.1,,39.854993,2.725599,1.0
2,0.25,,40.791791,3.032668,1.0


In [9]:
k_values = [2, 3, 5, 7, 10]
for k in k_values:
            print(f"Reducing dataset k: {k}, iteration: {n}")
            # Estimate delta for the given k
            t0 = time.time()
            delta = estimate_delta(X_train_scaled, y_train, k)
            delta_time = time.time() - t0
            # Reduce the dataset using PHL method
            t0 = time.time()
            outlier_scores_point_cloud_original_order,set_of_super_outliers, super_outlier_indices = phl_selection(X_train_scaled, delta, 1)
            reduction_time = time.time() - t0
            print(f"PHL: {reduction_time:.2f} seconds")

            # Reduce the dataset using my PHL method
            t0 = time.time()
            my_outlier_scores_point_cloud_original_order,my_set_of_super_outliers, my_super_outlier_indices = my_phl_selection(X_train_scaled, delta, 1)
            reduction_time_my = time.time() - t0

            print(f"MY_PHL: {reduction_time_my:.2f} s")

            # Store the results
            df_compare_phl = df_compare_phl.append({
                        'delta': delta,
                        'k': k,
                        'time_PHL' : reduction_time,
                        'time_MY_PHL' : reduction_time_my,
                        'equal' : np.allclose(outlier_scores_point_cloud_original_order, my_outlier_scores_point_cloud_original_order, atol=TOL),
                    }, ignore_index=True)

Reducing dataset k: 2, iteration: 0




PHL: 42.38 seconds
MY_PHL: 4.15 s
Reducing dataset k: 3, iteration: 0




PHL: 43.10 seconds
MY_PHL: 4.84 s
Reducing dataset k: 5, iteration: 0




PHL: 49.64 seconds
MY_PHL: 7.31 s
Reducing dataset k: 7, iteration: 0




PHL: 59.09 seconds
MY_PHL: 10.00 s
Reducing dataset k: 10, iteration: 0




PHL: 76.86 seconds
MY_PHL: 15.45 s


In [10]:
df_compare_phl

Unnamed: 0,delta,k,time_PHL,time_MY_PHL,equal
0,0.05,,40.056248,3.351258,1.0
1,0.1,,39.854993,2.725599,1.0
2,0.25,,40.791791,3.032668,1.0
3,0.479215,2.0,42.378593,4.147586,1.0
4,0.524332,3.0,43.09822,4.835766,1.0
5,0.585827,5.0,49.635712,7.310532,1.0
6,0.627999,7.0,59.092936,10.000647,1.0
7,0.6753,10.0,76.858532,15.446491,1.0
