## Detecció de Outliders

In [17]:
import pandas as pd

file = "../res/dataset_final.csv"
df = pd.read_csv(file)
df.set_index('_id', inplace=True)
df

Unnamed: 0_level_0,ano,almacenamiento,marca,pantalla_in,pantalla_tipo,velocidad_cpu_ghz,ram,grosor,peso,ancho_px,alto_px,bateria,promedio_valoraciones,precio_anterior,precio_actual
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
B00IRZ8EQC,2013.0,256.0,samsung,4.30,lcd,1.7,1.0,1.27,77.11,540.0,960.0,5000.0,279.459459,199.99,245.00
B00J8OA220,2024.0,160.0,inmarsat,2.00,lcd,2.9,2.0,10.16,318.00,240.0,320.0,160.0,0.800000,199.99,933.00
B00JC8MD7Y,2024.0,256.0,samsung,6.70,superamoled,2.2,6.0,0.77,179.00,1080.0,2400.0,4500.0,1761.818182,279.99,279.99
B00TKALUDC,2015.0,128.0,ttfone,2.00,lcd,0.0,4.0,1.80,75.00,1080.0,2400.0,800.0,168.571429,45.99,41.99
B00TUXHZTW,2015.0,128.0,doro,2.00,lcd,1.2,12.0,1.90,74.00,3840.0,2160.0,800.0,74.000000,48.67,48.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B0DH3N1GZ3,2024.0,128.0,honor,6.56,lcd,22.1,4.0,2.57,254.42,1920.0,1080.0,5000.0,1.702128,199.99,99.90
B09QH71RZR,2024.0,128.0,xiaomi,6.43,amoled,2.0,6.0,0.81,179.00,1920.0,1080.0,5000.0,3821.818182,165.39,127.00
B0DK4NBZWB,2024.0,256.0,oppo,6.67,amoled,3.4,8.0,2.57,254.42,1080.0,2400.0,5000.0,2.291667,199.99,299.00
B0D8TCGKGN,2024.0,256.0,oukitel,6.52,hd,1.6,24.0,2.57,254.42,1280.0,720.0,10600.0,149.772727,199.99,149.99


In [18]:
df.shape[0]

941

In [19]:
import pandas as pd
import numpy as np
import os
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore, chi2
from sklearn.ensemble import IsolationForest
from scipy.spatial.distance import mahalanobis

# Crear una carpeta para guardar los archivos CSV
ouliders_path = 'outliers_results/outliders'
data_path = 'outliers_results/datasets'
os.makedirs(ouliders_path, exist_ok=True)
os.makedirs(data_path, exist_ok=True)

def detect_outliers_iqr(data, column, threshold=1.5):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers

def detect_outliers_zscore(data, column, threshold=3):
    data['z_score'] = zscore(data[column])
    outliers = data[abs(data['z_score']) > threshold]
    return outliers.drop(columns=['z_score'])

def detect_outliers_isolation_forest(data, columns, contamination=0.1):
    model = IsolationForest(contamination=contamination, random_state=42)
    data['anomaly'] = model.fit_predict(data[columns])
    outliers = data[data['anomaly'] == -1]
    return outliers.drop(columns=['anomaly'])

def detect_outliers_dbscan(data, columns, eps=0.5, min_samples=5):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data[columns])
    model = DBSCAN(eps=eps, min_samples=min_samples)
    data['cluster'] = model.fit_predict(scaled_data)
    outliers = data[data['cluster'] == -1]
    return outliers.drop(columns=['cluster'])

def detect_outliers_mahalanobis(data, columns, threshold=0.95):
    cov_matrix = data[columns].cov().values
    inv_cov_matrix = np.linalg.inv(cov_matrix)
    mean = data[columns].mean().values

    def calculate_mahalanobis(row):
        diff = row - mean
        return np.sqrt(np.dot(np.dot(diff, inv_cov_matrix), diff.T))

    data['mahalanobis'] = data[columns].apply(calculate_mahalanobis, axis=1)
    chi2_threshold = chi2.ppf(threshold, df=len(columns))
    outliers = data[data['mahalanobis'] > chi2_threshold]
    return outliers.drop(columns=['mahalanobis'])

def get_outliers_datasets(data, numeric_columns):
    outliers_datasets = {}

    # IQR
    outliers_iqr = pd.DataFrame()
    for col in numeric_columns:
        outliers_iqr = pd.concat([outliers_iqr, detect_outliers_iqr(data, col)])
    outliers_datasets['IQR'] = outliers_iqr.drop_duplicates()

    # Z-Score
    outliers_zscore = pd.DataFrame()
    for col in numeric_columns:
        outliers_zscore = pd.concat([outliers_zscore, detect_outliers_zscore(data, col)])
    outliers_datasets['Z-Score'] = outliers_zscore.drop_duplicates()

    # Isolation Forest
    outliers_if = detect_outliers_isolation_forest(data, numeric_columns)
    outliers_datasets['Isolation Forest'] = outliers_if

    # DBSCAN
    outliers_dbscan = detect_outliers_dbscan(data, numeric_columns)
    outliers_datasets['DBSCAN'] = outliers_dbscan

    # Mahalanobis
    outliers_mahalanobis = detect_outliers_mahalanobis(data, numeric_columns)
    outliers_datasets['Mahalanobis'] = outliers_mahalanobis

    return outliers_datasets

def export_to_csv(outliers_datasets, folder):
    for method, df in outliers_datasets.items():
        file_path = os.path.join(folder, f"{method}_outliers.csv")
        df.to_csv(file_path, index=False)
        print(f"Archivo exportado: {file_path}")

def export_clean_data(data, outliers_datasets, folder):
    for method, outliers_df in outliers_datasets.items():
        # Filtrar el dataset original para excluir los outliers
        clean_data = data[~data.index.isin(outliers_df.index)]
        file_path = os.path.join(folder, f"{method}_clean_data.csv")
        clean_data.to_csv(file_path, index=False)

def show_results(outliers_datasets):
    print("\n--- Panel de Información de Outliers ---")
    for method, df in outliers_datasets.items():
        print(f"\nMétodo: {method}")
        print(f"Número de outliers detectados: {len(df)}")
        print("Resumen de outliers:")

# Ejemplo de uso
data = df.copy()  # Supongamos que `df` es tu dataset original
numeric_columns = data.select_dtypes(include=['float64']).columns
outliers_datasets = get_outliers_datasets(data, numeric_columns)

# Mostrar el panel de información
show_results(outliers_datasets)

# Exportar los datasets de outliers a CSV
export_to_csv(outliers_datasets, ouliders_path)

# Exportar los datasets limpios (sin outliers) a CSV
export_clean_data(data, outliers_datasets, data_path)


--- Panel de Información de Outliers ---

Método: IQR
Número de outliers detectados: 766
Resumen de outliers:

Método: Z-Score
Número de outliers detectados: 128
Resumen de outliers:

Método: Isolation Forest
Número de outliers detectados: 94
Resumen de outliers:

Método: DBSCAN
Número de outliers detectados: 765
Resumen de outliers:

Método: Mahalanobis
Número de outliers detectados: 1
Resumen de outliers:
Archivo exportado: outliers_results/outliders/IQR_outliers.csv
Archivo exportado: outliers_results/outliders/Z-Score_outliers.csv
Archivo exportado: outliers_results/outliders/Isolation Forest_outliers.csv
Archivo exportado: outliers_results/outliders/DBSCAN_outliers.csv
Archivo exportado: outliers_results/outliders/Mahalanobis_outliers.csv


In [20]:
# df.to_csv("./temp/final.csv")