In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest

#### Detección de valores atípicos

In [2]:
# Load the input table for this notebook. The path is relative to the working directory;
# judging by the file name this is presumably the output of the previous step — TODO confirm.
df_data = pd.read_csv('process_dataset/02_second_result.csv')

In [3]:
def detect_outliers_iqr(df, factor=1.5):
    """Flag univariate outliers in each numeric column using the IQR fence rule.

    For every ``float64``/``int64`` column, a value is reported when it lies
    strictly below ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``,
    where ``Q1``/``Q3`` are the column's 25th/75th percentiles and
    ``IQR = Q3 - Q1``. Missing values are never reported because comparisons
    against NaN evaluate to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Columns of any other dtype are ignored.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the lower and upper fences.

    Returns
    -------
    pandas.DataFrame
        One row per flagged value, with columns ``'Column'`` (name of the
        source column) and ``'Outlier Value'`` (the flagged value). Both
        columns are present even when nothing is flagged, so callers can rely
        on the schema of an empty result.
    """
    result_columns = ['Column', 'Outlier Value']
    records = []

    for column in df.select_dtypes(include=['float64', 'int64']):
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # Mask the Series directly instead of filtering the whole frame and
        # then picking the column (avoids chained indexing).
        flagged = values[(values < lower_bound) | (values > upper_bound)]
        records.extend(
            {'Column': column, 'Outlier Value': value} for value in flagged
        )

    # Passing `columns` explicitly keeps the schema stable when `records` is empty;
    # without it an empty list yields a frame with no columns at all.
    return pd.DataFrame(records, columns=result_columns)

In [4]:
def filter_isolation_forest(df, contamination=0.1, random_state=42):
    """Label each row as inlier or outlier with an Isolation Forest.

    The model is fitted on the ``float64``/``int64`` columns of ``df`` and then
    used to predict a label for those same rows. Despite the name, no rows are
    removed here: the caller filters on the returned ``'outlier'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    contamination : float or 'auto', default 0.1
        Forwarded to ``IsolationForest``. NOTE(review): with a float value the
        share of rows labelled -1 is driven by this setting, so the number of
        "outliers" found is largely an input, not a finding.
    random_state : int, default 42
        Forwarded to ``IsolationForest`` so repeated runs give the same labels.

    Returns
    -------
    pandas.DataFrame
        The numeric columns, followed by an ``'outlier'`` column (-1 for
        outliers, 1 for inliers), followed by every remaining column of ``df``
        in its original relative order.
    """
    numeric_dtypes = ['float64', 'int64']
    df_numeric = df.select_dtypes(include=numeric_dtypes)
    # Take "everything else" rather than only `object` columns, so bool,
    # category, datetime or dedicated string-dtype columns are not silently
    # dropped from the result.
    df_other = df.select_dtypes(exclude=numeric_dtypes)

    model = IsolationForest(contamination=contamination, random_state=random_state)
    model.fit(df_numeric)

    # predict() returns -1 for outliers and 1 for inliers.
    labels = model.predict(df_numeric)

    # assign() builds a new frame instead of writing a column into the
    # select_dtypes() result in place.
    df_flagged = df_numeric.assign(outlier=labels)

    return pd.concat([df_flagged, df_other], axis=1)


##### Exploración de outliers con método local

In [5]:
outliers_detected = detect_outliers_iqr(df_data) # Detect outliers in the numeric columns
outliers_detected

No se identifican outliers con el método IQR (regla de 1,5 × IQR aplicada a cada columna numérica).

##### Detección de outliers con método global

In [6]:
# Fit the Isolation Forest on df_data; the returned frame carries an extra 'outlier'
# label column (-1 for outliers, 1 for inliers). No rows are removed at this point.
df_outliers = filter_isolation_forest(df_data)

In [7]:
# Select the rows the model labelled -1 (outliers) and display them for inspection.
outliers = df_outliers[df_outliers['outlier'] == -1]
outliers

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,...,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic,Diastolic,outlier,Sex,Diet,Continent
2,85,201,105,0,0,1,1,12.466069,1,1,...,382,1,10,1,107,96,-1,Male,Unhealthy,South America
37,84,296,45,1,1,0,1,18.625656,0,1,...,507,2,6,0,92,62,-1,Female,Healthy,Asia
60,83,218,47,0,0,1,0,0.411055,0,0,...,762,7,9,1,101,86,-1,Female,Average,Africa
63,29,135,53,0,1,1,0,4.784619,0,1,...,84,0,5,1,164,97,-1,Female,Average,Europe
64,25,197,45,0,1,0,1,18.848526,1,1,...,661,0,6,0,178,72,-1,Male,Unhealthy,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6204,42,395,109,1,0,0,0,4.927777,0,1,...,605,7,4,1,109,109,-1,Male,Unhealthy,Asia
6214,23,184,96,0,1,1,1,17.825916,0,1,...,64,0,5,1,149,106,-1,Female,Healthy,South America
6239,32,385,48,0,0,0,0,10.888745,0,1,...,772,3,9,1,112,88,-1,Male,Average,Africa
6241,73,384,46,0,0,1,0,0.429594,0,1,...,102,1,8,1,142,66,-1,Female,Unhealthy,South America


In [8]:
# Count rows per label (1 = inlier, -1 = outlier).
# NOTE(review): the share of -1 rows is expected to track the contamination=0.1 setting
# used when fitting the model, so it reflects that parameter rather than the data alone.
df_outliers['outlier'].value_counts()

outlier
 1    5650
-1     628
Name: count, dtype: int64

In [9]:
# Keep every row not labelled -1 by the model, then discard the helper label column.
df_clean = (
    df_outliers
    .loc[df_outliers['outlier'] != -1]
    .drop(columns='outlier')
)

In [10]:
# Display the cleaned frame: rows labelled -1 are gone and the 'outlier' column is dropped.
df_clean

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,...,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic,Diastolic,Sex,Diet,Continent
0,19,143,51,1,0,0,0,16.978907,0,1,...,39.348247,208,7,8,0,138,85,Female,Unhealthy,Europe
1,33,170,46,1,1,0,1,2.210484,0,0,...,33.982475,680,1,9,0,166,101,Male,Average,Europe
3,37,150,87,0,0,0,1,7.893431,1,1,...,34.952897,359,0,4,0,111,67,Female,Healthy,Asia
4,18,233,74,0,1,0,1,18.571328,1,1,...,39.428916,704,4,5,0,136,93,Female,Healthy,Asia
5,35,376,88,0,0,1,1,6.229191,0,0,...,18.776165,200,5,9,0,146,82,Male,Unhealthy,South America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6273,80,151,108,1,0,1,0,1.617427,1,0,...,36.497506,651,0,7,0,160,74,Male,Healthy,South America
6274,27,396,79,1,0,0,0,4.601646,0,1,...,21.534604,754,0,10,0,129,108,Female,Average,South America
6275,44,339,85,1,0,1,1,14.725646,0,0,...,21.402346,269,1,8,0,177,94,Female,Healthy,Europe
6276,70,123,76,0,1,0,1,2.800186,1,0,...,22.434137,540,2,8,0,157,98,Male,Average,Australia


In [11]:
# Write the cleaned data to disk; index=False keeps the row index out of the CSV.
df_clean.to_csv('process_dataset/03_third_result.csv', index=False)