<a href="https://colab.research.google.com/github/rtrochepy/astronomer/blob/main/tst_data_v0001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# !pip install tqdm
# !pip install imbalanced-learn



In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from joblib import dump, load
from scipy.stats import skew, kurtosis
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN
import os
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import numpy as np

In [4]:
# Silence warnings for cleaner cell output.
# NOTE(review): the filter has no category/module argument, so it suppresses
# every warning in the process, not only the ones raised by pandas.
warnings.filterwarnings('ignore')

# Configure pandas to show all columns when a DataFrame is printed
pd.set_option('display.max_columns', None)

In [5]:
# Read the CSV file from the working directory.
# Each failure mode prints its specific message and then re-raises: swallowing
# the exception would leave `df` undefined, and every later cell would fail
# with an unrelated NameError instead of the real cause.
try:
    df = pd.read_csv("data_labels.csv")
except FileNotFoundError:
    print("Error: El archivo 'data_labels.csv' no se encuentra.")
    raise
except pd.errors.EmptyDataError:
    print("Error: El archivo 'data_labels.csv' está vacío.")
    raise
except pd.errors.ParserError:
    print("Error: Error al analizar el archivo 'data_labels.csv'.")
    raise

In [6]:
# Check how many rows and columns were loaded (90009 rows, 191 columns)
df.shape

(90009, 191)

In [7]:
print(f"Filas cargadas: {len(df)}")

Filas cargadas: 90009


In [8]:
def handle_missing_values(df, threshold=0.5):
    """Drop sparse columns and fill the remaining gaps in numeric columns.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        threshold: Maximum share of null values (0-1) a column may have and
            still be kept. Columns with a larger share are dropped.

    Returns:
        A new DataFrame in which the numeric columns are filled according to
        their null percentage: below 10% with the column mean, from 10% up to
        30% with the column median, and 30% or more with ``IterativeImputer``
        fitted on those columns only. Non-numeric columns are not filled.
    """
    # With no rows there is no null share to compute and nothing to fill.
    if len(df) == 0:
        return df.copy()

    # Compare the null share directly with the threshold. Passing
    # `threshold * len(df)` as `dropna(thresh=...)` would treat the value as a
    # minimum share of NON-null entries, which only matches at 0.5.
    keep_mask = (df.isna().mean() <= threshold).to_numpy()
    df = df.loc[:, keep_mask].copy()

    # Identify numeric columns and their percentage of nulls
    numerical_columns = df.select_dtypes(include=['number']).columns
    null_percentage = df[numerical_columns].isnull().mean() * 100

    # Bucket the columns by null percentage
    low_null = null_percentage[null_percentage < 10].index
    mid_null = null_percentage[(null_percentage >= 10) & (null_percentage < 30)].index
    high_null = null_percentage[null_percentage >= 30].index

    # Fill each bucket; emptiness is tested with len() because Index.any()
    # evaluates the truthiness of the labels, not whether the index is empty.
    if len(low_null) > 0:
        df[low_null] = df[low_null].fillna(df[low_null].mean())
    if len(mid_null) > 0:
        df[mid_null] = df[mid_null].fillna(df[mid_null].median())
    if len(high_null) > 0:
        imputer = IterativeImputer()
        df[high_null] = imputer.fit_transform(df[high_null])

    return df

df = handle_missing_values(df)


In [9]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def binarize_levels_with_missing(value):
    """Return 0 for the 'Missing' placeholder and 1 for any other value."""
    return 0 if value == 'Missing' else 1

def process_categorical(df, categorical_columns, binary_columns,
                        binarize_columns=('Infraction_CLH', 'Base_67254', 'Infraction_TEN')):
    """Fill, binarise and encode the categorical columns of ``df``.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        categorical_columns: Columns whose nulls are replaced by 'Missing' and
            which are then one-hot encoded. Names absent from ``df`` are skipped.
        binary_columns: Columns to label-encode after one-hot encoding. Names
            absent from the frame at that point are skipped.
        binarize_columns: Columns collapsed to 0/1 with
            ``binarize_levels_with_missing`` before one-hot encoding.

    Returns:
        A new DataFrame with the encoded columns.

    Note:
        ``pd.get_dummies`` replaces every encoded column by its dummy columns,
        so a name listed in both ``categorical_columns`` and ``binary_columns``
        no longer exists when label encoding runs and is left one-hot encoded.
    """
    # Work on a copy so the fills below do not leak into the caller's frame.
    df = df.copy()

    # Skip requested columns that are not in the frame, matching the
    # membership guard the binarisation and label-encoding loops already use.
    present_categoricals = [col for col in categorical_columns if col in df.columns]

    # Fill nulls with 'Missing', registering it as a category first when needed
    for col in present_categoricals:
        if df[col].dtype.name == 'category':
            if 'Missing' not in df[col].cat.categories:
                df[col] = df[col].cat.add_categories(['Missing'])
            df[col] = df[col].fillna('Missing')
        else:
            df[col] = df[col].fillna('Missing').astype('category')

    # Custom binning: 'Missing' -> 0, anything else -> 1
    for col in binarize_columns:
        if col in df.columns:
            df[col] = df[col].apply(binarize_levels_with_missing)

    # One-hot encoding
    df = pd.get_dummies(df, columns=present_categoricals, drop_first=False)

    # Label encoding, with a fresh encoder per column
    for col in binary_columns:
        if col in df.columns:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    return df

# Example usage (make sure `df` holds a valid DataFrame)
# Columns to null-fill with 'Missing' and one-hot encode.
categorical_columns = ['Infraction_YFSG', 'Infraction_DQLY', 'Infraction_CLH',
                       'Base_67254', 'Infraction_TEN', 'Base_8730', 'Base_23737',
                       'Infraction_NMCB', 'Infraction_ZRH', 'Infraction_WIS', 'Infraction_WMAQ']
# Columns intended for label encoding.
# NOTE(review): every name here is also in categorical_columns, so each one is
# replaced by its one-hot columns before the label-encoding loop runs and that
# loop finds nothing to encode — confirm which encoding is intended.
binary_columns = ['Base_23737', 'Infraction_NMCB', 'Infraction_ZRH', 'Infraction_WIS']

# Run the categorical preprocessing on the DataFrame
df = process_categorical(df, categorical_columns, binary_columns)

In [10]:
def handle_outliers(df, numeric_columns, iqr_multiplier=1.5):
    """Clip the given columns to their interquartile-range fences.

    For each column the fences are ``Q1 - iqr_multiplier * IQR`` and
    ``Q3 + iqr_multiplier * IQR``; values outside them are set to the fence.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        numeric_columns: Labels of the columns to clip.
        iqr_multiplier: Width of the fences in IQR units.

    Returns:
        A new DataFrame with the selected columns clipped. Columns whose IQR is
        zero or undefined are returned unchanged.
    """
    columns = list(numeric_columns)
    result = df.copy()
    if not columns:
        return result

    q1 = result[columns].quantile(0.25)
    q3 = result[columns].quantile(0.75)
    iqr = q3 - q1

    for col in columns:
        spread = iqr[col]
        # A zero spread makes both fences equal to Q1, so clipping would
        # overwrite every value with that constant (e.g. a 0/1 column where
        # one value covers more than 75% of the rows). NaN spread is skipped too.
        if not spread > 0:
            continue
        lower_limit = q1[col] - iqr_multiplier * spread
        upper_limit = q3[col] + iqr_multiplier * spread
        result[col] = result[col].clip(lower_limit, upper_limit)

    return result

# Clip outliers on the numeric feature columns only. The target column 'label'
# is excluded: clipping it by its own IQR fences can rewrite class values.
numeric_columns = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('label', errors='ignore')
)
df = handle_outliers(df, numeric_columns)

In [12]:
# Convert the 'Expenditure_AHF' column to datetime
df['Expenditure_AHF'] = pd.to_datetime(df['Expenditure_AHF'])

In [None]:
# Conversión de columnas no numéricas, ejemplo para fechas
# df['Expenditure_AHF'] = pd.to_datetime(df['Expenditure_AHF'])
# df['Expenditure_AHF'] = (df['Expenditure_AHF'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1d')

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN
import pandas as pd
import numpy as np

def balance_data(X, y, method='ADASYN', random_state=42):
    """Resample ``X`` and ``y`` with the selected imbalanced-learn sampler.

    Args:
        X: Feature matrix passed to the sampler's ``fit_resample``.
        y: Target vector passed to the sampler's ``fit_resample``.
        method: 'ADASYN' or 'SMOTEENN'.
        random_state: Seed forwarded to the sampler.

    Returns:
        The ``(X_resampled, y_resampled)`` pair returned by ``fit_resample``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'ADASYN':
        sampler = ADASYN(random_state=random_state)
    elif method == 'SMOTEENN':
        sampler = SMOTEENN(random_state=random_state)
    else:
        raise ValueError("Método de balanceo no soportado.")

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

# Convert non-numeric columns and verify the result before casting to float
def preprocess_dataframe(df):
    """Return an all-float copy of ``df``.

    Steps:
        1. Columns selected as ``datetime64`` are converted to whole days
           since 1970-01-01.
        2. ``pd.get_dummies(..., drop_first=True)`` one-hot encodes the columns
           it treats as categorical.
        3. Any column that is still neither numeric nor boolean raises.
        4. Everything is cast to ``float``.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.

    Returns:
        A new DataFrame whose columns are all ``float``.

    Raises:
        ValueError: If a column is still non-numeric after steps 1 and 2.
    """
    # Copy first so the datetime conversion does not rewrite the caller's frame.
    df = df.copy()

    # Convert dates to numbers (days since 1970-01-01)
    for col in df.select_dtypes(include=['datetime64']).columns:
        df[col] = (df[col] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1d')

    # One-hot encode the categorical columns
    df = pd.get_dummies(df, drop_first=True)

    # Validate BEFORE the cast: once astype(float) has succeeded every column
    # is float, so a check placed after it can never trigger. Boolean dummy
    # columns are allowed because they cast cleanly to 0.0 / 1.0.
    non_numeric_cols = df.select_dtypes(exclude=[np.number, 'bool']).columns
    if len(non_numeric_cols) > 0:
        raise ValueError(f"Existen columnas no numéricas: {list(non_numeric_cols)}")

    # Cast every column to float
    return df.astype(float)

# Convert the date column and make the whole DataFrame numeric
df['Expenditure_AHF'] = pd.to_datetime(df['Expenditure_AHF'])
df = preprocess_dataframe(df)

# Separate features and target
X = df.drop(columns=['label'])
y = df['label']

# Train/test split. Stratify on the target so both partitions keep the same
# class proportions; with an imbalanced label a plain random split can leave
# the test set with a different minority share than the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Balance the training data only; the test set keeps its original distribution
X_train_balanced, y_train_balanced = balance_data(X_train, y_train, method='ADASYN')