<a href="https://colab.research.google.com/github/rtrochepy/astronomer/blob/main/best_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from joblib import dump, load
from scipy.stats import skew, kurtosis
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN
import os
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import numpy as np

In [4]:
# Ignorar advertencias de pandas para una ejecución más limpia
warnings.filterwarnings('ignore')

# Configuración de pandas para mostrar todas las columnas al imprimir
pd.set_option('display.max_columns', None)

In [5]:
# Lee el archivo CSV.  Error handling mejorado.
try:
    df = pd.read_csv("test_data.csv")
except FileNotFoundError:
    print("Error: El archivo 'data_labels.csv' no se encuentra.")
except pd.errors.EmptyDataError:
    print("Error: El archivo 'data_labels.csv' está vacío.")
except pd.errors.ParserError:
    print("Error: Error al analizar el archivo 'data_labels.csv'.")

In [6]:
# Ver los nulos que hay en cada columna
nulos_por_columna = df.isnull().sum()

# Filtrar solo las columnas que tienen valores nulos
nulos_por_columna = nulos_por_columna[nulos_por_columna > 0]
nulos_por_columna

Unnamed: 0,0
Payment_6804,399
Base_80863,15
Expenditure_JIG,11632
Infraction_SNZ,15
Base_02683,15
...,...
Infraction_QGR,968
Infraction_ZTLC,49935
Infraction_LSX,968
Infraction_IBJ,413


In [7]:
# Calcula el umbral para filtrar las columnas
umbral = 0.5 * df.shape[0]

# Elimina las columnas con más del 50% de valores faltantes
df = df.dropna(thresh=umbral, axis=1)
df

Unnamed: 0,ID,Expenditure_AHF,Payment_6804,Infraction_CGP,Base_7744,Base_80863,Risk_1930,Expenditure_JIG,Infraction_SNZ,Base_02683,Infraction_ZWWJ,Infraction_QJJF,Base_76065,Infraction_EJZ,Base_6872,Risk_0322,Infraction_FMXQ,Infraction_GGO,Infraction_TLPJ,Base_1165,Base_39598,Base_6187,Infraction_ZTNC,Base_85131,Risk_9995,Infraction_AYWV,Payment_22507,Base_9516,Expenditure_YTR,Base_36384,Expenditure_FIP,Infraction_PAS,Risk_0003,Expenditure_HMO,Base_24406,Expenditure_LMSR,Infraction_BSU,Base_14808,Risk_8065,Infraction_ZYW,Base_1039,Infraction_HSSU,Infraction_EHZP,Infraction_TBP,Base_0580,Expenditure_RGD,Infraction_PBC,Infraction_YFSG,Infraction_DQLY,Infraction_AQO,Base_0229,Base_69608,Base_91828,Base_6852,Infraction_CLH,Expenditure_IDZ,Risk_1475,Expenditure_BWX,Base_8511,Infraction_JYZB,Base_22178,Infraction_ZTYG,Infraction_ZVW,Infraction_EYU,Expenditure_UWVG,Base_3041,Payment_3207,Infraction_QKZN,Infraction_CZE,Base_65352,Risk_7095,Infraction_JBR,Base_66195,Base_36516,Infraction_RXQH,Infraction_HFU,Risk_6346,Expenditure_HRQ,Infraction_VTR,Risk_2102,Risk_4804,Base_7331,Infraction_XWX,Expenditure_XDD,Risk_4553,Base_67585,Risk_8742,Infraction_VHU,Risk_4247,Risk_2380,Infraction_GSS,Risk_0454,Base_8730,Expenditure_HKXV,Infraction_MHM,Risk_4160,Risk_3506,Base_23737,Expenditure_GCAO,Risk_9367,Base_7910,Expenditure_GMC,Risk_9423,Risk_6977,Base_9103,Infraction_KSBR,Risk_6178,Risk_6197,Infraction_NRBQ,Infraction_WVC,Infraction_QVSL,Infraction_QXUM,Risk_8532,Risk_9247,Infraction_IMIM,Expenditure_UIWS,Expenditure_ONEG,Expenditure_MTRQ,Expenditure_LAHK,Expenditure_HPM,Infraction_LTIS,Infraction_HFSI,Infraction_ETH,Infraction_SDWM,Base_5441,Base_2810,Risk_8902,Base_67254,Infraction_PTY,Infraction_BGGU,Base_4569,Expenditure_BEH,Infraction_LMHK,Infraction_NMCB,Infraction_TPAF,Infraction_ZRH,Infraction_TEN,Infraction_XEPQ,Infraction_ZMKI,Infraction_WIS,Infraction_RKTA,Infraction_IIZ,Infraction_WVAW,Infraction_KEJT,Infraction_TFOY,Infraction_WMAQ,Infraction_SIA,Infraction_CZXL,Infraction_QEY,Base_52892,Infraction_HUK,Infraction_VHHP,Infraction_LIES,Risk_5270,Infraction_QWWW,Infraction_YQXM,Infraction_QGR,Infraction_LSX,Infraction_IBJ,Infraction_DNOU
0,3333333702723732807074333342320276353770732523...,2017-03-15,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,,0.000630,0.080986,0.708906,0.170600,0.006204,0.358587,0.525351,0.255736,0.063902,0.059416,0.006466,1.335856,0.008207,0.001423,0.207334,0.736463,0.096219,0.023381,0.002768,0.008322,1.001519,0.008298,0.161345,0.148266,0.922998,0.354596,0.118075,0.001882,0.158612,0.018385,0.063646,0.199617,0.308233,0.016361,0.401619,0.091071,CR,O,0.007126,0.007665,0.652984,0.008520,0.004730,very_high,0.272008,0.008363,0.515222,0.002644,0.009013,0.004808,0.008342,0.119403,0.004802,0.108271,0.050882,0.007554,0.080422,0.069067,0.004327,0.007562,,0.007729,0.000272,0.001576,0.004239,0.001434,0.002271,0.004061,0.007121,0.002456,0.002310,0.003532,0.008033,1.009825,0.084683,0.003820,0.007043,0.000438,0.006452,0.000830,0.005055,0.0,0.005720,0.007084,0.000198,0.008907,1,0.002537,0.005177,0.006626,0.009705,0.007782,0.002450,1.001101,0.002665,0.007479,0.006893,1.503673,1.006133,0.003569,0.008871,0.003950,0.003647,0.004950,0.894090,0.135561,0.911191,0.974539,0.001243,0.766688,1.008691,1.004587,0.670041,0.009968,0.004572,1.008949,moderate_low,0.004326,1.007336,0.210060,0.676922,0.007871,1.0,0.238250,0.0,moderate_high,0.232120,0.236266,0.0,0.702280,0.434345,0.003057,0.686516,0.008740,1.0,1.003319,1.007819,1.000080,0.006805,0.002052,0.005972,0.004345,0.001535,0.002427,0.003706,0.003818,0.000569,0.000610,0.002674
1,3333333702723732807074333342320276353770732523...,2017-04-13,0.936665,0.005775,0.004923,1.000653,0.006151,0.126750,0.000798,0.002714,,0.002526,0.069419,0.712795,0.113239,0.006206,0.353630,0.521311,0.223329,0.065261,0.057744,0.001614,1.339794,0.008373,0.001984,0.202778,0.720886,0.099804,0.030599,0.002749,0.002482,1.009033,0.005136,0.140951,0.143530,0.919414,0.326757,0.118737,0.001610,0.148459,0.013035,0.065501,0.151387,0.265026,0.017688,0.406326,0.086805,CR,O,0.002413,0.007148,0.647093,0.002238,0.003879,very_high,0.188970,0.004030,0.509048,0.004193,0.007842,0.001283,0.006524,0.140611,0.000094,0.101018,0.040469,0.004832,0.081413,0.074166,0.004203,0.005304,,0.001864,0.000979,0.009896,0.007597,0.000509,0.009810,0.000127,0.005966,0.000395,0.001327,0.007773,0.000760,1.009461,0.081843,0.000347,0.007789,0.004311,0.002332,0.009469,0.003753,0.0,0.007584,0.006677,0.001142,0.005907,1,0.008427,0.008979,0.001854,0.009924,0.005987,0.002247,1.006779,0.002508,0.006827,0.002837,1.503577,1.005791,0.000571,0.000391,0.008351,0.008850,0.003180,0.902135,0.136333,0.919876,0.975624,0.004561,0.786007,1.000084,1.004118,0.668647,0.003921,0.004654,1.003205,moderate_low,0.008707,1.007653,0.184093,0.822281,0.003444,1.0,0.247217,0.0,moderate_high,0.243532,0.241885,0.0,0.707017,0.430501,0.001306,0.686414,0.000755,1.0,1.008394,1.004333,1.008344,0.004407,0.001034,0.004838,0.007495,0.004931,0.003954,0.003167,0.005032,0.009576,0.005492,0.009217
2,3333333702723732807074333342320276353770732523...,2017-06-03,0.954180,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,,0.007605,0.068839,0.720884,0.060492,0.003259,0.334650,0.524568,0.189424,0.066982,0.056647,0.005126,1.337179,0.009355,0.007426,0.206629,0.738044,0.134073,0.048367,0.010077,0.000530,1.009184,0.006961,0.112229,0.137014,1.001977,0.304124,0.114534,0.006328,0.139504,0.056653,0.070607,0.305883,0.212165,0.063955,0.406768,0.094001,CR,O,0.001878,0.003636,0.645819,0.000408,0.004578,very_high,0.495308,0.006838,0.679257,0.001337,0.006025,0.009393,0.002615,0.075868,0.007152,0.103239,0.047454,0.006561,0.078891,0.076510,0.001782,0.001422,,0.005419,0.006149,0.009629,0.003094,0.008295,0.009362,0.000954,0.005447,0.007345,0.007624,0.008811,0.004056,1.004291,0.081954,0.002709,0.004093,0.007139,0.008358,0.002325,0.007381,0.0,0.005901,0.001185,0.008013,0.008882,1,0.007327,0.002016,0.008686,0.008446,0.007291,0.007794,1.001014,0.009634,0.009820,0.005080,1.503359,1.005801,0.007425,0.009234,0.002471,0.009769,0.005433,0.939654,0.134938,0.958699,0.974067,0.011736,0.806840,1.003014,1.009285,0.670901,0.001264,0.019176,1.000754,moderate_low,0.004092,1.004312,0.154837,0.853498,0.003269,1.0,0.239867,0.0,moderate_high,0.240768,0.239710,0.0,0.704843,0.434409,0.003954,0.690101,0.009617,1.0,1.009307,1.007831,1.006878,0.003221,0.005681,0.005497,0.009227,0.009123,0.003269,0.007329,0.000427,0.003429,0.006986,0.002603
3,3333333702723732807074333342320276353770732523...,2017-06-19,0.960384,0.002455,0.013683,1.002700,0.001373,0.117169,0.000685,0.005531,,0.006406,0.055630,0.723997,0.166782,0.009918,0.323271,0.530929,0.135586,0.083720,0.049253,0.001418,1.339909,0.006782,0.003515,0.208214,0.741813,0.134437,0.030063,0.009667,0.000783,1.007456,0.008706,0.102838,0.129017,0.704016,0.275055,0.120740,0.004980,0.138100,0.012498,0.065926,0.273553,0.204300,0.022732,0.405175,0.094854,CR,O,0.005899,0.005896,0.654358,0.005897,0.005207,very_high,0.508670,0.008183,0.515282,0.008716,0.005271,0.004554,0.002052,0.150209,0.005364,0.206394,0.031705,0.009559,0.077490,0.071547,0.005595,0.006363,,0.000646,0.009193,0.008568,0.003895,0.005153,0.004876,0.005665,0.001888,0.004961,0.000034,0.004652,0.006969,1.004728,0.060634,0.009982,0.008817,0.008690,0.007364,0.005924,0.008802,0.0,0.002520,0.003324,0.009455,0.008348,1,0.007053,0.003909,0.002478,0.006614,0.009977,0.007686,1.002775,0.007791,0.000458,0.007320,1.503701,1.007036,0.000664,0.003200,0.008507,0.004858,0.000063,0.913205,0.140058,0.926341,0.975499,0.007571,0.808214,1.001517,1.004514,0.672620,0.002729,0.011720,1.005338,moderate_low,0.009703,1.002538,0.153939,0.844667,0.000053,1.0,0.240910,0.0,moderate_high,0.239400,0.240727,0.0,0.711546,0.436903,0.005135,0.687779,0.004649,1.0,1.001671,1.003460,1.007573,0.007703,0.007108,0.008261,0.007206,0.002409,0.006117,0.004516,0.003200,0.008419,0.006527,0.009600
4,3333333702723732807074333342320276353770732523...,2017-07-22,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,,0.007731,0.038862,0.720619,0.143630,0.006667,0.231009,0.529305,,0.075900,0.048918,0.001199,1.341735,0.000519,0.001362,0.205468,0.691986,0.121518,0.054221,0.009484,0.006698,1.003738,0.003846,0.094311,0.129539,0.917133,0.231110,0.095178,0.001653,0.126443,0.027897,0.063697,0.233103,0.175655,0.031171,0.487460,0.093915,CR,O,0.009479,0.001714,0.650112,0.007773,0.005851,very_high,0.216507,0.008605,0.507712,0.006821,0.000152,0.000104,0.001419,0.096441,0.007972,0.106020,0.032733,0.008156,0.076561,0.074432,0.004933,0.004831,,0.001833,0.005738,0.003289,0.002608,0.007338,0.007447,0.004465,0.006111,0.002246,0.002109,0.001141,0.001770,1.000904,0.062492,0.005860,0.001845,0.007816,0.002470,0.005516,0.007166,0.0,0.000155,0.001504,0.002019,0.002678,1,0.007728,0.003432,0.002199,0.005511,0.004105,0.009656,1.006536,0.005158,0.003341,0.000264,1.509905,1.002915,0.003079,0.003845,0.007190,0.002983,0.000535,0.921026,0.131620,0.933479,0.978027,0.018200,0.822281,1.006125,1.005735,0.673869,0.009998,0.017598,1.003175,moderate_low,0.009120,1.000130,0.120717,0.811199,0.008724,1.0,0.247939,0.0,moderate_high,0.244199,0.242325,0.0,0.705343,0.437433,0.002849,0.688774,0.000097,1.0,1.009886,1.005053,1.008132,0.009823,0.009680,0.004848,0.006312,0.004462,0.003671,0.004946,0.008889,0.001670,0.008126,0.009827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60302,3783237304213421825500285154044276700225673081...,2017-06-04,0.474935,0.714648,0.369253,0.030189,0.503984,0.851015,0.000718,0.748423,,0.250632,0.682027,0.343908,0.008157,0.004784,0.495887,0.259054,0.948187,0.007171,0.750172,0.009440,0.007699,0.499957,0.001714,0.235678,0.477886,0.012977,0.009380,0.310833,0.007718,1.004425,0.008603,0.823850,0.011168,0.322438,1.031176,0.025254,0.001414,0.359074,0.088341,0.153453,0.089544,0.896870,0.003062,0.404575,0.059649,CO,O,0.000112,0.923268,0.060214,0.584377,0.648313,very_high,0.236340,0.009736,0.425406,0.005034,0.008998,0.502254,0.009565,0.006832,0.003747,0.400495,0.769012,0.006920,0.148841,0.207334,0.004680,0.001612,,0.149349,0.002683,0.007456,0.004370,0.004445,0.007332,0.003165,1.002135,0.009908,0.003870,0.006893,0.002426,1.004573,0.455602,0.000437,0.005272,0.000079,0.000213,0.004550,0.007363,1.0,0.000718,0.003619,0.000104,0.003490,1,0.000834,0.005360,0.005567,0.002130,0.006965,0.000362,0.004889,0.003057,0.006359,0.001983,0.004832,0.002955,0.006221,0.004748,0.002687,0.001953,0.003173,0.962336,0.137261,0.979745,0.978726,0.007347,0.007100,0.009662,0.005869,0.009498,0.009593,0.366810,1.004141,moderate_high,0.005141,1.009069,0.668292,0.008575,0.409409,1.0,0.214571,0.0,moderate,0.215012,0.216811,0.0,0.508296,0.431020,0.006136,0.276567,0.008882,1.0,0.007614,0.009926,0.004699,0.008248,0.004773,0.001012,0.008922,0.004772,0.002925,0.004995,0.008872,0.001051,0.002803,0.005814
60303,3783237304213421825500285154044276700225673081...,2017-07-19,0.446629,0.242127,0.413505,0.038282,0.007281,0.825581,0.006543,0.798725,,0.259411,0.692671,0.352515,0.009392,0.001280,0.495207,0.260707,0.960662,0.002268,0.764383,0.001357,0.009985,0.566028,0.008344,0.232064,0.459457,0.006094,0.017258,0.357464,0.009644,1.008895,0.004242,0.819209,0.010047,0.408021,1.038892,0.037218,0.009856,0.355411,0.098821,0.148909,0.091198,0.892304,0.002113,0.409280,0.063987,CO,O,0.005730,1.007259,0.051371,0.754560,0.769865,very_high,0.480513,0.004761,0.288886,0.004913,0.008847,0.503873,0.009661,0.009791,0.003622,0.302540,0.781021,0.006814,0.149264,0.202619,0.000859,0.000369,,0.166730,0.000452,0.500304,0.002558,0.008921,0.000452,0.000352,0.008360,0.000946,0.004809,0.002052,0.003394,1.008964,0.461704,0.009234,0.002245,0.007282,0.003493,0.005577,0.000959,1.0,0.008926,0.004160,0.005298,0.009389,1,0.004060,0.000924,0.008587,0.007542,0.000565,0.001006,0.003870,0.008875,0.001920,0.006998,0.003033,0.007064,0.006947,0.004993,0.003142,0.003037,0.009487,0.959025,0.137014,0.979791,0.973427,0.005159,0.003845,0.008357,0.006123,0.001299,0.000624,0.414066,1.003305,moderate_high,0.003581,1.000786,0.670128,0.434335,0.403654,1.0,0.217651,0.0,moderate,0.208577,0.211673,0.0,0.510000,0.429681,0.006079,0.276017,0.007627,1.0,0.001789,0.008281,0.001839,0.009570,0.000479,0.009532,0.004581,0.003448,0.008777,0.001497,0.002389,0.000539,0.009607,0.008875
60304,3783237304213421825500285154044276700225673081...,2017-08-19,0.449650,0.239224,0.408919,0.038281,0.003831,0.655708,0.007444,0.799327,,0.258284,0.709453,0.353389,0.017128,0.000993,0.479717,0.264574,0.956770,0.001622,0.759853,0.003874,0.008879,0.558984,0.005083,0.230331,0.493024,0.012092,0.017284,0.352264,0.003431,1.004062,0.004613,0.685461,0.010935,0.323863,1.039726,0.037066,0.001246,0.360742,0.095927,0.155390,0.096682,0.912929,0.006391,0.686118,0.099241,CO,O,0.000979,1.000363,0.056275,0.831354,0.829201,very_high,-0.212266,0.006986,0.005955,0.005160,0.009367,0.508916,0.003330,0.004622,0.002073,0.302490,0.778700,0.009749,0.146248,0.209838,0.003201,0.004858,,0.165528,0.005895,0.507756,0.006296,0.007407,0.004433,0.008035,0.007923,0.009169,0.008121,0.009252,0.007811,1.008685,0.470758,0.008323,0.007020,0.003630,0.003813,0.009278,0.003717,1.0,0.001956,0.001136,0.002396,0.004241,1,0.005561,0.000921,0.007693,0.007022,0.008921,0.005861,0.004322,0.004498,0.009865,0.000742,0.007605,0.003957,0.000327,0.009888,0.006371,0.004016,0.004300,0.954512,0.133088,0.976056,0.972786,0.007856,0.000469,0.008300,0.007882,0.001959,0.006317,0.410925,1.005187,moderate_high,0.003456,1.004859,0.681277,0.426626,0.400699,1.0,0.220117,0.0,moderate,0.213127,0.216538,0.0,0.510946,0.435091,0.006660,0.274458,0.008838,1.0,0.009365,0.001182,0.007035,0.006845,0.009845,0.001864,0.002008,0.004138,0.003547,0.007564,0.000163,0.007565,0.003775,0.006874
60305,3783237304213421825500285154044276700225673081...,2017-09-11,0.446463,0.065974,0.422754,0.020252,0.009037,0.612380,0.008637,0.822657,,0.250489,0.713661,0.353214,0.011223,0.006898,0.486873,0.256793,0.955138,0.006157,0.756831,0.009551,0.006604,0.577250,0.006616,0.232166,0.489287,0.014393,0.019545,0.373437,0.000878,1.003493,0.006944,0.638441,0.012200,0.253601,1.044600,0.036709,0.009611,0.356685,0.094985,0.148884,0.097756,0.909672,0.008833,0.569264,0.103870,CO,O,0.008898,1.001165,0.052455,0.914858,0.890583,very_high,0.067435,0.001687,0.000356,0.000436,0.002929,0.507551,0.007811,0.002069,0.009647,0.409215,0.782922,0.008776,0.151370,0.200999,0.004602,0.000705,,0.168042,0.004297,0.509371,0.003906,0.009252,0.002358,0.008415,0.006221,0.004555,0.008411,0.009804,0.001351,1.003778,0.473997,0.002219,0.002116,0.007375,0.009939,0.007957,0.005086,1.0,0.003886,0.008231,0.008310,0.001901,1,0.004129,0.007098,0.007158,0.007434,0.002747,0.005617,0.001803,0.005560,0.000979,0.006108,0.000542,0.003364,0.004620,0.009266,0.009051,0.009577,0.008556,0.959926,0.134014,0.977062,0.971149,0.005288,0.007383,0.000168,0.001125,0.007665,0.007309,0.419637,1.003767,moderate_high,0.005077,1.003094,0.673936,0.008824,0.404136,1.0,0.222673,0.0,moderate,0.219841,0.211343,0.0,0.515970,0.428649,0.003030,0.278438,0.007545,1.0,0.006224,0.006888,0.004786,0.006243,0.000948,0.007854,0.007782,0.008938,0.009385,0.000216,0.007786,0.000766,0.008304,0.000823


In [8]:
# 1. Definir columnas no numéricas y columnas numéricas
no_numerical = ['ID', 'Expenditure_AHF', 'Infraction_YFSG', 'Infraction_DQLY', 'Infraction_CLH',
                'Base_67254', 'Infraction_TEN', 'Base_8730', 'Base_23737', 'Infraction_NMCB',
                'Infraction_ZRH', 'Infraction_WIS', 'Infraction_WMAQ', 'label']

# Seleccionar solo las columnas numéricas
numerical_columns = df.select_dtypes(include=['number']).columns
numerical_columns = [col for col in numerical_columns if col not in no_numerical]

# Crear una copia del DataFrame para evitar advertencias
df_clean = df.copy()

# 2. Calcular el porcentaje de valores nulos en columnas numéricas
porcentaje_null = df_clean[numerical_columns].isnull().mean() * 100

# 3. Clasificar columnas según el porcentaje de valores nulos
low_null_columns = porcentaje_null[porcentaje_null < 10].index
mid_null_columns = porcentaje_null[(porcentaje_null >= 10) & (porcentaje_null < 30)].index
high_null_columns = porcentaje_null[porcentaje_null >= 30].index

# Mostrar un informe
print("Resumen del Tratamiento de Valores Nulos:")
print(f"- Columnas con menos del 10% de nulos: {len(low_null_columns)} columnas.")
print(f"- Columnas con entre 10% y 30% de nulos: {len(mid_null_columns)} columnas.")
print(f"- Columnas con más del 30% de nulos: {len(high_null_columns)} columnas.\n")

# 4. Tratamiento de Valores Nulos
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Menos del 10% de nulos - rellenar con la media
if len(low_null_columns) > 0:
    df_clean[low_null_columns] = df_clean[low_null_columns].fillna(df_clean[low_null_columns].mean())
    print("✔ Columnas con menos del 10% de nulos tratadas con la media.")

# Entre 10% y 30% de nulos - rellenar con la mediana
if len(mid_null_columns) > 0:
    df_clean[mid_null_columns] = df_clean[mid_null_columns].fillna(df_clean[mid_null_columns].median())
    print("✔ Columnas con entre 10% y 30% de nulos tratadas con la mediana.")

# Más del 30% de nulos - usar IterativeImputer
if len(high_null_columns) > 0:
    imputer = IterativeImputer(random_state=42)
    df_clean[high_null_columns] = imputer.fit_transform(df_clean[high_null_columns])
    print("✔ Columnas con más del 30% de nulos tratadas con Iterative Imputer.")

# Verificación final de valores nulos
print("\nValores nulos después del tratamiento:")
print(df_clean[numerical_columns].isnull().sum().sort_values(ascending=False))

Resumen del Tratamiento de Valores Nulos:
- Columnas con menos del 10% de nulos: 138 columnas.
- Columnas con entre 10% y 30% de nulos: 8 columnas.
- Columnas con más del 30% de nulos: 1 columnas.

✔ Columnas con menos del 10% de nulos tratadas con la media.
✔ Columnas con entre 10% y 30% de nulos tratadas con la mediana.
✔ Columnas con más del 30% de nulos tratadas con Iterative Imputer.

Valores nulos después del tratamiento:
Payment_6804        0
Expenditure_MTRQ    0
Expenditure_GMC     0
Risk_9423           0
Risk_6977           0
                   ..
Base_6852           0
Expenditure_IDZ     0
Risk_1475           0
Expenditure_BWX     0
Infraction_DNOU     0
Length: 147, dtype: int64


In [9]:
# 1. Variables categóricas
categoricas = ['Infraction_YFSG', 'Infraction_DQLY', 'Infraction_CLH', 'Base_67254',
               'Infraction_TEN', 'Base_8730', 'Base_23737', 'Infraction_NMCB',
               'Infraction_ZRH', 'Infraction_WIS', 'Infraction_WMAQ']

# 2. Mostrar categorías únicas para cada variable categórica
print("Categorías únicas antes de rellenar valores faltantes:")
for col in categoricas:
    if col in df.columns:
        print(f"{col}: {df[col].unique()}")
    else:
        print(f"Advertencia: La columna '{col}' no existe en el DataFrame.")

# 3. Rellenar valores faltantes con 'Missing'
for col in categoricas:
    if col in df.columns:
        df[col].fillna('Missing', inplace=True)
        print(f"✔ Valores faltantes en '{col}' reemplazados con 'Missing'.")
    else:
        print(f"❌ La columna '{col}' no fue encontrada.")

# 4. Comprobar que los valores faltantes se han rellenado
print("\nCategorías únicas después de rellenar valores faltantes:")
for col in categoricas:
    if col in df.columns:
        print(f"{col}: {df[col].unique()}")

# 5. Convertir columnas categóricas a tipo 'category'
df[categoricas] = df[categoricas].astype('category')
print("\n✔ Columnas categóricas convertidas a tipo 'category' en pandas.")

# 6. Función para aplicar binning con 'Missing'
def binarize_levels_with_missing(value):
    """
    Función para agrupar niveles en categorías más amplias.
    """
    if value == 'Missing':
        return 'Missing'
    elif value in ['very_low', 'low', 'moderate_low', 'extremely_low']:
        return 'low'
    elif value in ['moderate', 'moderate_high']:
        return 'moderate'
    else:
        return 'high'

# 7. Aplicar binning a cada columna categórica
for col in categoricas:
    if col in df.columns:
        df[col + "_binned"] = df[col].apply(binarize_levels_with_missing)
        print(f"✔ Binning aplicado a la columna '{col}', nueva columna: '{col}_binned'.")

# 8. Verificar el resultado final
print("\nEjemplo de binning en las primeras filas:")
for col in categoricas:
    if col in df.columns:
        print(f"\n{col} (original) -> {col}_binned (agrupada):")
        print(df[[col, col + "_binned"]].head())

Categorías únicas antes de rellenar valores faltantes:
Infraction_YFSG: ['CR' 'CO' 'CL' 'XL' 'XM' 'XZ']
Infraction_DQLY: ['O' 'R' nan 'U' '-1']
Infraction_CLH: ['very_high' nan 'moderate_low' 'moderate' 'high' 'moderate_high' 'low'
 'very_low']
Base_67254: ['moderate_low' 'low' 'high' 'moderate' 'moderate_high' 'very_high'
 'extremely_high' nan]
Infraction_TEN: ['moderate_high' 'extremely_low' nan 'moderate_low' 'very_high' 'moderate'
 'high' 'low']
Base_8730: [ 0.  2.  1. nan]
Base_23737: [1 0]
Infraction_NMCB: [ 1.  0. nan]
Infraction_ZRH: [ 0. nan  1.]
Infraction_WIS: [ 0. nan  1.]
Infraction_WMAQ: [ 1. nan  0. -1.]
✔ Valores faltantes en 'Infraction_YFSG' reemplazados con 'Missing'.
✔ Valores faltantes en 'Infraction_DQLY' reemplazados con 'Missing'.
✔ Valores faltantes en 'Infraction_CLH' reemplazados con 'Missing'.
✔ Valores faltantes en 'Base_67254' reemplazados con 'Missing'.
✔ Valores faltantes en 'Infraction_TEN' reemplazados con 'Missing'.
✔ Valores faltantes en 'Base_8730' 

In [10]:
from sklearn.preprocessing import LabelEncoder

# Función para aplicar binning a columnas con orden implícito
def apply_binning(df, columns, binarize_func):
    """
    Aplica una función de binning a columnas seleccionadas.
    """
    for col in columns:
        if col in df.columns:
            df[col] = df[col].apply(binarize_func)
            print(f"✔ Binning aplicado en '{col}'.")
        else:
            print(f"❌ La columna '{col}' no existe.")
    return df

# Función para aplicar One-Hot Encoding
def apply_one_hot_encoding(df, columns):
    """
    Aplica One-Hot Encoding a columnas seleccionadas.
    """
    if any(col in df.columns for col in columns):
        df = pd.get_dummies(df, columns=columns, drop_first=False)
        print(f"✔ One-Hot Encoding aplicado a columnas: {columns}.")
    else:
        print(f"❌ Columnas no encontradas: {columns}.")
    return df

# Función para aplicar Label Encoding a columnas binarias
def apply_label_encoding(df, columns):
    """
    Aplica Label Encoding a columnas binarias después de convertirlas a string.
    """
    le = LabelEncoder()
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype(str)
            df[col] = le.fit_transform(df[col])
            print(f"✔ Label Encoding aplicado en '{col}'.")
        else:
            print(f"❌ La columna '{col}' no existe.")
    return df

# Función para verificar valores nulos
def check_missing_values(df):
    """
    Verifica y muestra columnas con valores nulos.
    """
    nulos = df.isnull().sum()
    nulos = nulos[nulos > 0]
    if nulos.empty:
        print("✔ No hay valores nulos en el DataFrame.")
    else:
        print("❌ Columnas con valores nulos:")
        print(nulos)

# Aplicar binning a columnas con orden implícito
columns_with_order = ['Infraction_CLH', 'Base_67254', 'Infraction_TEN']
df = apply_binning(df, columns_with_order, binarize_levels_with_missing)

# Aplicar One-Hot Encoding a columnas seleccionadas
one_hot_columns = {
    'ordered_columns': ['Infraction_CLH', 'Base_67254', 'Infraction_TEN'],
    'unordered_columns': ['Infraction_YFSG', 'Infraction_DQLY'],
    'numeric_low_cardinality': ['Base_8730', 'Infraction_WMAQ']
}

for key, cols in one_hot_columns.items():
    df = apply_one_hot_encoding(df, cols)

# Aplicar Label Encoding a columnas binarias
binary_columns = ['Base_23737', 'Infraction_NMCB', 'Infraction_ZRH', 'Infraction_WIS']
df = apply_label_encoding(df, binary_columns)

# Convertir la columna 'Expenditure_AHF' a datetime
if 'Expenditure_AHF' in df.columns:
    df['Expenditure_AHF'] = pd.to_datetime(df['Expenditure_AHF'], errors='coerce')
    print("✔ Columna 'Expenditure_AHF' convertida a datetime.")
else:
    print("❌ La columna 'Expenditure_AHF' no existe.")

# Establecer la columna 'ID' como índice
if 'ID' in df.columns:
    df.set_index('ID', inplace=True)
    print("✔ Columna 'ID' establecida como índice.")
else:
    print("❌ La columna 'ID' no existe.")

# Verificar nulos después de las transformaciones
print("\nVerificación final de valores nulos:")
check_missing_values(df)

# Mostrar las primeras filas para verificar
print("\nPrimeras filas del DataFrame transformado:")
print(df.head())

✔ Binning aplicado en 'Infraction_CLH'.
✔ Binning aplicado en 'Base_67254'.
✔ Binning aplicado en 'Infraction_TEN'.
✔ One-Hot Encoding aplicado a columnas: ['Infraction_CLH', 'Base_67254', 'Infraction_TEN'].
✔ One-Hot Encoding aplicado a columnas: ['Infraction_YFSG', 'Infraction_DQLY'].
✔ One-Hot Encoding aplicado a columnas: ['Base_8730', 'Infraction_WMAQ'].
✔ Label Encoding aplicado en 'Base_23737'.
✔ Label Encoding aplicado en 'Infraction_NMCB'.
✔ Label Encoding aplicado en 'Infraction_ZRH'.
✔ Label Encoding aplicado en 'Infraction_WIS'.
✔ Columna 'Expenditure_AHF' convertida a datetime.
✔ Columna 'ID' establecida como índice.

Verificación final de valores nulos:
❌ Columnas con valores nulos:
Payment_6804         399
Base_80863            15
Expenditure_JIG    11632
Infraction_SNZ        15
Base_02683            15
                   ...  
Infraction_YQXM      399
Infraction_QGR       968
Infraction_LSX       968
Infraction_IBJ       413
Infraction_DNOU      968
Length: 78, dtype: 

In [11]:
import pandas as pd
import numpy as np
from scipy.stats import skew

# 1. Función para calcular IQR y límites
def calculate_iqr_limits(df, columns):
    """
    Calcula Q1, Q3 e IQR para columnas numéricas.
    Retorna límites inferior y superior para detección de outliers.
    """
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1
    lower_limits = Q1 - 1.5 * IQR
    upper_limits = Q3 + 1.5 * IQR
    return lower_limits, upper_limits

# 2. Función para identificar y clasificar outliers
def classify_outliers(df, columns, lower_limits, upper_limits):
    """
    Clasifica columnas según porcentaje de outliers: <5%, 5%-15%, >15%.
    """
    outliers = ((df[columns] < lower_limits) | (df[columns] > upper_limits)).sum()
    total_rows = len(df)
    outlier_percentage = (outliers / total_rows) * 100

    # Clasificar columnas
    minor = outlier_percentage[outlier_percentage < 5].index
    moderate = outlier_percentage[(outlier_percentage >= 5) & (outlier_percentage <= 15)].index
    high = outlier_percentage[outlier_percentage > 15].index

    print("\nResumen de Outliers:")
    print(f"✔ Columnas con < 5% de outliers: {list(minor)}")
    print(f"✔ Columnas con 5%-15% outliers: {list(moderate)}")
    print(f"✔ Columnas con > 15% outliers: {list(high)}")
    return minor, moderate, high

# 3. Función para manejar outliers
def handle_outliers(df, minor, moderate, high, lower_limits, upper_limits):
    """
    Elimina, reemplaza o transforma outliers en función de su clasificación.
    """
    # Eliminar outliers en columnas con menos del 5%
    for col in minor:
        df = df[(df[col] >= lower_limits[col]) & (df[col] <= upper_limits[col])]
        print(f"✔ Outliers eliminados en '{col}'.")

    # Reemplazar outliers con límites en columnas moderadas
    for col in moderate:
        df[col] = np.clip(df[col], lower_limits[col], upper_limits[col])
        print(f"✔ Outliers reemplazados con límites en '{col}'.")

    # Transformar columnas con alto número de outliers
    for col in high:
        original_skew = skew(df[col].dropna())
        log_skew = skew(np.log1p(df[col].dropna()))
        sqrt_skew = skew(np.sqrt(df[col].clip(lower=0)))

        if abs(log_skew) < abs(sqrt_skew):
            df[col] = np.log1p(df[col].clip(lower=0))
            print(f"✔ Transformación logarítmica aplicada en '{col}'.")
        else:
            df[col] = np.sqrt(df[col].clip(lower=0))
            print(f"✔ Transformación raíz cuadrada aplicada en '{col}'.")

    return df

# 4. Flujo Principal
# Seleccionar columnas numéricas
numeric_columns = df.select_dtypes(include=np.number).columns

# Calcular límites de IQR
lower_limits, upper_limits = calculate_iqr_limits(df, numeric_columns)

# Clasificar columnas por porcentaje de outliers
minor, moderate, high = classify_outliers(df, numeric_columns, lower_limits, upper_limits)

# Manejar outliers
df_cleaned = handle_outliers(df, minor, moderate, high, lower_limits, upper_limits)

# Verificación final
print("\nVerificación de valores nulos después del tratamiento:")
print(df_cleaned.isnull().sum().sort_values(ascending=False))

# Mostrar primeras filas
print("\nPrimeras filas del DataFrame limpio:")
print(df_cleaned.head())


Resumen de Outliers:
✔ Columnas con < 5% de outliers: ['Payment_6804', 'Base_80863', 'Infraction_EJZ', 'Risk_0322', 'Infraction_GGO', 'Infraction_TLPJ', 'Base_1165', 'Base_6187', 'Infraction_ZTNC', 'Base_85131', 'Infraction_AYWV', 'Base_9516', 'Infraction_PAS', 'Risk_0003', 'Expenditure_HMO', 'Expenditure_LMSR', 'Infraction_BSU', 'Risk_8065', 'Infraction_ZYW', 'Infraction_HSSU', 'Infraction_EHZP', 'Infraction_TBP', 'Infraction_PBC', 'Infraction_AQO', 'Base_0229', 'Base_69608', 'Risk_1475', 'Expenditure_BWX', 'Base_8511', 'Infraction_JYZB', 'Infraction_ZTYG', 'Infraction_EYU', 'Expenditure_UWVG', 'Infraction_CZE', 'Base_65352', 'Risk_7095', 'Infraction_JBR', 'Base_36516', 'Risk_6346', 'Expenditure_HRQ', 'Risk_2102', 'Base_7331', 'Infraction_XWX', 'Risk_4553', 'Risk_8742', 'Infraction_VHU', 'Risk_4247', 'Risk_2380', 'Infraction_GSS', 'Expenditure_HKXV', 'Infraction_MHM', 'Risk_4160', 'Risk_3506', 'Base_23737', 'Expenditure_GCAO', 'Risk_9367', 'Base_7910', 'Expenditure_GMC', 'Risk_9423',

In [12]:
import numpy as np
import pandas as pd

# Función para aplicar transformaciones (raíz cuadrada o logaritmo)
def apply_transformations(df, sqrt_cols, log_cols):
    """
    Aplica transformaciones a columnas seleccionadas:
    - Raíz cuadrada para sqrt_cols
    - Logaritmo natural (log1p) para log_cols
    """
    for col in sqrt_cols:
        if col in df.columns:
            df[col] = np.sqrt(df[col].clip(lower=0))
            print(f"✔ Raíz cuadrada aplicada en '{col}'.")
        else:
            print(f"❌ Columna '{col}' no encontrada para raíz cuadrada.")

    for col in log_cols:
        if col in df.columns:
            df[col] = np.log1p(df[col].clip(lower=0))
            print(f"✔ Logaritmo aplicado en '{col}'.")
        else:
            print(f"❌ Columna '{col}' no encontrada para logaritmo.")
    return df

# Función para convertir columnas categóricas con enteros
def convert_to_category(df, columns):
    """
    Convierte columnas seleccionadas a tipo 'category'.
    """
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype('category')
            print(f"✔ Columna '{col}' convertida a categoría.")
        else:
            print(f"❌ Columna '{col}' no encontrada para conversión a categoría.")
    return df

# Función para convertir columnas booleanas a enteros
def convert_booleans_to_int(df):
    """
    Convierte columnas booleanas en el DataFrame a tipo int.
    """
    boolean_columns = df.select_dtypes(include=['bool']).columns
    if len(boolean_columns) > 0:
        df[boolean_columns] = df[boolean_columns].astype(int)
        print(f"✔ Columnas booleanas convertidas a int: {list(boolean_columns)}")
    else:
        print("❌ No se encontraron columnas booleanas.")
    return df

# Función para eliminar columnas constantes
def drop_constant_columns(df, columns_to_drop=None):
    """
    Elimina columnas con valores constantes o columnas especificadas.
    """
    if columns_to_drop:
        for col in columns_to_drop:
            if col in df.columns:
                df = df.drop(columns=[col])
                print(f"✔ Columna '{col}' eliminada.")
            else:
                print(f"❌ Columna '{col}' no encontrada.")

    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    if constant_columns:
        df = df.drop(columns=constant_columns)
        print(f"✔ Columnas constantes eliminadas: {constant_columns}")
    else:
        print("❌ No se encontraron columnas constantes.")
    return df

# Columnas recomendadas para transformaciones
sqrt_columns = [
    'Risk_1930', 'Infraction_SNZ', 'Infraction_QJJF', 'Infraction_FMXQ',
    'Expenditure_FIP', 'Base_0580', 'Base_91828', 'Expenditure_IDZ',
    'Base_22178', 'Infraction_ZVW', 'Infraction_JBR', 'Expenditure_UIWS',
    'Expenditure_MTRQ', 'Expenditure_HPM', 'Infraction_HUK', 'Infraction_QWWW',
    'Infraction_QGR', 'Infraction_LSX', 'Infraction_DNOU'
]
log_columns = ['Infraction_BGGU']

# Columnas categóricas y booleanas
categorical_int_columns = ['Infraction_NMCB', 'Infraction_ZRH', 'Infraction_WIS']
columns_to_drop = ['Expenditure_AHF', 'Base_23737']

# Flujo Principal
# 1. Aplicar transformaciones
df = apply_transformations(df, sqrt_columns, log_columns)

# 2. Convertir columnas categóricas
df = convert_to_category(df, categorical_int_columns)

# 3. Convertir booleanos a enteros
df = convert_booleans_to_int(df)

# 4. Eliminar columnas constantes o específicas
df = drop_constant_columns(df, columns_to_drop)

# 5. Verificación final
print("\nTipos de datos actualizados:")
print(df.dtypes)

print("\nValores nulos por columna:")
print(df.isnull().sum().sort_values(ascending=False))

# Mostrar primeras filas
print("\nPrimeras filas del DataFrame actualizado:")
print(df.head())

✔ Raíz cuadrada aplicada en 'Risk_1930'.
✔ Raíz cuadrada aplicada en 'Infraction_SNZ'.
✔ Raíz cuadrada aplicada en 'Infraction_QJJF'.
✔ Raíz cuadrada aplicada en 'Infraction_FMXQ'.
✔ Raíz cuadrada aplicada en 'Expenditure_FIP'.
✔ Raíz cuadrada aplicada en 'Base_0580'.
✔ Raíz cuadrada aplicada en 'Base_91828'.
✔ Raíz cuadrada aplicada en 'Expenditure_IDZ'.
✔ Raíz cuadrada aplicada en 'Base_22178'.
✔ Raíz cuadrada aplicada en 'Infraction_ZVW'.
✔ Raíz cuadrada aplicada en 'Infraction_JBR'.
✔ Raíz cuadrada aplicada en 'Expenditure_UIWS'.
✔ Raíz cuadrada aplicada en 'Expenditure_MTRQ'.
✔ Raíz cuadrada aplicada en 'Expenditure_HPM'.
✔ Raíz cuadrada aplicada en 'Infraction_HUK'.
✔ Raíz cuadrada aplicada en 'Infraction_QWWW'.
✔ Raíz cuadrada aplicada en 'Infraction_QGR'.
✔ Raíz cuadrada aplicada en 'Infraction_LSX'.
✔ Raíz cuadrada aplicada en 'Infraction_DNOU'.
✔ Logaritmo aplicado en 'Infraction_BGGU'.
✔ Columna 'Infraction_NMCB' convertida a categoría.
✔ Columna 'Infraction_ZRH' convertida 

In [13]:
import pandas as pd

# Lista de columnas seleccionadas
rfe_columns = [
    'Payment_6804', 'Base_80863', 'Expenditure_JIG', 'Base_02683',
    'Infraction_ZWWJ', 'Infraction_QJJF', 'Infraction_EJZ',
    'Infraction_GGO', 'Infraction_TLPJ', 'Base_1165', 'Base_39598',
    'Base_6187', 'Base_85131', 'Risk_9995', 'Infraction_AYWV', 'Base_9516',
    'Expenditure_HMO', 'Infraction_BSU', 'Infraction_ZYW', 'Infraction_TBP',
    'Infraction_PBC', 'Base_0229', 'Base_69608', 'Base_3041',
    'Infraction_QKZN', 'Infraction_CZE', 'Expenditure_MTRQ',
    'Infraction_XEPQ', 'Infraction_RKTA', 'Infraction_KEJT'
]

# 1. Verificar que todas las columnas existen en el DataFrame
missing_columns = [col for col in rfe_columns if col not in df.columns]

if missing_columns:
    print("❌ Las siguientes columnas no existen en el DataFrame:")
    print(missing_columns)
    # Eliminar columnas que no existen para evitar errores
    rfe_columns = [col for col in rfe_columns if col in df.columns]
else:
    print("✔ Todas las columnas existen en el DataFrame.")

# 2. Crear un nuevo DataFrame con las columnas validadas
if rfe_columns:
    df_rfe = df[rfe_columns].copy()
    print(f"✔ DataFrame creado con {len(rfe_columns)} columnas seleccionadas.")
    print(df_rfe.head())
else:
    print("❌ No se encontraron columnas válidas para crear el DataFrame.")

# 3. Exportar el DataFrame a CSV
output_file = 'processed_dataset.csv'
try:
    df_rfe.to_csv(output_file, index=False)
    print(f"✔ DataFrame exportado exitosamente a '{output_file}'.")
except Exception as e:
    print(f"❌ Error al exportar el archivo CSV: {e}")

✔ Todas las columnas existen en el DataFrame.
✔ DataFrame creado con 30 columnas seleccionadas.
                                                    Payment_6804  Base_80863  \
ID                                                                             
33333337027237328070743333423202763537707325232...      0.938469    1.006838   
33333337027237328070743333423202763537707325232...      0.936665    1.000653   
33333337027237328070743333423202763537707325232...      0.954180    1.009672   
33333337027237328070743333423202763537707325232...      0.960384    1.002700   
33333337027237328070743333423202763537707325232...      0.947248    1.000727   

                                                    Expenditure_JIG  \
ID                                                                    
33333337027237328070743333423202763537707325232...         0.124035   
33333337027237328070743333423202763537707325232...         0.126750   
33333337027237328070743333423202763537707325232...         

In [19]:
import joblib
import os
import numpy as np
from sklearn.impute import SimpleImputer

# Verificar si los archivos existen
scaler_file = "scaler.joblib"
model_file = "knn_model.joblib"

if os.path.exists(scaler_file) and os.path.exists(model_file):
    scaler = joblib.load(scaler_file)
    knn_model = joblib.load(model_file)
    print("✔ Scaler y modelo k-NN cargados exitosamente.")
else:
    raise FileNotFoundError("❌ Archivos 'scaler.joblib' o 'knn_model.joblib' no encontrados.")

# 1. Preparar las características
if 'label' in df_rfe.columns:
    df_rfe = df_rfe.drop(columns=['label'])

# Excluir 'ID' si existe
X_rfe = df_rfe.drop(columns=['ID'], errors='ignore')

# 2. Imputar valores faltantes (NaN) usando la mediana
print("🔄 Imputando valores faltantes...")
imputer = SimpleImputer(strategy='median')  # Puedes cambiar 'median' por 'mean' si lo prefieres
X_rfe_imputed = imputer.fit_transform(X_rfe)

# Verificar si la imputación fue exitosa
if np.isnan(X_rfe_imputed).sum() == 0:
    print("✔ Todos los valores NaN han sido imputados correctamente.")
else:
    raise ValueError("❌ Persisten valores NaN después de la imputación.")

# 3. Escalar los datos imputados
try:
    X_rfe_scaled = scaler.transform(X_rfe_imputed)
    print("✔ Datos escalados correctamente.")
except Exception as e:
    raise RuntimeError(f"❌ Error al escalar los datos: {e}")

# 4. Realizar predicciones con el modelo k-NN
try:
    predictions = knn_model.predict(X_rfe_scaled)
    print("✔ Predicciones realizadas con éxito.")
except Exception as e:
    raise RuntimeError(f"❌ Error al predecir con el modelo k-NN: {e}")

# 5. Agregar las predicciones como columna 'label'
df_rfe = df_rfe.reset_index(drop=True)
df_rfe['label'] = predictions
print("✔ Columna 'label' añadida al DataFrame.")

# 6. Crear columna 'ID' si no existe
if 'ID' not in df_rfe.columns:
    print("🔄 Creando la columna 'ID' a partir del índice.")
    df_rfe['ID'] = range(1, len(df_rfe) + 1)

# 7. Crear DataFrame final y exportar
df_rfe_final = df_rfe[['ID', 'label']]
print("\nPrimeras filas del DataFrame final:")
print(df_rfe_final.head())

output_file = 'test_labels.csv'
df_rfe_final.to_csv(output_file, index=False)
print(f"✔ Archivo exportado exitosamente como '{output_file}'.")

✔ Scaler y modelo k-NN cargados exitosamente.
🔄 Imputando valores faltantes...
✔ Todos los valores NaN han sido imputados correctamente.
✔ Datos escalados correctamente.
✔ Predicciones realizadas con éxito.
✔ Columna 'label' añadida al DataFrame.

Primeras filas del DataFrame final:
   ID  label
0   1      0
1   2      0
2   3      0
3   4      0
4   5      0
✔ Archivo exportado exitosamente como 'test_labels.csv'.
