In [29]:
# Importaciones
import pandas as pd
import numpy as np 
from word2number import w2n

# Imputación de nulos usando métodos avanzados estadísticos
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

pd.set_option('display.max_columns', None)

In [30]:
df = pd.read_csv("Alzheimer Disease and Healthy Aging Data In US.csv", low_memory=False)

In [31]:
df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,DataValueTypeID,Data_Value_Type,Data_Value,Data_Value_Alt,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,StratificationCategory1,Stratification1,StratificationCategory2,Stratification2,Geolocation,ClassID,TopicID,QuestionID,LocationID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2
0,2020,2020,HI,Hawaii,BRFSS,Overall Health,Arthritis among older adults,Percentage of older adults ever told they have...,%,PRCTG,Percentage,26.3,26.3,23.9,29.0,,Age Group,50-64 years,,,POINT (-157.8577494 21.30485044),C01,TOC11,Q43,15,AGE,5064,OVERALL,OVERALL
1,2017,2017,ID,Idaho,BRFSS,Mental Health,Lifetime diagnosis of depression,Percentage of older adults with a lifetime dia...,%,PRCTG,Percentage,24.0,24.0,21.2,27.0,,Age Group,50-64 years,,,POINT (-114.36373 43.68263001),C05,TMC03,Q27,16,AGE,5064,OVERALL,OVERALL
2,2017,2017,ID,Idaho,BRFSS,Overall Health,Arthritis among older adults,Percentage of older adults ever told they have...,%,PRCTG,Percentage,35.7,35.7,32.5,39.1,,Age Group,50-64 years,,,POINT (-114.36373 43.68263001),C01,TOC11,Q43,16,AGE,5064,OVERALL,OVERALL
3,2018,2018,ID,Idaho,BRFSS,Overall Health,Physically unhealthy days (mean number of days),Physically unhealthy days (mean number of days...,Number,MEAN,Mean,4.8,4.8,4.0,5.5,,Age Group,50-64 years,,,POINT (-114.36373 43.68263001),C01,TOC01,Q08,16,AGE,5064,OVERALL,OVERALL
4,2020,2020,IN,Indiana,BRFSS,Mental Health,Lifetime diagnosis of depression,Percentage of older adults with a lifetime dia...,%,PRCTG,Percentage,13.7,13.7,12.1,15.4,,Age Group,Overall,Gender,Male,POINT (-86.14996019 39.76691045),C05,TMC03,Q27,18,AGE,AGE_OVERALL,GENDER,MALE


In [32]:
df["Sample_Size"].unique()

array([nan])

In [33]:
df.columns

Index(['YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'Datasource',
       'Class', 'Topic', 'Question', 'Data_Value_Unit', 'DataValueTypeID',
       'Data_Value_Type', 'Data_Value', 'Data_Value_Alt',
       'Low_Confidence_Limit', 'High_Confidence_Limit', 'Sample_Size',
       'StratificationCategory1', 'Stratification1', 'StratificationCategory2',
       'Stratification2', 'Geolocation', 'ClassID', 'TopicID', 'QuestionID',
       'LocationID', 'StratificationCategoryID1', 'StratificationID1',
       'StratificationCategoryID2', 'StratificationID2'],
      dtype='object')

In [34]:
#FUNCIONES EXPLORACION#

def exploracion_general (dataframe):
    """Esta función proporciona una descripción personalizada de un DataFrame,
    incluyendo estadísticas descriptivas y tipos de datos de cada columna.
    
    Args:
    df : El DataFrame para el cual se generará la descripción

    Returns:
    La funcion no tiene return pero devuelve varios prints con
    la informacion que necesitamos:
    - describe separados por col numericas y categoricas
    - dtypes por columna
    - shape
    - info
    - total de nulos
    - total de duplicados)"""
       
    print(f"------EXPLORACION DATAFRAME ABC CORPORATION------")
    print("-------Descripción numéricas:---------")
    print(dataframe.describe())
    print("-------Descripción categoricas:---------")
    print(dataframe.describe(include="O"))
    print("------Tipos:---------")
    print(dataframe.dtypes)
    print("------Forma del DataFrame:------")
    print(dataframe.shape)
    print("------Información:---------")
    print(dataframe.info())
    print("------Nulos:---------")
    print(dataframe.isnull().sum())
    print("------Duplicados:---------")
    print(dataframe.duplicated().sum())
    
def exploracion_columna (dataframe):

    for columna in list(dataframe.columns):

        print(f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n")
        print(f"* Nº de datos: {len(dataframe[columna].to_list())}")
        print(f"* Frecuencia de valores en la columna: \n {dataframe[columna].value_counts()}")
        print(f"* Datos unicos en la columna {len(dataframe[columna].unique())}")
        print(f"* Los valores son de tipo: {type(columna)}")
        print(f"La suma de datos nulos {dataframe[columna].isnull().sum()}")
        print(dataframe[columna].unique()) 

In [35]:
exploracion_general(df)

------EXPLORACION DATAFRAME ABC CORPORATION------
-------Descripción numéricas:---------
           YearStart        YearEnd     Data_Value  Data_Value_Alt  \
count  214462.000000  214462.000000  144629.000000   144629.000000   
mean     2017.378477    2017.634000      37.341956       37.341956   
std         1.779822       1.778926      25.183017       25.183017   
min      2015.000000    2015.000000       0.000000        0.000000   
25%      2016.000000    2016.000000      15.300000       15.300000   
50%      2017.000000    2018.000000      32.500000       32.500000   
75%      2019.000000    2019.000000      56.800000       56.800000   
max      2020.000000    2020.000000     100.000000      100.000000   

       Sample_Size     LocationID  
count          0.0  214462.000000  
mean           NaN     800.987821  
std            NaN    2512.934094  
min            NaN       1.000000  
25%            NaN      18.000000  
50%            NaN      33.000000  
75%            NaN      49.0

In [36]:
exploracion_columna(df)

 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'YEARSTART' -----------

* Nº de datos: 214462
* Frecuencia de valores en la columna: 
 2015    45980
2020    36006
2019    34354
2016    34145
2017    33429
2018    30548
Name: YearStart, dtype: int64
* Datos unicos en la columna 6
* Los valores son de tipo: <class 'str'>
La suma de datos nulos 0
[2020 2017 2018 2019 2015 2016]
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'YEAREND' -----------

* Nº de datos: 214462
* Frecuencia de valores en la columna: 
 2020    46966
2015    35020
2019    34354
2016    34145
2017    33429
2018    30548
Name: YearEnd, dtype: int64
* Datos unicos en la columna 6
* Los valores son de tipo: <class 'str'>
La suma de datos nulos 0
[2020 2017 2018 2019 2015 2016]
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'LOCATIONABBR' -----------

* Nº de datos: 214462
* Frecuencia de valores en la columna: 
 US      4644
WEST    4638
NRE     4614
MDW     4611
OR      4565
NY      4557
SOU     4542
UT      4222
OH      3955

In [37]:
columns_to_drop = ["LocationAbbr", "Data_Value_Unit", "Data_Value_Type", "Data_Value_Alt", 
                   "Sample_Size", "StratificationCategory2", "Stratification2", 
                   "StratificationCategoryID1", "StratificationID1", "StratificationCategory1"]

# Eliminar las columnas especificadas
df = df.drop(columns=columns_to_drop)

In [38]:
df.columns

Index(['YearStart', 'YearEnd', 'LocationDesc', 'Datasource', 'Class', 'Topic',
       'Question', 'DataValueTypeID', 'Data_Value', 'Low_Confidence_Limit',
       'High_Confidence_Limit', 'Stratification1', 'Geolocation', 'ClassID',
       'TopicID', 'QuestionID', 'LocationID', 'StratificationCategoryID2',
       'StratificationID2'],
      dtype='object')

In [39]:
# Gestión de duplicados

df.drop_duplicates(keep = "first", inplace = True)


In [40]:
df.duplicated().sum()

0

In [41]:
# Gestión de nulos
# df = df.dropna()  # Elimina filas con valores nulos

In [42]:
df.isnull().sum() / df.shape[0] * 100

YearStart                     0.000000
YearEnd                       0.000000
LocationDesc                  0.000000
Datasource                    0.000000
Class                         0.000000
Topic                         0.000000
Question                      0.000000
DataValueTypeID               0.000000
Data_Value                   32.561946
Low_Confidence_Limit         32.635152
High_Confidence_Limit        32.635152
Stratification1               0.000000
Geolocation                  10.747359
ClassID                       0.000000
TopicID                       0.000000
QuestionID                    0.000000
LocationID                    0.000000
StratificationCategoryID2     0.000000
StratificationID2             0.000000
dtype: float64

In [43]:
df[df["Question"] == "Percentage of older adults who are currently obese, with a body mass index (BMI) of 30 or more"].head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,Datasource,Class,Topic,Question,DataValueTypeID,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Stratification1,Geolocation,ClassID,TopicID,QuestionID,LocationID,StratificationCategoryID2,StratificationID2
49,2017,2017,Alaska,BRFSS,Nutrition/Physical Activity/Obesity,Obesity,Percentage of older adults who are currently o...,PRCTG,,,,65 years or older,POINT (-147.722059 64.84507996),C02,TNC04,Q13,2,RACE,HIS
56,2017,2017,Arizona,BRFSS,Nutrition/Physical Activity/Obesity,Obesity,Percentage of older adults who are currently o...,PRCTG,54.6,44.7,64.2,50-64 years,POINT (-111.7638113 34.86597028),C02,TNC04,Q13,4,RACE,NAA
59,2020,2020,Alaska,BRFSS,Nutrition/Physical Activity/Obesity,Obesity,Percentage of older adults who are currently o...,PRCTG,33.0,26.8,39.8,65 years or older,POINT (-147.722059 64.84507996),C02,TNC04,Q13,2,GENDER,MALE
93,2020,2020,Alaska,BRFSS,Nutrition/Physical Activity/Obesity,Obesity,Percentage of older adults who are currently o...,PRCTG,42.1,33.5,51.3,Overall,POINT (-147.722059 64.84507996),C02,TNC04,Q13,2,RACE,NAA
99,2018,2018,Alaska,BRFSS,Nutrition/Physical Activity/Obesity,Obesity,Percentage of older adults who are currently o...,PRCTG,32.4,28.9,36.2,Overall,POINT (-147.722059 64.84507996),C02,TNC04,Q13,2,RACE,WHT


In [44]:
df[df["LocationDesc"] == "West"]

Unnamed: 0,YearStart,YearEnd,LocationDesc,Datasource,Class,Topic,Question,DataValueTypeID,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Stratification1,Geolocation,ClassID,TopicID,QuestionID,LocationID,StratificationCategoryID2,StratificationID2
23,2020,2020,West,BRFSS,Overall Health,Arthritis among older adults,Percentage of older adults ever told they have...,PRCTG,41.9,40.4,43.4,Overall,,C01,TOC11,Q43,9004,RACE,WHT
140776,2020,2020,West,BRFSS,Overall Health,Recent activity limitations in past month,Mean number of days with activity limitations ...,MEAN,6.3,5.7,6.8,Overall,,C01,TOC03,Q35,9004,OVERALL,OVERALL
150060,2019,2019,West,BRFSS,Overall Health,Physically unhealthy days (mean number of days),Physically unhealthy days (mean number of days...,MEAN,4.9,4.7,5.1,Overall,,C01,TOC01,Q08,9004,RACE,WHT
150078,2019,2019,West,BRFSS,Overall Health,Physically unhealthy days (mean number of days),Physically unhealthy days (mean number of days...,MEAN,5.7,5.2,6.3,50-64 years,,C01,TOC01,Q08,9004,RACE,HIS
150298,2020,2020,West,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,PRCTG,13.7,9,20.5,65 years or older,,C05,TMC01,Q03,9004,RACE,HIS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211547,2017,2017,West,BRFSS,Screenings and Vaccines,Diabetes screening within past 3 years,Percentage of older adults without diabetes wh...,PRCTG,63.5,56.9,69.5,50-64 years,,C03,TSC04,Q19,9004,RACE,NAA
211548,2015,2015,West,BRFSS,Overall Health,Self-rated health (fair to poor health),Percentage of older adults who self-reported t...,PRCTG,47.6,42.7,52.5,65 years or older,,C01,TOC07,Q32,9004,RACE,HIS
211549,2015,2015,West,BRFSS,Caregiving,Intensity of caregiving among older adults,Average of 20 or more hours of care per week p...,PRCTG,44.3,24.4,66.3,50-64 years,,C07,TGC04,Q39,9004,RACE,HIS
211555,2015,2015,West,BRFSS,Screenings and Vaccines,Diabetes screening within past 3 years,Percentage of older adults without diabetes wh...,PRCTG,63.1,34.4,84.8,50-64 years,,C03,TSC04,Q19,9004,RACE,BLK


In [45]:
df['Iguales'] = df['YearStart'] == df['YearEnd']

In [46]:
df['Iguales'].value_counts()

True     203502
False     10960
Name: Iguales, dtype: int64

In [47]:
df.columns

Index(['YearStart', 'YearEnd', 'LocationDesc', 'Datasource', 'Class', 'Topic',
       'Question', 'DataValueTypeID', 'Data_Value', 'Low_Confidence_Limit',
       'High_Confidence_Limit', 'Stratification1', 'Geolocation', 'ClassID',
       'TopicID', 'QuestionID', 'LocationID', 'StratificationCategoryID2',
       'StratificationID2', 'Iguales'],
      dtype='object')

In [48]:
# Renombrar columnas
df = df.rename(columns= {'YearStart':'Year_Start',
'YearEnd':'Year_End', 
'LocationDesc':'Location',
'Datasource':'Data_Source',
'Class':'Class', 
'Topic':'Topic',
'Question':'Question', 
'DataValueTypeID':'Data_Value_Type', 
'Data_Value':'Data_Value',
'Low_Confidence_Limit':'Low_Confidence_Limit',
'High_Confidence_Limit':'High_Confidence_Limit', 
'Stratification1':'Age',
'Geolocation':'Geolocation', 
'ClassID':'ClassID',
'TopicID':'TopicID',
'QuestionID':'QuestionID', 
'LocationID':'LocationID', 
'StratificationCategoryID2':'Category',
'StratificationID2':'Gender_or_Race', 
'Iguales':'Same_Start_End_Year'})

In [49]:
df.head()

Unnamed: 0,Year_Start,Year_End,Location,Data_Source,Class,Topic,Question,Data_Value_Type,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Age,Geolocation,ClassID,TopicID,QuestionID,LocationID,Category,Gender_or_Race,Same_Start_End_Year
0,2020,2020,Hawaii,BRFSS,Overall Health,Arthritis among older adults,Percentage of older adults ever told they have...,PRCTG,26.3,23.9,29.0,50-64 years,POINT (-157.8577494 21.30485044),C01,TOC11,Q43,15,OVERALL,OVERALL,True
1,2017,2017,Idaho,BRFSS,Mental Health,Lifetime diagnosis of depression,Percentage of older adults with a lifetime dia...,PRCTG,24.0,21.2,27.0,50-64 years,POINT (-114.36373 43.68263001),C05,TMC03,Q27,16,OVERALL,OVERALL,True
2,2017,2017,Idaho,BRFSS,Overall Health,Arthritis among older adults,Percentage of older adults ever told they have...,PRCTG,35.7,32.5,39.1,50-64 years,POINT (-114.36373 43.68263001),C01,TOC11,Q43,16,OVERALL,OVERALL,True
3,2018,2018,Idaho,BRFSS,Overall Health,Physically unhealthy days (mean number of days),Physically unhealthy days (mean number of days...,MEAN,4.8,4.0,5.5,50-64 years,POINT (-114.36373 43.68263001),C01,TOC01,Q08,16,OVERALL,OVERALL,True
4,2020,2020,Indiana,BRFSS,Mental Health,Lifetime diagnosis of depression,Percentage of older adults with a lifetime dia...,PRCTG,13.7,12.1,15.4,Overall,POINT (-86.14996019 39.76691045),C05,TMC03,Q27,18,GENDER,MALE,True


In [50]:
df[df["Data_Value"] == 100]

Unnamed: 0,Year_Start,Year_End,Location,Data_Source,Class,Topic,Question,Data_Value_Type,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Age,Geolocation,ClassID,TopicID,QuestionID,LocationID,Category,Gender_or_Race,Same_Start_End_Year
80853,2020,2020,Midwest,BRFSS,Cognitive Decline,Functional difficulties associated with subjec...,Percentage of older adults who reported subjec...,PRCTG,100.0,.,.,50-64 years,,C06,TCC02,Q31,9002,RACE,ASN,True
83960,2020,2020,Midwest,BRFSS,Cognitive Decline,Need assistance with day-to-day activities bec...,Percentage of older adults who reported that a...,PRCTG,100.0,.,.,50-64 years,,C06,TCC03,Q41,9002,RACE,ASN,True
88697,2020,2020,Midwest,BRFSS,Caregiving,Expect to provide care for someone in the next...,Percentage of older adults currently not provi...,PRCTG,100.0,.,.,65 years or older,,C07,TGC02,Q37,9002,RACE,ASN,True
134846,2020,2020,Northeast,BRFSS,Cognitive Decline,Functional difficulties associated with subjec...,Percentage of older adults who reported subjec...,PRCTG,100.0,.,.,50-64 years,,C06,TCC02,Q31,9001,RACE,ASN,True
135630,2019,2019,Northeast,BRFSS,Cognitive Decline,Functional difficulties associated with subjec...,Percentage of older adults who reported subjec...,PRCTG,100.0,.,.,65 years or older,,C06,TCC02,Q31,9001,RACE,NAA,True
137152,2020,2020,Northeast,BRFSS,Caregiving,Intensity of caregiving among older adults,Average of 20 or more hours of care per week p...,PRCTG,100.0,.,.,50-64 years,,C07,TGC04,Q39,9001,RACE,NAA,True
137341,2020,2020,Northeast,BRFSS,Caregiving,Duration of caregiving among older adults,Percentage of older adults who provided care t...,PRCTG,100.0,.,.,50-64 years,,C07,TGC03,Q38,9001,RACE,NAA,True
137846,2020,2020,Northeast,BRFSS,Caregiving,Intensity of caregiving among older adults,Average of 20 or more hours of care per week p...,PRCTG,100.0,.,.,Overall,,C07,TGC04,Q39,9001,RACE,NAA,True
138053,2020,2020,Northeast,BRFSS,Caregiving,Duration of caregiving among older adults,Percentage of older adults who provided care t...,PRCTG,100.0,.,.,Overall,,C07,TGC03,Q38,9001,RACE,NAA,True
138086,2020,2020,Northeast,BRFSS,Cognitive Decline,Talked with health care professional about sub...,Percentage of older adults with subjective cog...,PRCTG,100.0,.,.,50-64 years,,C06,TCC04,Q42,9001,RACE,ASN,True


In [51]:
def replace_dot (columna):

    
    diccionario_mapeo = {".": np.nan}
    try:
        df[columna] = df[columna].map(diccionario_mapeo)

    except:
        print(f"Error al transformar la columna {columna}")

In [52]:
replace_dot("Low_Confidence_Limit")
replace_dot("High_Confidence_Limit")

In [53]:
df[df["Data_Value"] == 100]

Unnamed: 0,Year_Start,Year_End,Location,Data_Source,Class,Topic,Question,Data_Value_Type,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Age,Geolocation,ClassID,TopicID,QuestionID,LocationID,Category,Gender_or_Race,Same_Start_End_Year
80853,2020,2020,Midwest,BRFSS,Cognitive Decline,Functional difficulties associated with subjec...,Percentage of older adults who reported subjec...,PRCTG,100.0,,,50-64 years,,C06,TCC02,Q31,9002,RACE,ASN,True
83960,2020,2020,Midwest,BRFSS,Cognitive Decline,Need assistance with day-to-day activities bec...,Percentage of older adults who reported that a...,PRCTG,100.0,,,50-64 years,,C06,TCC03,Q41,9002,RACE,ASN,True
88697,2020,2020,Midwest,BRFSS,Caregiving,Expect to provide care for someone in the next...,Percentage of older adults currently not provi...,PRCTG,100.0,,,65 years or older,,C07,TGC02,Q37,9002,RACE,ASN,True
134846,2020,2020,Northeast,BRFSS,Cognitive Decline,Functional difficulties associated with subjec...,Percentage of older adults who reported subjec...,PRCTG,100.0,,,50-64 years,,C06,TCC02,Q31,9001,RACE,ASN,True
135630,2019,2019,Northeast,BRFSS,Cognitive Decline,Functional difficulties associated with subjec...,Percentage of older adults who reported subjec...,PRCTG,100.0,,,65 years or older,,C06,TCC02,Q31,9001,RACE,NAA,True
137152,2020,2020,Northeast,BRFSS,Caregiving,Intensity of caregiving among older adults,Average of 20 or more hours of care per week p...,PRCTG,100.0,,,50-64 years,,C07,TGC04,Q39,9001,RACE,NAA,True
137341,2020,2020,Northeast,BRFSS,Caregiving,Duration of caregiving among older adults,Percentage of older adults who provided care t...,PRCTG,100.0,,,50-64 years,,C07,TGC03,Q38,9001,RACE,NAA,True
137846,2020,2020,Northeast,BRFSS,Caregiving,Intensity of caregiving among older adults,Average of 20 or more hours of care per week p...,PRCTG,100.0,,,Overall,,C07,TGC04,Q39,9001,RACE,NAA,True
138053,2020,2020,Northeast,BRFSS,Caregiving,Duration of caregiving among older adults,Percentage of older adults who provided care t...,PRCTG,100.0,,,Overall,,C07,TGC03,Q38,9001,RACE,NAA,True
138086,2020,2020,Northeast,BRFSS,Cognitive Decline,Talked with health care professional about sub...,Percentage of older adults with subjective cog...,PRCTG,100.0,,,50-64 years,,C06,TCC04,Q42,9001,RACE,ASN,True


In [54]:
# Guardar el cvs limpio
df.to_csv('alzheimer_data_cleaned.csv', index=False)