In [1]:
# Importaciones
import pandas as pd
import numpy as np 
from word2number import w2n

# Imputación de nulos usando métodos avanzados estadísticos
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv("Mental Health Dataset.csv", low_memory=False)

In [3]:
df.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:31,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,2014-08-27 11:31:50,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,2014-08-27 11:32:39,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,2014-08-27 11:37:59,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:36,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


In [4]:
df.columns

Index(['Timestamp', 'Gender', 'Country', 'Occupation', 'self_employed',
       'family_history', 'treatment', 'Days_Indoors', 'Growing_Stress',
       'Changes_Habits', 'Mental_Health_History', 'Mood_Swings',
       'Coping_Struggles', 'Work_Interest', 'Social_Weakness',
       'mental_health_interview', 'care_options'],
      dtype='object')

In [5]:
#FUNCIONES EXPLORACION#

def exploracion_general (dataframe):
    """Esta función proporciona una descripción personalizada de un DataFrame,
    incluyendo estadísticas descriptivas y tipos de datos de cada columna.
    
    Args:
    df : El DataFrame para el cual se generará la descripción

    Returns:
    La funcion no tiene return pero devuelve varios prints con
    la informacion que necesitamos:
    - describe separados por col numericas y categoricas
    - dtypes por columna
    - shape
    - info
    - total de nulos
    - total de duplicados)"""
       
    print(f"------EXPLORACION DATAFRAME ABC CORPORATION------")
    print("-------Descripción numéricas:---------")
    print(dataframe.describe())
    print("-------Descripción categoricas:---------")
    print(dataframe.describe(include="O"))
    print("------Tipos:---------")
    print(dataframe.dtypes)
    print("------Forma del DataFrame:------")
    print(dataframe.shape)
    print("------Información:---------")
    print(dataframe.info())
    print("------Nulos:---------")
    print(dataframe.isnull().sum())
    print("------Duplicados:---------")
    print(dataframe.duplicated().sum())
    
def exploracion_columna (dataframe):

    for columna in list(dataframe.columns):

        print(f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n")
        print(f"* Nº de datos: {len(dataframe[columna].to_list())}")
        print(f"* Frecuencia de valores en la columna: \n {dataframe[columna].value_counts()}")
        print(f"* Datos unicos en la columna {len(dataframe[columna].unique())}")
        print(f"* Los valores son de tipo: {type(columna)}")
        print(f"La suma de datos nulos {dataframe[columna].isnull().sum()}")
        print(dataframe[columna].unique()) 

In [6]:
exploracion_general(df)

------EXPLORACION DATAFRAME ABC CORPORATION------
-------Descripción numéricas:---------
                  Timestamp  Gender        Country Occupation self_employed  \
count                292364  292364         292364     292364        287162   
unique                  734       2             35          5             2   
top     2014-08-27 12:31:41    Male  United States  Housewife            No   
freq                    780  239850         171308      66351        257994   

       family_history treatment Days_Indoors Growing_Stress Changes_Habits  \
count          292364    292364       292364         292364         292364   
unique              2         2            5              3              3   
top                No       Yes    1-14 days          Maybe            Yes   
freq           176832    147606        63548          99985         109523   

       Mental_Health_History Mood_Swings Coping_Struggles Work_Interest  \
count                 292364      292364         

In [7]:
exploracion_columna(df)

 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'TIMESTAMP' -----------

* Nº de datos: 292364
* Frecuencia de valores en la columna: 
 2014-08-27 12:31:41    780
2014-08-27 12:54:11    780
2014-08-27 11:29:31    434
2014-08-28 17:20:41    434
2014-08-28 13:41:51    434
                      ... 
2014-08-27 15:27:38    390
2014-08-27 15:27:39    390
2014-08-27 15:28:34    390
2014-08-27 15:31:18    390
2016-02-01 23:04:31    390
Name: Timestamp, Length: 734, dtype: int64
* Datos unicos en la columna 734
* Los valores son de tipo: <class 'str'>
La suma de datos nulos 0
['2014-08-27 11:29:31' '2014-08-27 11:31:50' '2014-08-27 11:32:39'
 '2014-08-27 11:37:59' '2014-08-27 11:43:36' '2014-08-27 11:49:51'
 '2014-08-27 11:51:34' '2014-08-27 11:52:41' '2014-08-27 12:18:38'
 '2014-08-27 12:37:50' '2014-08-27 12:39:18' '2014-08-27 12:40:06'
 '2014-08-27 12:49:27' '2014-08-27 12:49:30' '2014-08-27 12:50:57'
 '2014-08-27 12:51:51' '2014-08-27 12:53:13' '2014-08-27 12:53:15'
 '2014-08-27 12:55:01' '20

In [8]:
df["Gender"].unique()

array(['Female', 'Male'], dtype=object)

In [9]:
df["Country"].unique()

array(['United States', 'Poland', 'Australia', 'Canada', 'United Kingdom',
       'South Africa', 'Sweden', 'New Zealand', 'Netherlands', 'India',
       'Belgium', 'Ireland', 'France', 'Portugal', 'Brazil', 'Costa Rica',
       'Russia', 'Germany', 'Switzerland', 'Finland', 'Israel', 'Italy',
       'Bosnia and Herzegovina', 'Singapore', 'Nigeria', 'Croatia',
       'Thailand', 'Denmark', 'Mexico', 'Greece', 'Moldova', 'Colombia',
       'Georgia', 'Czech Republic', 'Philippines'], dtype=object)

In [10]:
df["Occupation"].unique()

array(['Corporate', 'Student', 'Business', 'Housewife', 'Others'],
      dtype=object)

In [11]:
df["Days_Indoors"].unique()

array(['1-14 days', 'Go out Every day', 'More than 2 months',
       '15-30 days', '31-60 days'], dtype=object)

In [12]:
df.duplicated().sum()

363

In [13]:
# Gestión de duplicados
df.drop_duplicates(keep = "first", inplace = True) # Habia 363

In [14]:
df.duplicated().sum()

0

In [22]:
# Gestión de nulos  -  Columna self_employed
df["self_employed"].unique()

array([nan, 'No', 'Yes'], dtype=object)

In [23]:
# Reemplazar Nan por Unknown
df["self_employed"] = df["self_employed"].fillna("Unknown")

In [24]:
# Verificación de nulos
df.isnull().sum() / df.shape[0] * 100

Timestamp                  0.0
Gender                     0.0
Country                    0.0
Occupation                 0.0
self_employed              0.0
family_history             0.0
treatment                  0.0
Days_Indoors               0.0
Growing_Stress             0.0
Changes_Habits             0.0
Mental_Health_History      0.0
Mood_Swings                0.0
Coping_Struggles           0.0
Work_Interest              0.0
Social_Weakness            0.0
mental_health_interview    0.0
care_options               0.0
dtype: float64

In [25]:
df.columns

Index(['Timestamp', 'Gender', 'Country', 'Occupation', 'self_employed',
       'family_history', 'treatment', 'Days_Indoors', 'Growing_Stress',
       'Changes_Habits', 'Mental_Health_History', 'Mood_Swings',
       'Coping_Struggles', 'Work_Interest', 'Social_Weakness',
       'mental_health_interview', 'care_options'],
      dtype='object')

In [26]:
# Renombrar columnas primera letra en Mayúsculas
df = df.rename(columns= {'Timestamp': 'Timestamp',
    'Gender': 'Gender',
    'Country': 'Country',
    'Occupation': 'Occupation',
    'self_employed': 'Self_employed',
    'family_history': 'Family_history',
    'treatment': 'Treatment',
    'Days_Indoors': 'Days_Indoors',
    'Growing_Stress': 'Growing_Stress',
    'Changes_Habits': 'Changes_Habits',
    'Mental_Health_History': 'Mental_Health_History',
    'Mood_Swings': 'Mood_Swings',
    'Coping_Struggles': 'Coping_Struggles',
    'Work_Interest': 'Work_Interest',
    'Social_Weakness': 'Social_Weakness',
    'mental_health_interview': 'Mental_health_interview',
    'care_options': 'Care_options'})

In [28]:
df.columns

Index(['Timestamp', 'Gender', 'Country', 'Occupation', 'Self_employed',
       'Family_history', 'Treatment', 'Days_Indoors', 'Growing_Stress',
       'Changes_Habits', 'Mental_Health_History', 'Mood_Swings',
       'Coping_Struggles', 'Work_Interest', 'Social_Weakness',
       'Mental_health_interview', 'Care_options'],
      dtype='object')

In [29]:
# Guardar el cvs limpio
df.to_csv('Mental Health Clean.csv', index=False)