<h1 style="color:blue"> IMPORTAR LIBRERÍAS </h1>

In [3]:
# Librerías
import sys
import warnings
import pathlib
from termcolor import colored

# Librerías para manipulación de datos
import pandas as pd
import numpy as np
from scipy import stats
import re
import unicodedata
import nltk
import unicodedata
from random import sample
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from unidecode import unidecode
from nltk.corpus import stopwords
from nltk import FreqDist
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Librerías para visualización de datos
import plotly
import plotly.graph_objects as go
import plotly.express as px
import cufflinks as cf
import stylecloud
from PIL import Image
from plotly.offline import plot,iplot
# Make pandas' own `.plot()` accessor draw with plotly.
pd.options.plotting.backend = "plotly"
# Switch cufflinks to offline mode (presumably so `.iplot()` renders locally
# without a plotly cloud account -- confirm against the cufflinks docs).
cf.go_offline()
# Allow up to 200 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns",200)

<h1 style="color:blue"> FUNCIONES PARA MANIPULACIÓN DE DATOS </h1>

### FUNCIÓN PARA ETIQUETAR VARIABLES

In [2]:
# Helper used to rename columns so that their names carry a type prefix.
def label_columns(df,feats,prefix):
    """Add a prefix to the names of the selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame whose columns will be renamed.
    feats : list of strings
        Names of the columns that receive the prefix.
    prefix : string
        Text placed in front of every selected column name.

    Returns
    -------
    pandas.DataFrame
        Result of ``df.rename`` with the selected columns prefixed; columns
        not listed in ``feats`` keep their names.
    """
    renames = {name: prefix + name for name in feats}
    return df.rename(columns=renames)

### FUNCIÓN PARA EVALUAR COMPLETITUD

In [3]:
def completitud(df):
    """Report how complete (non-missing) every column of a data frame is.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per input column, sorted from least to most complete, with:
            columna: column name
            total: number of missing values in the column
            completitud: percentage of non-missing values
    """
    n_rows = df.shape[0]
    report = (
        pd.DataFrame(df.isnull().sum())
        .reset_index()
        .rename(columns={"index": "columna", 0: "total"})
    )
    # Share of rows that are present, expressed as a percentage.
    report["completitud"] = (1 - report["total"] / n_rows) * 100
    ordered = report.sort_values(by="completitud", ascending=True)
    return ordered.reset_index(drop=True)

### FUNCIÓN PARA HOMOLOGAR 'SEXO'

In [6]:
def sex(texto):
    """Homologate a sex code.

    Returns ``texto`` unchanged when it is exactly 'M' or 'F'; any other
    value is replaced by the string 'NaN'.
    """
    return texto if texto in ('M', 'F') else 'NaN'

### FUNCIÓN PARA GRÁFICO DE BARRAS - CONTEO

In [2]:
def my_bar_count(df,x,title="",x_title="",y_title=""):
    """Count the categories of a column and draw them as a plotly bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        data frame to extract data from
    x : string
        column from data frame whose categories are counted and plotted
    title : str, optional
        title of the plot, by default ""
    x_title : str, optional
        x axis title, by default ""
    y_title : str, optional
        y axis title, by default ""

    Returns
    -------
    plotly figure
    """
    layout = go.Layout(font_family="Droid Sans, monospace",
        font_color="black",title_text=title,title_font_size=22,
        xaxis= {"title": {"text": x_title,"font": {"family": 'Droid Sans, monospace',"size": 13, "color": '#0531a9'}}},
        yaxis= {"title": {"text": y_title,"font": {"family": 'Droid Sans, monospace',"size": 13, "color": '#0531a9'}}},
        title_font_family='Droid Sans Mono',title_font_color="#0531a9",
        template="plotly", plot_bgcolor="rgb(219,229,239)")
    # Build the counts table explicitly instead of relying on the column names
    # produced by `value_counts().reset_index()`: those names differ between
    # pandas < 2.0 ("index", <x>) and pandas >= 2.0 (<x>, "count").
    # "conteo" holds the categories and the column named `x` holds the counts.
    counts=df[x].value_counts()
    aux=pd.DataFrame({"conteo":counts.index,x:counts.values})
    fig=aux.iplot(kind='bar',x="conteo",y=x,title=title,asFigure=True,barmode="overlay",sortbars=True,color='#102568',layout=layout,width=5,annotations=True)
    fig.update_layout(width=800)
    fig.update_traces(marker_color='#358bdf',opacity=0.9)
    return fig

### FUNCIÓN PARA HISTOGRAMA

In [4]:
def my_histogram(df,col,bins,title="",x_title="",y_title="conteo"):
    """Draw a plotly histogram of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        data frame to extract data from
    col : string
        column from data frame to plot
    bins : int
        number of bins for histogram
    title : str, optional
        title of the plot, by default ""
    x_title : str, optional
        x axis title, by default ""
    y_title : str, optional
        y axis title, by default "conteo"

    Returns
    -------
    plotly figure
    """
    def _axis(text):
        # Both axes share the same title font; only the caption differs.
        return {"title": {"text": text,
                          "font": {"family": 'Courier New, monospace',
                                   "size": 12,
                                   "color": '#002e4d'}}}

    layout = go.Layout(
        font_family="Courier New, monospace",
        font_color="black",
        title_text=title,
        title_font_size=20,
        title_font_family="Arial",
        title_font_color="#002020",
        xaxis=_axis(x_title),
        yaxis=_axis(y_title),
        template="plotly_white",
        plot_bgcolor="rgb(168,168,168)",
    )
    figure = df[[col]].iplot(
        kind='histogram',
        x=col,
        bins=bins,
        title=title,
        asFigure=True,
        layout=layout,
        sortbars=True,
        linecolor='#2b2b2b',
    )
    figure.update_traces(marker_color='#045C8C', opacity=0.7)
    return figure

### FUNCIÓN GRÁFICO DE CAJAS Y BIGOTES

In [5]:
def my_box(df,columns,values,title="",x_title="",y_title=""):
    """Draw a plotly box plot with one box per category.

    Parameters
    ----------
    df : pandas.DataFrame
        data frame to extract data from
    columns : string
        column whose categories define the boxes (independent variable)
    values : string
        column whose distribution is drawn in each box (dependent variable)
    title : str, optional
        title of the plot, by default ""
    x_title : str, optional
        x axis title, by default ""
    y_title : str, optional
        y axis title, by default ""

    Returns
    -------
    plotly figure
    """
    def _axis(text):
        # Both axes share the same title font; only the caption differs.
        return {"title": {"text": text,
                          "font": {"family": 'Courier New, monospace',
                                   "size": 12,
                                   "color": '#002e4d'}}}

    palette = ['#4676d0', '#19293c', '#6faa9f', '#ccceb1', '#344647',
               '#02160f', '#779a7c', '#070919', '#2b2b2b', '#121212']
    layout = go.Layout(
        font_family="Courier New, monospace",
        font_color="black",
        title_text=title,
        title_font_size=20,
        title_font_family="Arial",
        title_font_color="#002020",
        xaxis=_axis(x_title),
        yaxis=_axis(y_title),
        template="plotly_white",
        plot_bgcolor="rgb(208,208,208)",
    )
    # One column per category so that every category gets its own box.
    wide = df.pivot(columns=columns, values=values)
    return wide.iplot(kind='box', title=title, asFigure=True, theme="white",
                      layout=layout, color=palette)

### FUNCIÓN PARA GRÁFICO DE PASTEL

In [6]:
def my_pie_count(df,col,title=""):
    """Count the categories of a column and draw them as a plotly pie chart.

    Parameters
    ----------
    df : pandas.DataFrame
        data frame to extract data from
    col : string
        column from data frame whose categories are counted and plotted
    title : str, optional
        title of the plot, by default ""

    Returns
    -------
    plotly figure
    """
    colors=['#017664','#152228','#e7eee7','#102568','#d31717','#5747aa','#9b86ec','#9b86ec','#3c4b9b','#8f1b22','#121212']
    # Build the counts table explicitly instead of relying on the column names
    # produced by `value_counts().reset_index()`: those names differ between
    # pandas < 2.0 ("index", <col>) and pandas >= 2.0 (<col>, "count").
    # "conteo" holds the categories and the column named `col` holds the counts.
    counts=df[col].value_counts()
    aux=pd.DataFrame({"conteo":counts.index,col:counts.values})
    fig=aux.iplot(kind='pie',labels='conteo',values=col,title=title,asFigure=True,theme="white")
    fig.update_traces(textfont_size=14,opacity=0.65,
                  marker=dict(colors=colors))
    fig.update_layout(font_family="Courier New, monospace",
    font_color="blue",title_text=title,title_font_size=23,title_font_family="Droid Sans",title_font_color="#0531a9",template="plotly_white")
    return fig

### FUNCIÓN PARA IDENTIFICAR OUTLIERS

In [17]:
def OUTLIERS(data,cols):
    """Search for outliers with three methods and summarise the matches.

    For every column in ``cols`` a row is flagged when:

    * IQR: the value lies below ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR``;
    * Percentile: the value lies below the 5th or above the 95th percentile
      of the non-missing values;
    * Z-Score: the absolute z-score (missing values omitted) is at least 3.

    The three index lists are then combined with ``aux_outliers``, which keeps
    the indexes reported by at least two of the methods.

    Parameters
    ----------
    data : pandas.DataFrame
        data frame to be analyzed for outliers. It is only read, never modified.
    cols : list of strings
        columns to analyze for outliers

    Returns
    -------
    pandas.DataFrame
        One row per analyzed column with the number and percentage of rows
        flagged by each method, the number and percentage of combined
        outliers, and the list of their indexes.
    """
    n_rows = data.shape[0]
    total = []
    total_per = []
    total_z = []
    indices_ = []

    for col in cols:
        serie = data[col]

        # IQR fences
        q1 = serie.quantile(0.25)
        q3 = serie.quantile(0.75)
        iqr = q3 - q1
        inf = q1 - 1.5 * iqr
        sup = q3 + 1.5 * iqr
        indices_iqr = list(serie[(serie < inf) | (serie > sup)].index)
        total.append(len(indices_iqr))

        # 5th / 95th percentiles (np.percentile does not skip NaN by itself)
        non_missing = serie.dropna()
        inf_pe = np.percentile(non_missing, 5)
        sup_pe = np.percentile(non_missing, 95)
        indices_per = list(serie[(serie < inf_pe) | (serie > sup_pe)].index)
        total_per.append(len(indices_per))

        # Z-score; NaN scores compare as False and are therefore never flagged
        z = np.abs(stats.zscore(serie, nan_policy='omit'))
        indices_z = list(serie[z >= 3].index)
        total_z.append(len(indices_z))

        indices_.append(aux_outliers(indices_iqr, indices_per, indices_z))

    results = pd.DataFrame({
        "features": cols,
        "n_outliers_IQR": total,
        "n_outliers_Percentil": total_per,
        "n_outliers_Z_Score": total_z,
    })
    for name in ("n_outliers_IQR", "n_outliers_Percentil", "n_outliers_Z_Score"):
        results[f"{name}_%"] = ((results[name] / n_rows) * 100).round(2)
    results["total_outliers"] = [len(found) for found in indices_]
    results["%_outliers"] = [round((len(found) / n_rows) * 100, 2) for found in indices_]
    results["indices"] = indices_
    return results

In [18]:
def aux_outliers(a,b,c):
    """Auxiliary function for OUTLIERS: keep the indexes found by at least two methods.

    Parameters
    ----------
    a : list
        indexes reported by the first method
    b : list
        indexes reported by the second method
    c : list
        indexes reported by the third method

    Returns
    -------
    list
        indexes, without repetitions, that appear in two or more of the
        input lists (the order of the list is not specified)
    """
    first, second, third = set(a), set(b), set(c)

    in_first_second = first & second
    in_second_third = second & third
    in_first_third = first & third

    agreed = set([*in_first_second, *in_second_third, *in_first_third])
    return list(agreed)

### FUNCIÓN PARA HOMOLOGAR: TIPO DE RESIDENCIA

In [14]:
def residence(string):
    """Homologate a residence-type code.

    The value is stripped of surrounding whitespace and upper-cased before
    being compared, so every spelling of a code ('O', 'o', ' O', 'o ', ...)
    is treated the same way.

    Parameters
    ----------
    string : object
        Raw value of the residence-type field.

    Returns
    -------
    str
        'O', 'P' or 'C' when the cleaned value is one of those codes;
        the string 'NaN' for anything else, including non-string values.
    """
    # Missing values usually arrive as None or float NaN, which have no
    # string methods; they are not valid codes.
    if not isinstance(string, str):
        return 'NaN'
    code = string.strip().upper()
    return code if code in ('O', 'P', 'C') else 'NaN'

### FUNCIÓN PARA HOMOLOGAR: MARITAL STATUS

In [16]:
def marital(string):
    """Homologate a marital-status code.

    Accepts the codes 'S', 'C', 'O', 'V' and 'D', each either exact or
    followed by a single trailing space, and returns the bare code.
    Any other value is replaced by the string 'NaN'.
    """
    for code in ('S', 'C', 'O', 'V', 'D'):
        if string == code or string == code + ' ':
            return code
    return 'NaN'