# Cleaning data
**Handling missing values, outliers, duplicates, data integrity and standardizing data**

**Import** data from "data/raw.csv"

**Export** to "data/cleaned.csv"

In [1]:
# Setup

import numpy as np
import pandas as pd

from pkg import utils
from pkg import clean

In [2]:
run_save = True
imputing = True
encoding = True
standarding = True
one_hot_encoding = True

In [3]:
# Importing data

parent = 'data'
filename = 'raw.csv'

path = utils.get_path(parent,filename)

df = pd.read_csv(path, sep=";", index_col=0)

In [4]:
raw_df = df

raw_df.head()

Unnamed: 0_level_0,TIPO_RELACIONAMENTO,ESTADO_LOJA,CATEGORIA_PRODUTO,RENDA_ESPERADA,ESTRATEGIA_VENDAS,AVAL_SATISFACAO,VL_LIMITE_CREDITO_INICIAL_BANCO,REGIAO_LOJA,VL_FINANCEIRO_BCB,VL_OPERACAO_CARTAO_CREDITO,VL_LIMITE_CREDITO_BCB,PONTUACAO_DESEMPENHO,TEMPO_EXISTENCIA_ONLINE,AVALIACAO_RISCO,SCORE_INTERNO,RENDA
ID_LOJA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
922686,Tipo 3,RN,Alimentos e Bebidas,2199000,Nao informado,7.31,60000,NE,4265800,3546400,719400,792.0,37,300.0,528.0,242400
938287,Tipo 2,BA,Saude e Beleza,300000,Personalizada,3.57,50000,NE,168700,49500,500,392.0,22,300.0,,235100
939120,Tipo 3,CE,Alimentos e Bebidas,650000,Padrao,5.08,40000,NE,125900,90900,35000,653.0,27,300.0,,420600
933554,Tipo 2,SP,Saude e Beleza,300000,Nao informado,6.29,160000,SE,1417900,379200,18900,579.0,33,150.0,,340600
948294,Tipo 3,SP,Alimentos e Bebidas,1200000,Personalizada,9.23,60000,SE,402000,206800,195100,792.0,30,300.0,,456900


In [5]:
numerical_variables = df.select_dtypes(include=['number'])
numerical_variables = numerical_variables.columns

categorical_variables = df.select_dtypes(exclude=['number'])
categorical_variables = categorical_variables.columns

print(f'Numerical: \n{numerical_variables}\n')
print(f'Categorical: \n{categorical_variables}')

Numerical: 
Index(['RENDA_ESPERADA', 'AVAL_SATISFACAO', 'VL_LIMITE_CREDITO_INICIAL_BANCO',
       'VL_FINANCEIRO_BCB', 'VL_OPERACAO_CARTAO_CREDITO',
       'VL_LIMITE_CREDITO_BCB', 'PONTUACAO_DESEMPENHO',
       'TEMPO_EXISTENCIA_ONLINE', 'AVALIACAO_RISCO', 'SCORE_INTERNO', 'RENDA'],
      dtype='object')

Categorical: 
Index(['TIPO_RELACIONAMENTO', 'ESTADO_LOJA', 'CATEGORIA_PRODUTO',
       'ESTRATEGIA_VENDAS', 'REGIAO_LOJA'],
      dtype='object')


In [6]:
# df['ID_LOJA'] = df['ID_LOJA'].astype(str)

# Missing values

In [7]:
missing_values = df.isnull().sum()

print("Columns with missing values:")
for column_name, num_missing in missing_values.items():
    if num_missing > 0:
        print(f"{column_name}\n\tNumber of missing values: {num_missing}")

Columns with missing values:
AVAL_SATISFACAO
	Number of missing values: 10
PONTUACAO_DESEMPENHO
	Number of missing values: 29
AVALIACAO_RISCO
	Number of missing values: 314
SCORE_INTERNO
	Number of missing values: 23613


In [8]:
# Impute missing values

# Imputing under not MNAR assumption (Missing not at random)

if imputing:
    from sklearn.impute import SimpleImputer

    df = clean.random_sample_imputation(df)

# Categorical

In [9]:
# Categorical to numerical

if encoding:
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()

    df[categorical_variables] = df[categorical_variables].apply(label_encoder.fit_transform)
    df[categorical_variables]

In [10]:
if one_hot_encoding:
    df = pd.get_dummies(df, columns=categorical_variables, drop_first=True).astype('int')
    
    y = df.pop("RENDA")
    df.insert(len(df.columns), "RENDA", y)

In [11]:
# Standardizing numerical values

if standarding:

    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    df[numerical_variables] = scaler.fit_transform(df[numerical_variables])

df[numerical_variables]

Unnamed: 0_level_0,RENDA_ESPERADA,AVAL_SATISFACAO,VL_LIMITE_CREDITO_INICIAL_BANCO,VL_FINANCEIRO_BCB,VL_OPERACAO_CARTAO_CREDITO,VL_LIMITE_CREDITO_BCB,PONTUACAO_DESEMPENHO,TEMPO_EXISTENCIA_ONLINE,AVALIACAO_RISCO,SCORE_INTERNO,RENDA
ID_LOJA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
922686,0.300750,0.894620,-0.420669,0.362457,1.956469,0.360552,1.385296,0.126915,0.592832,-0.336093,-0.342069
938287,-0.028907,-1.764568,-0.556713,-0.422858,-0.384400,-0.362523,-1.186546,-1.210189,0.592832,1.410962,-0.370563
939120,0.031851,-0.434974,-0.692757,-0.431062,-0.356686,-0.327823,0.491581,-0.764488,0.592832,-0.980259,0.353482
933554,-0.028907,0.229823,0.939771,-0.183417,-0.163694,-0.344017,0.015790,-0.229646,-1.224170,0.366633,0.041225
948294,0.127328,2.224214,-0.420669,-0.378140,-0.279101,-0.166793,1.385296,-0.497067,0.592832,-0.560575,0.495168
...,...,...,...,...,...,...,...,...,...,...,...
964222,0.092609,0.229823,0.531639,-0.387149,-0.337072,-0.126762,1.655340,-1.388470,-1.224170,-0.648416,-0.252296
933678,-0.026269,-1.099771,1.483947,1.377092,2.635522,1.803485,-0.704325,-0.318786,-1.224170,-0.365374,-0.077042
912703,-0.011548,2.224214,-0.420669,-0.334821,-0.177150,-0.283769,-0.228535,-0.497067,-1.224170,1.206000,-0.214435
917068,-0.037587,0.894620,-0.284625,-0.427190,-0.356017,-0.308411,1.385296,0.572617,0.592832,-0.609376,0.187596


In [12]:
# Exporting data

cleaned = df # Add final df

if run_save:
    utils.save_df(cleaned, "cleaned.csv")

csv file saved on:  c:\Users\ptons\Code\repositories\datathon\data\featured.csv
