# Cleaning data
**Handling missing values, outliers, duplicates, data integrity and standardizing data**

**Import** data from "data/raw.csv"

**Export** to "data/cleaned.csv"

In [23]:
# Setup

import numpy as np
import pandas as pd

from pkg import utils
from pkg import clean

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [24]:
# Pipeline switches, listed in the order the steps run below.
instancing = True    # median-impute the numerical columns
encoding = True      # label-encode the categorical columns
standarding = False  # standard-scale the numerical columns
run_save = True      # write the final frame to disk in the export cell

In [25]:
# Load the raw dataset

parent, filename = 'data', 'raw.csv'
path = utils.get_path(parent, filename)

# The file is semicolon-separated; its first column is used as the index.
df = pd.read_csv(path, index_col=0, sep=";")

In [26]:
# Keep an independent snapshot of the data exactly as loaded. Later cells
# assign back into `df` in place, and a plain `raw_df = df` alias would
# silently pick up those changes as well.
raw_df = df.copy()

raw_df.head()

Unnamed: 0_level_0,TIPO_RELACIONAMENTO,ESTADO_LOJA,CATEGORIA_PRODUTO,RENDA_ESPERADA,ESTRATEGIA_VENDAS,AVAL_SATISFACAO,VL_LIMITE_CREDITO_INICIAL_BANCO,REGIAO_LOJA,VL_FINANCEIRO_BCB,VL_OPERACAO_CARTAO_CREDITO,VL_LIMITE_CREDITO_BCB,PONTUACAO_DESEMPENHO,TEMPO_EXISTENCIA_ONLINE,AVALIACAO_RISCO,SCORE_INTERNO,RENDA
ID_LOJA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
922686,Tipo 3,RN,Alimentos e Bebidas,2199000,Nao informado,7.31,60000,NE,4265800,3546400,719400,792.0,37,300.0,528.0,242400
938287,Tipo 2,BA,Saude e Beleza,300000,Personalizada,3.57,50000,NE,168700,49500,500,392.0,22,300.0,,235100
939120,Tipo 3,CE,Alimentos e Bebidas,650000,Padrao,5.08,40000,NE,125900,90900,35000,653.0,27,300.0,,420600
933554,Tipo 2,SP,Saude e Beleza,300000,Nao informado,6.29,160000,SE,1417900,379200,18900,579.0,33,150.0,,340600
948294,Tipo 3,SP,Alimentos e Bebidas,1200000,Personalizada,9.23,60000,SE,402000,206800,195100,792.0,30,300.0,,456900


In [27]:
# Split the column labels by dtype: numeric columns vs. everything else.
numerical_variables = df.select_dtypes(include=['number']).columns
categorical_variables = df.select_dtypes(exclude=['number']).columns

print(f'Numerical: \n{numerical_variables}\n')
print(f'Categorical: \n{categorical_variables}')

Numerical: 
Index(['RENDA_ESPERADA', 'AVAL_SATISFACAO', 'VL_LIMITE_CREDITO_INICIAL_BANCO',
       'VL_FINANCEIRO_BCB', 'VL_OPERACAO_CARTAO_CREDITO',
       'VL_LIMITE_CREDITO_BCB', 'PONTUACAO_DESEMPENHO',
       'TEMPO_EXISTENCIA_ONLINE', 'AVALIACAO_RISCO', 'SCORE_INTERNO', 'RENDA'],
      dtype='object')

Categorical: 
Index(['TIPO_RELACIONAMENTO', 'ESTADO_LOJA', 'CATEGORIA_PRODUTO',
       'ESTRATEGIA_VENDAS', 'REGIAO_LOJA'],
      dtype='object')


In [28]:
# NOTE(review): left disabled. The data is read with index_col=0 and the
# preview shows ID_LOJA as the index name, so ID_LOJA is presumably not a
# regular column and this lookup would fail - confirm before re-enabling.
# df['ID_LOJA'] = df['ID_LOJA'].astype(str)

In [29]:
# Null count per column; only columns with at least one gap are reported.
missing_values = df.isnull().sum()

print("Columns with missing values:")
for column_name, num_missing in missing_values[missing_values > 0].items():
    print(f"{column_name}\n\tNumber of missing values: {num_missing}")

Columns with missing values:
AVAL_SATISFACAO
	Number of missing values: 10
PONTUACAO_DESEMPENHO
	Number of missing values: 29
AVALIACAO_RISCO
	Number of missing values: 314
SCORE_INTERNO
	Number of missing values: 23613


# Imputation, encoding and scaling

In [30]:
if instancing:
    # Fill the gaps in every numerical column with that column's median.
    numeric_imputer = SimpleImputer(strategy='median')
    numeric_imputer.fit(df[numerical_variables])
    df[numerical_variables] = numeric_imputer.transform(df[numerical_variables])

In [31]:
# One fitted encoder per categorical column, keyed by column name, so every
# label <-> integer mapping can be inspected or inverted later.
# Stays empty when encoding is switched off.
label_encoders = {}

if encoding:
    for column in categorical_variables:
        # A fresh encoder per column: refitting one shared instance would
        # discard the classes learned for the previous column.
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column])
        label_encoders[column] = label_encoder

# Preview of the categorical columns. An expression nested inside the `if`
# is not rendered by the notebook; it has to be the cell's last top-level
# expression to be displayed.
df[categorical_variables].head()

In [32]:
if standarding:
    # Standardize each numerical column (StandardScaler defaults: remove the
    # mean and scale to unit variance).
    scaler = StandardScaler()
    df[numerical_variables] = scaler.fit_transform(df[numerical_variables])

# Preview of the numerical columns. Kept at top level because an expression
# nested inside the `if` is never rendered by the notebook.
df[numerical_variables].head()

In [33]:
# Export the cleaned dataset

cleaned = df  # final frame after the enabled cleaning steps
output_name = "cleaned.csv"

if run_save:
    utils.save_df(cleaned, output_name)

csv file saved on:  c:\Users\ptons\Code\repositories\datathon\data\cleaned.csv
