# Cleaning data
**Handling missing values, outliers, duplicates, and data-integrity issues, and standardizing the data**

**Import** data from "data/raw.csv"

**Export** to "data/cleaned.csv"

In [23]:
# Setup

import numpy as np
import pandas as pd

from pkg import utils
from pkg import clean

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [24]:
# Notebook switches: each flag enables one later stage of this notebook.

# Gates the final `utils.save_df` call in the export cell.
run_save = True

# `instancing` gates the median imputation of the numeric columns;
# `encoding` gates the label encoding of the categorical columns.
instancing = encoding = True

In [25]:
# Importing data

# Build the file location through the project helper `utils.get_path`
# (how it resolves `parent`/`filename` is defined in pkg.utils).
parent, filename = 'data', 'raw.csv'
path = utils.get_path(parent, filename)

# The raw file is read as semicolon-separated, with its first column used
# as the DataFrame index.
df = pd.read_csv(path, index_col=0, sep=";")

In [26]:
# Split the column labels by dtype: numeric columns on one side, every
# other dtype on the other. Both names hold column Index objects, not data.
numerical_variables = df.select_dtypes(include=['number']).columns
categorical_variables = df.select_dtypes(exclude=['number']).columns

# Show both groups so the split can be checked by eye.
print(f'Numerical: \n{numerical_variables}\n')
print(f'Categorical: \n{categorical_variables}')

Numerical: 
Index(['RENDA_ESPERADA', 'AVAL_SATISFACAO', 'VL_LIMITE_CREDITO_INICIAL_BANCO',
       'VL_FINANCEIRO_BCB', 'VL_OPERACAO_CARTAO_CREDITO',
       'VL_LIMITE_CREDITO_BCB', 'PONTUACAO_DESEMPENHO',
       'TEMPO_EXISTENCIA_ONLINE', 'AVALIACAO_RISCO', 'SCORE_INTERNO', 'RENDA'],
      dtype='object')

Categorical: 
Index(['TIPO_RELACIONAMENTO', 'ESTADO_LOJA', 'CATEGORIA_PRODUTO',
       'ESTRATEGIA_VENDAS', 'REGIAO_LOJA'],
      dtype='object')


In [27]:
# df['ID_LOJA'] = df['ID_LOJA'].astype(str)

In [28]:
# Null count for every column of `df`.
missing_values = df.isnull().sum()

print("Columns with missing values:")
# Only columns with at least one null are listed, in column order.
for name, count in missing_values[missing_values > 0].items():
    print(f"{name}\n\tNumber of missing values: {count}")

Columns with missing values:
AVAL_SATISFACAO
	Number of missing values: 10
PONTUACAO_DESEMPENHO
	Number of missing values: 29
AVALIACAO_RISCO
	Number of missing values: 314
SCORE_INTERNO
	Number of missing values: 23613


In [29]:
# Replace missing entries in the numeric columns with that column's median.
# The whole step is skipped when `instancing` is off.
if instancing:
    numeric_imputer = SimpleImputer(strategy='median')
    numeric_block = df[numerical_variables]
    # The imputer is fitted on, and applied to, the same numeric columns.
    df[numerical_variables] = numeric_imputer.fit_transform(numeric_block)

In [30]:
# Integer-encode every categorical column. The whole step is skipped when
# `encoding` is off.
if encoding:
    # Keep one fitted encoder per column so each column's label <-> code
    # mapping stays available afterwards (label_encoders[col].classes_,
    # label_encoders[col].inverse_transform). Refitting a single shared
    # encoder would retain only the mapping of the last column processed.
    label_encoders = {}

    for column in categorical_variables:
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column])
        label_encoders[column] = label_encoder
    # After the loop, `label_encoder` holds the encoder fitted on the last
    # categorical column.

In [31]:
# Exporting data

# `cleaned` is bound to the same object as `df` (no copy is made); point it
# at whichever frame should be exported.
cleaned = df # Add final df

# Saving is optional: it only happens when `run_save` is set in the config
# cell. The output location is decided inside `utils.save_df`.
if run_save:
    utils.save_df(cleaned, "cleaned.csv")

csv file saved on:  c:\Users\ptons\Code\repositories\datathon\data\cleaned.csv
