# Cleaning data
**Handling missing values, outliers, duplicates, data integrity and standardizing data**

**Import** data from "data/raw.csv"

**Export** to "cleaned_standardized.csv" (written with `utils.save_df`)

In [None]:
# Setup

import numpy as np
import pandas as pd

from pkg import utils
from pkg import clean

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Pipeline switches: each flag gates one of the later cells.
run_save, imputing, encoding, standarding = (
    True,  # run_save: write the final frame to disk in the export cell
    True,  # imputing: run the missing-value imputation cell
    True,  # encoding: convert categorical columns to integer codes
    True,  # standarding: standardize the numerical columns
)

In [None]:
# Load the raw dataset.

parent, filename = 'data', 'raw.csv'

# The project helper turns the folder / file name pair into the path to read.
path = utils.get_path(parent, filename)

# The file is semicolon-separated; its first column is used as the row index.
df = pd.read_csv(path, index_col=0, sep=";")

In [None]:
# Keep an independent snapshot of the data exactly as loaded.
# A plain `raw_df = df` only creates a second name for the same object, so the
# in-place column assignments made by the encoding and standardizing cells
# would silently rewrite `raw_df` as well. `copy()` decouples the two frames.
raw_df = df.copy()

raw_df.head()

In [None]:
# Split the column labels by dtype: numeric columns vs. everything else.
numerical_variables = df.select_dtypes(include=['number']).columns
categorical_variables = df.select_dtypes(exclude=['number']).columns

print(f'Numerical: \n{numerical_variables}\n')
print(f'Categorical: \n{categorical_variables}')

In [None]:
# df['ID_LOJA'] = df['ID_LOJA'].astype(str)

In [None]:
# Count the missing entries per column and list only the affected columns.
missing_values = df.isnull().sum()

print("Columns with missing values:")
columns_with_gaps = missing_values[missing_values > 0]
for column_name, num_missing in columns_with_gaps.items():
    print(f"{column_name}\n\tNumber of missing values: {num_missing}")

# Categorical

In [None]:
# Categorical to numerical


def _encode_keep_missing(column):
    """Label-encode the observed values of ``column``; leave missing ones NaN.

    This cell runs before the imputation cell. Fitting ``LabelEncoder`` on a
    column that still contains NaN either fails or turns "missing" into an
    ordinary class (depending on the scikit-learn version -- TODO confirm for
    the pinned version), which would hide those gaps from the imputation
    step. Encoding only the observed values avoids both outcomes.

    Args:
        column: one categorical column of the data frame (a pandas Series).

    Returns:
        A tuple ``(codes, encoder)``. ``codes`` is a NumPy array with one
        entry per row: integer codes when nothing is missing, otherwise
        floats with NaN at the missing positions. ``encoder`` is the
        ``LabelEncoder`` fitted on the observed values.
    """
    present = column.notna().to_numpy()
    encoder = LabelEncoder()
    codes = encoder.fit_transform(column[present])
    if present.all():
        # Nothing missing: keep the plain integer codes.
        return codes, encoder
    # Positional fill, so a non-unique row index cannot cause misalignment.
    encoded = np.full(len(column), np.nan)
    encoded[present] = codes
    return encoded, encoder


# One fitted encoder per column, so codes can be mapped back to labels later
# (a single shared encoder would only remember the last column it was fit on).
label_encoders = {}

if encoding:
    for categorical_name in categorical_variables:
        codes, fitted_encoder = _encode_keep_missing(df[categorical_name])
        df[categorical_name] = codes
        label_encoders[categorical_name] = fitted_encoder

In [None]:
# Impute missing values

# The random-sample imputation is applied on the assumption that the data are
# not MNAR (MNAR = missing not at random).
# NOTE(review): the helper's sampling/seed behavior is not visible here --
# confirm whether results are reproducible between runs.
df = clean.random_sample_imputation(df) if imputing else df

In [None]:
# Standardizing numerical values

if standarding:
    # Only the columns that were numeric when the data was loaded are scaled.
    numeric_block = df[numerical_variables]
    # Fit first, then transform: same result as a single fit_transform call,
    # and the fitted scaler stays available under the name `scaler`.
    scaler = StandardScaler().fit(numeric_block)
    df[numerical_variables] = scaler.transform(numeric_block)

In [None]:
# Exporting data

cleaned = df  # frame to write out; point this at the final DataFrame

# The write is skipped entirely when the run_save switch is off.
output_name = "cleaned_standardized.csv"
if run_save:
    utils.save_df(cleaned, output_name)