# Exploratory Data Analysis

## Lib Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from rapidfuzz import fuzz, process
import unidecode


## Get raw data

In [None]:
df_customers = pd.read_csv("../data/raw/olist_customers_dataset.csv")
df_order_items = pd.read_csv("../data/raw/olist_order_items_dataset.csv")
df_order_payments = pd.read_csv("../data/raw/olist_order_payments_dataset.csv")
df_orders = pd.read_csv("../data/raw/olist_orders_dataset.csv")
df_products = pd.read_csv("../data/raw/olist_products_dataset.csv")
df_sellers = pd.read_csv("../data/raw/olist_sellers_dataset.csv")
cdf_cities = pd.read_csv("../data/raw/brazilian_cities.csv")

## Pre Processing

### Null inspection

In [None]:
df_order_items.isnull().sum()
df_order_payments.isnull().sum()
df_orders.isnull().sum() # Has null value
df_products.isnull().sum() # Has null value
df_sellers.isnull().sum()
df_customers.isnull().sum()

The column 'product_category_name' is a very relevant data, therefore all null values will be fill with a standard value.

In [None]:
df_products['product_category_name'] = df_products['product_category_name'].fillna('sem categoria')

The other metrics not will be necessary to answer our KPIs, therefore they will be entirely discarted. 

In [24]:
cols_to_drop = [
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'product_weight_g',
    'product_length_cm',
    'product_height_cm',
    'product_width_cm'
]

df_products.drop(columns=cols_to_drop, inplace=True)

Also, dropped the order column the logistics data from order dataframe.

In [28]:
cols_to_drop = [
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]

df_orders.drop(columns=cols_to_drop, inplace=True)

## Duplicated Data

In [None]:
df_order_items.duplicated().sum()
df_order_payments.duplicated().sum()
df_orders.duplicated().sum()
df_products.duplicated().sum()
df_sellers.duplicated().sum()


np.int64(0)

There's no duplicated data in any of the dataframes.

## Checking for the relevant data

Checking for columns to drop in df_order_items

In [36]:
cols_to_drop = [
    'shipping_limit_date',
    'freight_value',
]

df_order_items.drop(columns=cols_to_drop, inplace=True)

In [205]:
cols_to_drop = [
    'payment_sequential',
    'payment_installments',
]

df_order_payments.drop(columns=cols_to_drop, inplace=True)

## Data Clean up and Normalization

### 1. Dataframe: df_order

In [None]:
df_orders['order_purchase_timestamp'] = pd.to_datetime(df_orders['order_purchase_timestamp'])
df_orders['order_status'] = df_orders['order_status'].astype('category')

In [196]:
df_orders['year'] = df_orders['order_purchase_timestamp'].dt.year
df_orders['month'] = df_orders['order_purchase_timestamp'].dt.month
df_orders['day'] = df_orders['order_purchase_timestamp'].dt.day

In [199]:
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   order_id                  99441 non-null  object        
 1   customer_id               99441 non-null  object        
 2   order_status              99441 non-null  category      
 3   order_purchase_timestamp  99441 non-null  datetime64[ns]
 4   year                      99441 non-null  int32         
 5   month                     99441 non-null  int32         
 6   day                       99441 non-null  int32         
dtypes: category(1), datetime64[ns](1), int32(3), object(2)
memory usage: 3.5+ MB


Filtering orders that's not delivered

In [203]:
df_orders = df_orders[df_orders['order_status'] == 'delivered']

### 2. Dataframe: df_order_payments

In [191]:
df_order_payments['order_id'] = df_order_payments['order_id'].astype(str)
df_order_payments['payment_type'] = df_order_payments['payment_type'].astype(str)

In [207]:
df_order_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       103886 non-null  object 
 1   payment_type   103886 non-null  object 
 2   payment_value  103886 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.4+ MB


### 3. Dataframe: df_products 

In [189]:
df_products['product_id'] = df_products['product_id'].astype(str)
df_products['product_category_name'] = df_products['product_category_name'].astype(str)

In [190]:
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_id             32951 non-null  object
 1   product_category_name  32951 non-null  object
dtypes: object(2)
memory usage: 515.0+ KB


### 4. Dataframe: df_sellers

In the city column, there's some strange entries (such as the "name of the city / state") or special characteres

In [None]:
df_sellers.loc[df_sellers['seller_city'].str.contains(r'[^a-zA-ZÀ-ÿ\s]', na=False), 'seller_city']

78                                    lages - sc
237                                 auriflama/sp
246                        sao paulo / sao paulo
360                                   são paulo
476                        santa barbara d´oeste
517                                     04482255
551     novo hamburgo, rio grande do sul, brasil
622                               cariacica / es
707                           sao miguel d'oeste
826                               sao paulo - sp
869                                       sbc/sp
874               arraial d'ajuda (porto seguro)
945                        santo andre/sao paulo
1004                                     sp / sp
1159                              maua/sao paulo
1337                        mogi das cruzes / sp
1346              rio de janeiro \rio de janeiro
1447                     barbacena/ minas gerais
1580                              sao paulo - sp
1610                                   andira-pr
1649             rio

Considering the wildcard nature of this entries, i will make a more detailed normalization.

In [81]:
df_sellers['seller_city'] = df_sellers['seller_city'].str.split(r'[-/]').str[0]
df_sellers['seller_city'] = df_sellers['seller_city'].str.strip()


In [None]:
city_exceptions = [
    "santa barbara d´oeste",
    "sao miguel d'oeste",
    "santa barbara d'oeste",
]
city_mask = (
    ~df_sellers['seller_city'].isin(city_exceptions) &  # não é exceção
    df_sellers['seller_city'].str.contains(r"[^a-zA-ZÀ-ÿ\s]", na=False)  # contém caractere inválido
)


df_sellers.loc[city_mask, 'seller_city']

517                                     04482255
551     novo hamburgo, rio grande do sul, brasil
874               arraial d'ajuda (porto seguro)
1346              rio de janeiro \rio de janeiro
2258                   vendas@creditparts.com.br
2988      rio de janeiro, rio de janeiro, brasil
Name: seller_city, dtype: object

I gave exception for cities that has special characteres in their names and now the wildcard was reduced to seven entries. Considering its just a few entries left, i will manually set those.

In [113]:
df_sellers.loc[360,'seller_city'] = 'sao paulo'
df_sellers.loc[517,'seller_city'] = 'recreio dos bandeirantes'
df_sellers.loc[551,'seller_city'] = 'novo hamburgo'
df_sellers.loc[874,'seller_city'] = 'porto seguro'
df_sellers.loc[1346,'seller_city'] = 'rio de janeiro'
df_sellers.loc[2258,'seller_city'] = 'maringa'
df_sellers.loc[2988,'seller_city'] = 'rio de janeiro'

df_sellers.loc[city_mask, 'seller_city']



517     recreio dos bandeirantes
551                novo hamburgo
874                 porto seguro
1346              rio de janeiro
2258                     maringa
2988              rio de janeiro
Name: seller_city, dtype: object

Normalizing data for better comparison

In [123]:
df_sellers['city_norm'] = (
    df_sellers['seller_city']
    .str.strip()
    .str.lower()
    .apply(lambda x: unidecode.unidecode(x) if isinstance(x, str) else x)
)

cdf_cities['city_norm'] = (
    cdf_cities['city']
    .str.strip()
    .str.lower()
    .apply(lambda x: unidecode.unidecode(x) if isinstance(x, str) else x)
)

Changing the exact matches of cities

In [None]:
city_dict = dict(zip(cdf_cities['city_norm'], cdf_cities['city']))

df_sellers['seller_city_clean'] = df_sellers['city_norm'].map(city_dict)


Searching for matches with fuzz for misspells (90%) <br />
Obs: Picarras is a very tricky one theres two cities with almost same name, this entry need to be checked with Postal Code

In [None]:
missing = df_sellers[df_sellers['seller_city_clean'].isna()]['seller_city'].unique()

for city in missing:
    match, score, _ = process.extractOne(city, cdf_cities['city_norm'])
    if score > 90 and city != 'picarras':
        city_dict[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]

df_sellers['seller_city_clean'] = df_sellers['city_norm'].map(city_dict)


Checking the entries that failed on fuzz

In [180]:
df_sellers[df_sellers['seller_city_clean'].isna()]


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,city_norm,seller_city_clean
2122,a5259c149128e82c9d6d46e0c1c812bb,76900,ji parana,RO,ji parana,
2183,d5c530f4884a75ae0dba9c148718d278,35660,centro,MG,centro,
2380,1703bc09972dab9782e7a9194943b69f,37165,minas gerais,MG,minas gerais,
2573,8f2ce03f928b567e3d56181ae20ae952,5141,pirituba,SP,pirituba,
2589,3a52d63a8f9daf5a28f3626d7eb9bd28,71900,aguas claras df,SP,aguas claras df,
2662,3c487ae8f8d7542beff5788e2e0aea83,72460,gama,DF,gama,


In [None]:

df_sellers.loc[df_sellers['city_norm'] == 'sp','seller_city_clean'] = 'São Paulo'
df_sellers.loc[1985,'seller_city_clean'] = 'São Paulo'
df_sellers.loc[df_sellers['city_norm'] == 'sbc','seller_city_clean'] = 'São Bernardo do Campo'
df_sellers.loc[df_sellers['city_norm'] == 'vicente de carvalho','seller_city_clean'] = 'Guarujá'
df_sellers.loc[405,'seller_city_clean'] = '	Paulo Afonso'
df_sellers.loc[2918,'seller_city_clean'] = 'Balneário Piçarras'
df_sellers.loc[517,'seller_city_clean'] = 'Rio de Janeiro'
df_sellers.loc[1050,'seller_city_clean'] = 'Palhoça'
df_sellers.loc[1755,'seller_city_clean'] = 'Teófilo Otoni'
df_sellers.loc[2006,'seller_city_clean'] = 'Embu-Guaçu'
df_sellers.loc[2122,'seller_city_clean'] = 'Ji-Paraná'
df_sellers.loc[2183,'seller_city_clean'] = 'Pará de Minas'
df_sellers.loc[2380,'seller_city_clean'] = 'Campo do Meio'
df_sellers.loc[2573,'seller_city_clean'] = 'São Paulo'
df_sellers.loc[2589,'seller_city_clean'] = 'Brasília'
df_sellers.loc[2589,'seller_state'] = 'DF' #The state was wrong
df_sellers.loc[2662,'seller_city_clean'] = 'Brasília'

df_sellers[df_sellers['seller_city_clean'].isna()]


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,city_norm,seller_city_clean


Finally, dropping the helper columns

In [None]:
df_sellers['seller_city'] = df_sellers['seller_city_clean']
df_sellers.drop(columns=['seller_city_clean', 'city_norm'], inplace=True)

In [None]:
df_sellers['seller_id'] = df_sellers['seller_id'].astype('category')

In [186]:
city_raw_list = sorted(df_sellers['seller_city'].unique())
df_sellers['seller_id'] = df_sellers['seller_id'].astype(str)
df_sellers['seller_city'] = df_sellers['seller_id'].astype(str)
df_sellers['seller_state'] = df_sellers['seller_id'].astype(str)

In [188]:
df_sellers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


### 5. Dataframe: df_order_items

In [195]:
df_order_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       112650 non-null  object 
 1   order_item_id  112650 non-null  int64  
 2   product_id     112650 non-null  object 
 3   seller_id      112650 non-null  object 
 4   price          112650 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.3+ MB


### 6. Dataframe: df_customers

In [211]:
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
 5   city_norm                 99441 non-null  object
dtypes: int64(1), object(5)
memory usage: 4.6+ MB


Applying the normalization to the customer city in order to validade this column

In [222]:
df_customers['city_norm'] = (
    df_customers['customer_city']
    .str.strip()
    .str.lower()
    .apply(lambda x: unidecode.unidecode(x) if isinstance(x, str) else x)
)

cdf_cities['city_norm'] = (
    cdf_cities['city']
    .str.strip()
    .str.lower()
    .apply(lambda x: unidecode.unidecode(x) if isinstance(x, str) else x)
)

Checking for exact matches

In [223]:
city_dict = dict(zip(cdf_cities['city_norm'], cdf_cities['city']))

df_customers['customer_city_clean'] = df_customers['city_norm'].map(city_dict)

Applying fuzz 95%

In [229]:
missing = df_customers[df_customers['customer_city_clean'].isna()]['customer_city'].unique()
corrections = {}
for city in missing:
    match, score, _ = process.extractOne(city, cdf_cities['city_norm'])
    if score > 95 and city != 'alexandrita':
        city_dict[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]
        corrections[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]

df_customers['customer_city_clean'] = df_customers['city_norm'].map(city_dict)
corrections

{'nossa senhora do remedio': 'Nossa Senhora dos Remédios',
 'sao thome das letras': 'São Tomé das Letras',
 'santana do livramento': "Sant'Ana do Livramento",
 'palmeirinha': 'Palmeirina',
 'santa rita do ibitipoca': 'Santa Rita de Ibitipoca',
 'sao luis do paraitinga': 'São Luiz do Paraitinga',
 'santa barbara d oeste': "Santa Bárbara d'Oeste",
 'graccho cardoso': 'Gracho Cardoso',
 "olhos d'agua": "Olho d'Água"}

In [None]:
df_customers_recover = pd.read_csv("../data/raw/olist_customers_dataset.csv")
df_customers.loc['customer_city'] 

In [233]:
missing = df_customers[df_customers['customer_city_clean'].isna()]['customer_city'].unique()
corrections = {}
for city in missing:
    match, score, _ = process.extractOne(city, cdf_cities['city_norm'])
    if score > 92:
        #city_dict[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]
        corrections[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]

#df_customers['customer_city_clean'] = df_customers['city_norm'].map(city_dict)
corrections

{'piumhii': 'Piumhi',
 'santa isabel do para': 'Santa Izabel do Pará',
 'portela': 'Portel',
 'caraiba': 'Caraíbas',
 'jacare': 'Jacareí',
 'araguaia': 'Araguaína',
 'raposo': 'Raposos',
 'biritiba-mirim': 'Biritiba Mirim',
 'queixada': 'Quixadá',
 'couto de magalhaes': 'Couto Magalhães',
 'holambra ii': 'Holambra',
 'monte verde': 'Nova Monte Verde',
 'palmeira d oeste': "Palmeira d'Oeste",
 'alexandrita': 'Alexandria',
 'estrela d oeste': "Estrela d'Oeste",
 'botelho': 'Botelhos',
 'pindare mirim': 'Pindaré-Mirim',
 'amanari': 'Manari',
 'alexandra': 'Alexandria',
 'amparo da serra': 'Amparo do Serra',
 'poco de pedra': 'Poção de Pedras',
 'ibitira': 'Ibitiara',
 'picarras': 'Piçarra',
 'vitorinos': 'Vitorino'}