In [2]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import pandas as pd


@dataclass
class Config:
    """Directory layout of the project, anchored at ``project_dir``.

    Every directory other than ``project_dir`` may be passed explicitly.
    Those left as ``None`` are derived in ``__post_init__`` from their parent
    directory, so overriding ``project_dir`` (or ``data_dir``) also moves the
    directories below it. ``Config()`` with no arguments yields the same paths
    as before: everything lives under ``../../``.
    """

    project_dir: Path = Path("../../")
    model_dir: Optional[Path] = None  # default: project_dir / "models"
    outputs_dir: Optional[Path] = None  # default: project_dir / "outputs"
    data_dir: Optional[Path] = None  # default: project_dir / "data"
    raw_dir: Optional[Path] = None  # default: data_dir / "raw"
    interim_dir: Optional[Path] = None  # default: data_dir / "interim"
    processed_dir: Optional[Path] = None  # default: data_dir / "processed"

    def __post_init__(self) -> None:
        """Fill in every directory that was not given explicitly."""
        if self.model_dir is None:
            self.model_dir = self.project_dir / "models"
        if self.outputs_dir is None:
            self.outputs_dir = self.project_dir / "outputs"
        # data_dir must be resolved before the three directories nested in it.
        if self.data_dir is None:
            self.data_dir = self.project_dir / "data"
        if self.raw_dir is None:
            self.raw_dir = self.data_dir / "raw"
        if self.interim_dir is None:
            self.interim_dir = self.data_dir / "interim"
        if self.processed_dir is None:
            self.processed_dir = self.data_dir / "processed"


config = Config()

# Put the resolved project directory on sys.path. The membership check keeps
# repeated runs of this cell from appending the same entry again.
project_root = str(config.project_dir.resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

CPU times: user 280 ms, sys: 90.1 ms, total: 370 ms
Wall time: 639 ms


In [None]:
# Read the customers and geolocation CSV files from the raw data directory.
df_customers = pd.read_csv(config.raw_dir.joinpath("olist_customers_dataset.csv"))
df_geolocation = pd.read_csv(config.raw_dir.joinpath("olist_geolocation_dataset.csv"))


## df_customers

In [85]:
# Overview of the customers table: (rows, columns), null count per column, first rows.
print(df_customers.shape)
display(df_customers.isna().sum(), df_customers.head())

(99441, 5)


customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [12]:
# Share of distinct values for each id column relative to the row count;
# a ratio below 1 means that id repeats across rows.
print(f"df_customers n_rows: {len(df_customers)}")
print(df_customers["customer_id"].nunique() / len(df_customers))
print(df_customers["customer_unique_id"].nunique() / len(df_customers))

df_cusotmers n_rows: 99441
1.0
0.9663619633752677


In [15]:
# Count the distinct customer_id values behind each customer_unique_id,
# order from most to fewest, and keep only those with more than one.
(
    df_customers.groupby("customer_unique_id")["customer_id"]
    .nunique()
    .reset_index()
    .sort_values("customer_id", ascending=False)
    .loc[lambda counts: counts["customer_id"] > 1]
)

Unnamed: 0,customer_unique_id,customer_id
52973,8d50f5eadf50201ccdcedfb9e2ac8455,17
23472,3e43e6105506432c953e165fb2acf44c,9
37797,6469f99c1f9dfae7733b25662e7f1782,7
76082,ca77025e7201e3b30c44b472ff346268,7
10354,1b6c7548a2a1f9037c1fd3ddfed95f33,7
...,...,...
35047,5cfc9643603c095fd7ed86a50fa17887,2
78003,cfb025f193db116549c9c23e4d58e3e3,2
36416,609b7feab0f0c9726929ec4891447b02,2
94630,fc19fe9ac733e36ff8227c29435c2030,2


In [76]:
# First rows for one customer_unique_id that is linked to several customer_id values.
df_customers.loc[
    df_customers["customer_unique_id"] == "8d50f5eadf50201ccdcedfb9e2ac8455"
].head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
14186,1bd3585471932167ab72a84955ebefea,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
15321,a8fabc805e9a10a3c93ae5bff642b86b,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
16654,897b7f72042714efaa64ac306ba0cafc,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
36122,b2b13de0770e06de50080fea77c459e6,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP
38073,42dbc1ad9d560637c9c4c1533746f86d,8d50f5eadf50201ccdcedfb9e2ac8455,4045,sao paulo,SP


## df_geolocation

In [86]:
# Overview of the geolocation table: (rows, columns), null count per column, first rows.
print(df_geolocation.shape)
display(df_geolocation.isna().sum(), df_geolocation.head())

(1000163, 5)


geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [71]:
# For every (zip code prefix, latitude, longitude) combination, count how many
# distinct states are recorded and show the combinations with the highest counts.
# geolocation_city is not part of the grouping key.
(
    df_geolocation.groupby(
        ["geolocation_zip_code_prefix", "geolocation_lat", "geolocation_lng"]
    )["geolocation_state"]
    .nunique()
    .reset_index()
    .sort_values("geolocation_state", ascending=False)
    .head()
)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_state
306051,23056,-22.919164,-43.611097,2
40729,4011,-23.578707,-46.645779,2
0,1001,-23.551427,-46.634074,1
480106,45208,-13.865563,-40.069416,1
480099,45208,-13.867529,-40.076176,1


In [20]:
# Row count, then the share of distinct zip code prefixes relative to that row count.
print(f"df_geolocation n_rows: {len(df_geolocation)}")
print(df_geolocation["geolocation_zip_code_prefix"].nunique() / len(df_geolocation))

df_geolocation n_rows: 1000163
0.0190119010601272


In [56]:
# First rows recorded for zip code prefix 1037.
df_geolocation[df_geolocation["geolocation_zip_code_prefix"] == 1037].head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
14,1037,-23.545187,-46.637855,são paulo,SP
31,1037,-23.546705,-46.640336,são paulo,SP
169,1037,-23.543883,-46.638075,são paulo,SP
178,1037,-23.546157,-46.639885,sao paulo,SP


In [72]:
# Keep only zip prefix, city and state, then collapse repeated combinations.
df_geolocation_selected = df_geolocation.loc[
    :, ["geolocation_zip_code_prefix", "geolocation_city", "geolocation_state"]
].drop_duplicates()

# Row count and share of distinct zip prefixes after de-duplication.
print(f"df_geolocation_selected n_rows: {len(df_geolocation_selected)}")
print(
    df_geolocation_selected["geolocation_zip_code_prefix"].nunique()
    / len(df_geolocation_selected)
)

df_geolocation_selected n_rows: 27912
0.6812482086557753


In [58]:
# Count the distinct city spellings per zip code prefix, largest first,
# and show the top prefixes that have more than one.
(
    df_geolocation_selected.groupby("geolocation_zip_code_prefix")["geolocation_city"]
    .nunique()
    .reset_index()
    .sort_values("geolocation_city", ascending=False)
    .loc[lambda counts: counts["geolocation_city"] > 1]
    .head()
)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_city
5067,13457,5
3443,6900,5
5966,17970,5
7726,28950,5
5065,13455,5


In [35]:
# All de-duplicated (city, state) rows recorded for zip code prefix 28950.
df_geolocation_selected[df_geolocation_selected["geolocation_zip_code_prefix"] == 28950]

Unnamed: 0,geolocation_zip_code_prefix,geolocation_city,geolocation_state
521345,28950,armacao dos buzios,RJ
521373,28950,armação dos búzios,RJ
521381,28950,armacao de buzios,RJ
521505,28950,búzios,RJ
521684,28950,buzios,RJ


In [61]:
# Distinct (zip code prefix, state) combinations.
df_geolocation_selected_two_cols = df_geolocation.loc[
    :, ["geolocation_zip_code_prefix", "geolocation_state"]
].drop_duplicates()

# Zip code prefixes recorded with more than one state, ordered by state count (descending).
duplicate_zip_code_prefix_list = (
    df_geolocation_selected_two_cols.groupby("geolocation_zip_code_prefix")[
        "geolocation_state"
    ]
    .nunique()
    .reset_index()
    .sort_values("geolocation_state", ascending=False)
    .loc[
        lambda counts: counts["geolocation_state"] > 1,
        "geolocation_zip_code_prefix",
    ]
    .tolist()
)

# Show every (prefix, state) row that belongs to one of those conflicting prefixes.
df_geolocation_selected_two_cols[
    df_geolocation_selected_two_cols["geolocation_zip_code_prefix"].isin(
        duplicate_zip_code_prefix_list
    )
]

Unnamed: 0,geolocation_zip_code_prefix,geolocation_state
21728,2116,SP
22261,2116,RN
71032,4011,SP
72852,4011,AC
430342,21550,RJ
430999,21550,AC
459234,23056,RJ
460406,23056,AC
792362,72915,GO
792394,72915,DF


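Each conflicting zip code prefix was checked manually on [this website](https://en.youbianku.com/Brazil). In the list below, `→ True` marks the state confirmed as correct for that prefix; the other state is dropped.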

- 2116	SP → True
- 2116	RN
- 4011	SP → True
- 4011	AC
- 21550	RJ → True
- 21550	AC
- 23056	RJ → True
- 23056	AC
- 72915	GO
- 72915	DF → True
- 78557	MT → True
- 78557	RO
- 79750	MS → True
- 79750	RS
- 80630	PR → True
- 80630	SC

In [78]:
# (zip code prefix, state) pairs to drop: for each prefix this is the state that
# is NOT marked "→ True" in the manual check, so the confirmed state is kept.
# 79750 drops RS (MS is the confirmed state); the previous filter removed MS instead.
# NOTE(review): the manual check keeps DF for 72915 — worth re-confirming, the
# prefix may actually belong to GO.
WRONG_ZIP_STATE_PAIRS = [
    (2116, "RN"),
    (4011, "AC"),
    (21550, "AC"),
    (23056, "AC"),
    (72915, "GO"),
    (78557, "RO"),
    (79750, "RS"),
    (80630, "SC"),
]

# Flag every row whose (prefix, state) matches one of the wrong pairs.
is_wrong_pair = pd.Series(False, index=df_geolocation_selected_two_cols.index)
for zip_code_prefix, wrong_state in WRONG_ZIP_STATE_PAIRS:
    is_wrong_pair |= (
        df_geolocation_selected_two_cols["geolocation_zip_code_prefix"]
        == zip_code_prefix
    ) & (df_geolocation_selected_two_cols["geolocation_state"] == wrong_state)

df_geolocation_selected_two_cols_unique = df_geolocation_selected_two_cols[
    ~is_wrong_pair
]

# The result is meant to be a one-state-per-prefix lookup; fail loudly if a
# prefix still has more than one state instead of saving an ambiguous table.
assert df_geolocation_selected_two_cols_unique[
    "geolocation_zip_code_prefix"
].is_unique, "some zip code prefixes still map to more than one state"

In [79]:
# Write the zip-prefix/state table to the interim data directory.
# The directory is created first so the write does not fail when it is missing.
config.interim_dir.mkdir(parents=True, exist_ok=True)
df_geolocation_selected_two_cols_unique.to_csv(
    config.interim_dir / "olist_geolocation_zip_code_prefix_state_unique.csv",
    index=False,
)
