<img src="https://raw.githubusercontent.com/rbizoi/PythonFormation/main/images/e-brasil.png" width="850">

# Les imports et initialisations des variables

In [1]:
from datetime import datetime
import pandas as pd, numpy as np, os, warnings, seaborn as sns, pickle, re, unicodedata
from datetime import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

font1 = fm.FontProperties(size=20)
font2 = fm.FontProperties(size=24)

%matplotlib inline

# The 'seaborn-*' style sheets are shipped by matplotlib, which renamed them
# to 'seaborn-v0_8-*' in matplotlib 3.6. Pick whichever name this matplotlib
# install actually provides instead of guessing from the seaborn version.
if 'seaborn-v0_8-darkgrid' in plt.style.available:
    plt.style.use('seaborn-v0_8-darkgrid')
else:
    plt.style.use('seaborn-darkgrid')
    
sns.set(font_scale=2)
warnings.filterwarnings(action="ignore")

## Liste des fichiers de données

In [2]:
!dir ..\donnees\ebrasil

 Le volume dans le lecteur F s'appelle Data
 Le numéro de série du volume est D4F7-8E12

 Répertoire de F:\PythonFormation\donnees\ebrasil

14/02/2024  09:47    <DIR>          .
14/02/2024  09:47    <DIR>          ..
06/10/2019  20:27         9ÿ033ÿ957 olist_customers_dataset.csv
06/10/2019  20:27        61ÿ273ÿ883 olist_geolocation_dataset.csv
06/10/2019  20:27        17ÿ654ÿ914 olist_orders_dataset.csv
06/10/2019  20:27        15ÿ438ÿ671 olist_order_items_dataset.csv
06/10/2019  20:27         5ÿ777ÿ138 olist_order_payments_dataset.csv
06/10/2019  20:27        14ÿ409ÿ007 olist_order_reviews_dataset.csv
06/10/2019  20:27         2ÿ379ÿ446 olist_products_dataset.csv
28/09/2020  07:52           177ÿ799 olist_sellers_dataset.csv
17/01/2024  08:49             2ÿ542 product_category_name_translation.csv
               9 fichier(s)      126ÿ147ÿ357 octets
               2 R‚p(s)  86ÿ159ÿ405ÿ056 octets libres


## Changement de répertoire

In [3]:
# Move the working directory to the data folder so that the relative paths
# used by the following cells ('ebrasil/...', './ecommerce/...') resolve from it.
os.chdir(r"../donnees")

## Fonctions utilitaires 

In [4]:
def nettoyer(valeur):
    """Normalize a text label to a lowercase, accent-free ASCII key.

    The value is lowercased and decomposed with NFKD so that accents become
    separate combining marks, then encoded to ASCII with every non-ASCII
    character dropped. Any character that is neither a word character nor a
    space is removed, runs of spaces are collapsed to a single space, and
    leading/trailing spaces are stripped.

    Parameters
    ----------
    valeur : str
        Text to normalize.

    Returns
    -------
    str
        The cleaned text.
    """
    nk = unicodedata.normalize('NFKD', valeur.lower())
    texte_ascii = nk.encode('ASCII', 'ignore').decode('ASCII')
    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    sans_ponctuation = re.sub(r'[^\w ]', '', texte_ascii)
    # Collapse every run of spaces, not only exact pairs: a single
    # str.replace('  ', ' ') pass would leave 'a   b' as 'a  b'.
    return re.sub(r' {2,}', ' ', sans_ponctuation).strip()

# DataFrame $geolocation$

In [5]:
# Read the raw geolocation extract and shorten its column labels by removing
# the 'geolocation_' prefix and the '_prefix' suffix.
donnees = pd.read_csv(os.path.join('ebrasil', 'olist_geolocation_dataset.csv')).rename(
    columns=lambda nom: nom.replace('geolocation_', '').replace('_prefix', '')
)

# Two-letter codes found in the `state` column -> full name written to the output.
dictEtats = {
    'AC': 'Acre',
    'AL': 'Alagoas',
    'AP': 'Amapá',
    'AM': 'Amazonas',
    'BA': 'Bahia',
    'CE': 'Ceará',
    'ES': 'Espírito Santo',
    'GO': 'Goiás',
    'MA': 'Maranhão',
    'MT': 'Mato Grosso',
    'MS': 'Mato Grosso do Sul',
    'MG': 'Minas Gerais',
    'PA': 'Pará',
    'PB': 'Paraïba',
    'PR': 'Paraná',
    'PE': 'Pernambouc',
    'PI': 'Piauí',
    'RJ': 'Rio de Janeiro',
    'RN': 'Rio Grande do Norte',
    'RS': 'Rio Grande do Sul',
    'RO': 'Rondônia',
    'RR': 'Roraima',
    'SC': 'Santa Catarina',
    'SP': 'São Paulo',
    'SE': 'Sergipe',
    'TO': 'Tocantins',
    'DF': 'District fédéral',
}

# Direct indexing (rather than .get) so an unknown code raises KeyError
# instead of silently producing a missing value; zip codes become int32.
donnees = donnees.assign(
    state=donnees['state'].apply(lambda code: dictEtats[code]),
    zip_code=donnees['zip_code'].astype('int32'),
)

In [6]:
# Check the column labels once the prefixes/suffixes have been removed.
donnees.columns

Index(['zip_code', 'lat', 'lng', 'city', 'state'], dtype='object')

In [7]:
# Preview the first rows of the loaded frame.
donnees.head()

Unnamed: 0,zip_code,lat,lng,city,state
0,1037,-23.545621,-46.639292,sao paulo,São Paulo
1,1046,-23.546081,-46.64482,sao paulo,São Paulo
2,1046,-23.546129,-46.642951,sao paulo,São Paulo
3,1041,-23.544392,-46.639499,sao paulo,São Paulo
4,1035,-23.541578,-46.641607,sao paulo,São Paulo


In [8]:
# Confirm the dtypes, in particular that zip_code was cast to int32.
donnees.dtypes

zip_code      int32
lat         float64
lng         float64
city         object
state        object
dtype: object

In [9]:
# Number of distinct zip codes present in the customers file.
pd.read_parquet('./ecommerce/customers.parquet')['zip_code'].nunique()

14994

In [10]:
# Number of distinct zip codes present in the sellers file.
pd.read_parquet('./ecommerce/sellers.parquet')['zip_code'].nunique()

2246

In [11]:
# (rows, columns) of the geolocation frame before any de-duplication.
donnees.shape

(1000163, 5)

In [13]:
# Count the distinct zip codes in the geolocation frame.
donnees['zip_code'].nunique()

19015

In [12]:
# Count the distinct (zip_code, city, state) combinations.
donnees.loc[:, ['zip_code', 'city', 'state']].drop_duplicates().shape

(27912, 3)

In [14]:
# Normalize every city label with nettoyer() so spelling variants of the
# same city collapse to one value.
donnees['city'] = donnees['city'].apply(nettoyer)

In [15]:
# Re-count the distinct (zip_code, city, state) combinations after the city
# labels were normalized.
donnees.loc[:, ['zip_code', 'city', 'state']].drop_duplicates().shape

(19581, 3)

In [11]:
# NOTE(review): this cell repeats an earlier check and carries an
# out-of-sequence execution counter; re-run the notebook top to bottom to
# refresh its saved output.
donnees.zip_code.nunique()

19015

In [34]:
# NOTE(review): out-of-sequence execution counter; the saved output of this
# cell equals the count obtained before the city labels were cleaned, so it
# looks stale — re-run the notebook top to bottom to confirm.
donnees[['zip_code','city','state']].drop_duplicates().shape

(27912, 3)

In [16]:
# One row per zip code holding the min / max / mean of its latitudes and
# longitudes. Named aggregation yields the flat '<column>_<stat>' labels
# directly.
geo01 = (
    donnees.groupby(['zip_code'])
           .agg(lat_min=('lat', 'min'),
                lat_max=('lat', 'max'),
                lat_mean=('lat', 'mean'),
                lng_min=('lng', 'min'),
                lng_max=('lng', 'max'),
                lng_mean=('lng', 'mean'))
           .reset_index()
)

In [17]:
# Preview the per-zip-code coordinate statistics.
geo01.head()

Unnamed: 0,zip_code,lat_min,lat_max,lat_mean,lng_min,lng_max,lng_mean
0,1001,-23.551427,-23.549292,-23.55019,-46.63441,-46.633559,-46.634024
1,1002,-23.548878,-23.544641,-23.548146,-46.636361,-46.63318,-46.634979
2,1003,-23.549083,-23.548901,-23.548994,-46.637157,-46.634862,-46.635731
3,1004,-23.550765,-23.549181,-23.549799,-46.635371,-46.634057,-46.634757
4,1005,-23.54998,-23.548758,-23.549456,-46.638411,-46.634768,-46.636733


In [18]:
# Count the distinct zip codes kept by the aggregation.
geo01['zip_code'].nunique()

19015

In [20]:
# Preview of the ranking used to pick one row per zip code: rows are ordered
# by city, lat, lng, then numbered from 1 inside each zip_code group (`rn`).
# The numbering is aligned back onto `donnees` by index before the final sort.
donnees.assign( rn = donnees.sort_values(['city','lat','lng']
                                                ).groupby(['zip_code']).cumcount()+1
                      ).sort_values(['zip_code','rn'])

Unnamed: 0,zip_code,lat,lng,city,state,rn
326,1001,-23.551427,-46.634074,sao paulo,São Paulo,1
519,1001,-23.551337,-46.634027,sao paulo,São Paulo,2
583,1001,-23.551337,-46.634027,sao paulo,São Paulo,3
818,1001,-23.551337,-46.634027,sao paulo,São Paulo,4
235,1001,-23.550642,-46.634410,sao paulo,São Paulo,5
...,...,...,...,...,...,...
999972,99980,-28.386612,-51.846889,david canabarro,Rio Grande do Sul,24
999897,99980,-28.386408,-51.844876,david canabarro,Rio Grande do Sul,25
999764,99980,-28.386239,-51.847741,david canabarro,Rio Grande do Sul,26
999758,99990,-28.329718,-51.769615,muliterno,Rio Grande do Sul,1


In [21]:
# Keep a single (city, state) pair per zip code: the first row of each
# zip code once the rows are ordered by city, then lat, then lng. The result
# is ordered by zip_code and keeps only the three descriptive columns.
geo02 = (
    donnees.sort_values(['city', 'lat', 'lng'])
           .drop_duplicates(subset='zip_code', keep='first')
           .sort_values('zip_code')
           .loc[:, ['zip_code', 'city', 'state']]
)

In [22]:
# Preview the (zip_code, city, state) row retained for each zip code.
geo02.head()

Unnamed: 0,zip_code,city,state
326,1001,sao paulo,São Paulo
720,1002,sao paulo,São Paulo
478,1003,sao paulo,São Paulo
443,1004,sao paulo,São Paulo
584,1005,sao paulo,São Paulo


In [23]:
# Count the distinct zip codes left after keeping one row per zip code.
geo02['zip_code'].nunique()

19015

In [24]:
# Attach the coordinate statistics (geo01) to the retained city/state row
# (geo02) of each zip code; `donnees` is rebound to the merged frame.
donnees = pd.merge(geo02, geo01, how='inner', on='zip_code')

In [25]:
# Expose the mean coordinates under the plain 'lat' / 'lng' labels; the
# min / max columns keep their suffixed names.
donnees = donnees.rename(columns={'lat_mean': 'lat', 'lng_mean': 'lng'})

In [26]:
# Preview the final frame: one row per zip code with its coordinates.
donnees.head()

Unnamed: 0,zip_code,city,state,lat_min,lat_max,lat,lng_min,lng_max,lng
0,1001,sao paulo,São Paulo,-23.551427,-23.549292,-23.55019,-46.63441,-46.633559,-46.634024
1,1002,sao paulo,São Paulo,-23.548878,-23.544641,-23.548146,-46.636361,-46.63318,-46.634979
2,1003,sao paulo,São Paulo,-23.549083,-23.548901,-23.548994,-46.637157,-46.634862,-46.635731
3,1004,sao paulo,São Paulo,-23.550765,-23.549181,-23.549799,-46.635371,-46.634057,-46.634757
4,1005,sao paulo,São Paulo,-23.54998,-23.548758,-23.549456,-46.638411,-46.634768,-46.636733


## Sauvegarde en parquet

In [22]:
# Persist the de-duplicated geolocation table as a gzip-compressed parquet
# file written with the pyarrow engine.
donnees.to_parquet(
    './ecommerce/geolocation.parquet',
    engine='pyarrow',
    compression='gzip',
)

In [23]:
# Read the file back and print its schema to verify the write round-trips.
pd.read_parquet('./ecommerce/geolocation.parquet').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19015 entries, 0 to 19014
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   zip_code  19015 non-null  int32  
 1   city      19015 non-null  object 
 2   state     19015 non-null  object 
 3   lat_min   19015 non-null  float64
 4   lat_max   19015 non-null  float64
 5   lat       19015 non-null  float64
 6   lng_min   19015 non-null  float64
 7   lng_max   19015 non-null  float64
 8   lng       19015 non-null  float64
dtypes: float64(6), int32(1), object(2)
memory usage: 1.2+ MB


In [24]:
!dir ecommerce

 Le volume dans le lecteur F s'appelle Data
 Le numéro de série du volume est D4F7-8E12

 Répertoire de F:\PythonFormation\donnees\ecommerce

14/02/2024  15:37    <DIR>          .
14/02/2024  15:37    <DIR>          ..
14/02/2024  15:45         4ÿ525ÿ883 customers.parquet
14/02/2024  15:48         1ÿ122ÿ602 geolocation.parquet
14/02/2024  15:46         4ÿ425ÿ246 items.parquet
14/02/2024  15:46         4ÿ527ÿ569 items_products.parquet
14/02/2024  15:45        27ÿ443ÿ040 orders.csv
14/02/2024  15:45         9ÿ269ÿ678 orders.parquet
14/02/2024  15:46         9ÿ725ÿ278 orders_payments.parquet
14/02/2024  15:47        11ÿ010ÿ093 orders_payments_reviews_p.parquet
14/02/2024  15:46         2ÿ375ÿ117 payments.parquet
14/02/2024  15:46           963ÿ000 products.parquet
14/02/2024  15:47         7ÿ858ÿ269 reviews.parquet
14/02/2024  15:47         3ÿ201ÿ554 reviews_p.parquet
14/02/2024  15:06            91ÿ808 sellers.parquet
              13 fichier(s)       86ÿ539ÿ137 octets
               2 R