## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import os

## Import Data

In [4]:
# Path to the main project folder. Defaults to the original local location,
# but can be overridden with the OLIST_PROJECT_PATH environment variable so
# the notebook can run on another machine without editing this cell.
path = os.environ.get('OLIST_PROJECT_PATH', r'C:\Users\steve\Documents\Olist Marketplace Analysis')

# Import the geolocation file.
# index_col = False stops pandas from using the first column as the index.
# NOTE(review): the zip code prefix is parsed as a number, so any leading
# zeros present in the raw file are dropped - confirm downstream joins use
# the same dtype.
geo_df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_geolocation_dataset.csv'), index_col = False)

## Analysis

#### 01. Content + Shape

In [7]:
# Preview the first five rows of the raw geolocation data
geo_df.head(n=5)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [8]:
# Preview the last five rows of the raw geolocation data
geo_df.tail(n=5)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS
1000162,99950,-28.070104,-52.018658,tapejara,RS


In [9]:
# (number of rows, number of columns) of the raw geolocation frame
geo_df.shape

(1000163, 5)

In [10]:
# Print column dtypes, non-null counts and memory usage
geo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


In [11]:
# Summary statistics for the numeric columns
# NOTE(review): the recorded output shows lat up to ~45.07 and lng ranging
# from ~-101.47 to ~121.11, far from the quartiles - these look like
# coordinate outliers and may need checking before the coordinates are used.
geo_df.describe()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng
count,1000163.0,1000163.0,1000163.0
mean,36574.17,-21.17615,-46.39054
std,30549.34,5.715866,4.269748
min,1001.0,-36.60537,-101.4668
25%,11075.0,-23.60355,-48.57317
50%,26530.0,-22.91938,-46.63788
75%,63504.0,-19.97962,-43.76771
max,99990.0,45.06593,121.1054


#### 02. Value Counts

Zipcode

In [14]:
# Frequency of each zip code prefix (missing values, if any, get their own row)
zip_prefix_counts = geo_df['geolocation_zip_code_prefix'].value_counts(dropna = False)
zip_prefix_counts

geolocation_zip_code_prefix
24220    1146
24230    1102
38400     965
35500     907
11680     879
         ... 
71750       1
71742       1
26475       1
26357       1
29826       1
Name: count, Length: 19015, dtype: int64

**Unique Zipcodes** = 19,015 *(compare to customer's 14,994 unique zipcodes)*

City

In [17]:
# Frequency of each city name (missing values, if any, get their own row)
city_counts = geo_df['geolocation_city'].value_counts(dropna = False)
city_counts

geolocation_city
sao paulo               135800
rio de janeiro           62151
belo horizonte           27805
são paulo                24918
curitiba                 16593
                         ...  
jacuípe                      1
mar vermelho                 1
quebrangulo                  1
poço das trincheiras         1
poxim                        1
Name: count, Length: 8011, dtype: int64

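**Unique Cities** = 8,011 *(Compare to customer's 4,119 unique cities)*. **Note:** this count is inflated by spelling variants of the same city (both `sao paulo` and `são paulo` appear in the counts above), so city names would need to be normalized before the two figures are directly comparable.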

State

In [20]:
# Frequency of each state code (missing values, if any, get their own row)
state_counts = geo_df['geolocation_state'].value_counts(dropna = False)
state_counts

geolocation_state
SP    404268
MG    126336
RJ    121169
RS     61851
PR     57859
SC     38328
BA     36045
GO     20139
ES     16748
PE     16432
DF     12986
MT     12031
CE     11674
PA     10853
MS     10431
MA      7853
PB      5538
RN      5041
PI      4549
AL      4183
TO      3576
SE      3563
RO      3478
AM      2432
AC      1301
AP       853
RR       646
Name: count, dtype: int64

**Unique States** = 27 *(same as customer's unique states)*

### CONSISTENCY CHECKS

#### 01. Mixed-Type Data

In [24]:
# Check for mixed-type columns: a column is mixed when its values are not all
# of the same Python type. Prints the column name when it is mixed, otherwise
# prints 'not mixed col'.
for col in geo_df.columns.tolist():
    # Series.map(type) gives the Python type of every value in the column.
    # Counting the distinct types avoids indexing the first row (which fails
    # on an empty frame) and avoids building a filtered copy of the data.
    distinct_types = geo_df[col].map(type).nunique()
    if distinct_types > 1:
        print(col)
    else:
        print('not mixed col')

not mixed col
not mixed col
not mixed col
not mixed col
not mixed col


**No mixed columns**

#### 02. Missing Values

In [27]:
# Count the missing entries in each column
geo_df.isna().sum()

geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64

**No missing values**

#### 03. Duplicates

In [30]:
# Create a subset containing only the full duplicates: rows that exactly
# repeat an earlier row (the first occurrence itself is not flagged)
is_full_duplicate = geo_df.duplicated()
df_dups = geo_df.loc[is_full_duplicate]
df_dups

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
15,1046,-23.546081,-46.644820,sao paulo,SP
44,1046,-23.546081,-46.644820,sao paulo,SP
65,1046,-23.546081,-46.644820,sao paulo,SP
66,1009,-23.546935,-46.636588,sao paulo,SP
67,1046,-23.546081,-46.644820,sao paulo,SP
...,...,...,...,...,...
1000153,99970,-28.343273,-51.873734,ciriaco,RS
1000154,99950,-28.070493,-52.011342,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS


**Full Duplicates** = 261,831

In [32]:
# Order the rows by zip code prefix (ascending, the default) to inspect how
# coordinates repeat within a single prefix
df_sorted_by_zipcode = geo_df.sort_values('geolocation_zip_code_prefix')
df_sorted_by_zipcode.head(n=10)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
1246,1001,-23.549292,-46.633559,sao paulo,SP
429,1001,-23.550498,-46.634338,sao paulo,SP
1182,1001,-23.549779,-46.633957,sao paulo,SP
1435,1001,-23.549292,-46.633559,sao paulo,SP
326,1001,-23.551427,-46.634074,sao paulo,SP
1004,1001,-23.549292,-46.633559,sao paulo,SP
771,1001,-23.550498,-46.634338,sao paulo,SP
235,1001,-23.550642,-46.63441,sao paulo,SP
1062,1001,-23.550498,-46.634338,sao paulo,SP
897,1001,-23.549292,-46.633559,sao paulo,SP


**Note:** Lat and Lng coordinates are recorded with extreme precision for every location within each zip code prefix, so a single prefix has many rows and some of them repeat exactly. This has led to full duplicates. Full duplicates will be removed.

In [34]:
# Drop the full duplicates, keeping the first occurrence of each repeated row.
# The resulting row count should be 1,000,163 - 261,831.
geo_df_no_dups = geo_df.drop_duplicates(keep = 'first')
geo_df_no_dups.shape

(738332, 5)

**Note:** If Lat and Lng coordinates are needed, the data will need to be grouped by zip code prefix and the median lat and lng calculated to find a central coordinate for each prefix.

### Export Data

In [37]:
# Write the de-duplicated frame to the Prepared Data folder.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# extra unnamed first column - confirm downstream readers expect it.
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'geolocation_checked.csv')
geo_df_no_dups.to_csv(export_file)