In [118]:
import numpy as np

import pandas as pd
import geopandas as gp
import os

Importing geopandas

In [39]:
main_path = "../../_data/LIMITE_DISTRITAL_2020_INEI"
files = os.listdir( main_path )

for fl in files:
    
    extension = fl.split(".")[-1]
    
    # Rename files
    os.rename( f"{ main_path }/{fl}", f"{ main_path }/INEI_LIMITE_DISTRITAL.{extension}" )

district = gp.read_file( f"{main_path}/INEI_LIMITE_DISTRITAL.shp" )

In [46]:
district.geometry.crs

# Projected https://epsg.io/24891 area for Peru to get centroids
projected_district = district.to_crs( epsg = 24891 )

projected_district.geometry.crs

<Projected CRS: EPSG:24891>
Name: PSAD56 / Peru west zone
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: Peru - west of 79°W.
- bounds: (-81.41, -8.32, -79.0, -3.38)
Coordinate Operation:
- name: Peru west zone
- method: Transverse Mercator
Datum: Provisional South American Datum 1956
- Ellipsoid: International 1924
- Prime Meridian: Greenwich

In [50]:
projected_district[ 'centroids' ] = projected_district.centroid

projected_district[ 'Centroid_Latitude' ] = projected_district[ 'centroids' ].to_crs( epsg = 4326 ).y
projected_district[ 'Centroid_Longitude' ] = projected_district[ 'centroids' ].to_crs( epsg = 4326 ).x

ctr_data = projected_district.loc[ : , [ 'CCDD', 'NOMBDEP',  'CCPP',  'NOMBPROV',  'CCDI',
                                        'NOMBDIST', 'CAPITAL', 'UBIGEO', 'IDPROV', 'CODIGO',
                                        "Centroid_Latitude", "Centroid_Longitude" ] ]

In [52]:
ctr_data.to_excel( r'../../_data/peru_districts_centroids.xlsx', index = False )

Importing data

In [76]:
# 
df1  = pd.read_csv( r"C:\Users\Anzony\Documents\GitHub\Diplomado_PUCP\_data\salud_places_peru.csv")

Selecting department

In [101]:
df1.loc[ ( df1.latitud != 0 ), 'ind' ] = 1
df1.loc[ ( df1.latitud == 0 ), 'ind' ] = np.nan

In [105]:
df1.groupby("diresa")[[ 'ind']].apply(lambda x: x.notnull().mean()) \
        .sort_values( 'ind', ascending = False )

Unnamed: 0_level_0,ind
diresa,Unnamed: 1_level_1
PASCO,0.899135
AMAZONAS,0.876289
AYACUCHO,0.875421
HUANCAVELICA,0.875248
UCAYALI,0.839009
APURIMAC,0.820285
CAJAMARCA,0.819368
PUNO,0.785894
HUANUCO,0.76699
TUMBES,0.757353


We decide Cajamarca is the best option

In [113]:
df2 = df1[ df1.diresa == 'CAJAMARCA' ].copy()

From Dataframe to Geopandas

In [114]:
# Keep estb with lat and long
df3 = df2[ df2.latitud != 0 ].copy()

# Make df3 a geopandas
df3_geo = gp.GeoDataFrame( df3 , 
                           geometry= gp.points_from_xy( 
                                      df3.longitud, 
                                      df3.latitud))

# set crs
# https://gis.stackexchange.com/questions/327033/finding-the-right-crs-for-google-map-coordinates
df3_geo = df3_geo.set_crs( 'epsg:4326' )

Intersect Cajamarca geolocation with Geopandas to get UBIGEO

In [117]:
# Import shapefile
dist_shp = gp.read_file( f"{main_path}/INEI_LIMITE_DISTRITAL.shp" )

# Intersect with geo_data
intersected_df = gp.overlay( df3_geo , dist_shp , how = 'intersection')

# Drop unimportant columns
caj_loc = intersected_df.drop( ['DESCARGAR', 'WHATSAPP', 'CONTACTO', 'geometry'],  axis = 1 )

# Export
caj_loc.to_excel( r'../../_data/cajamarca_data_helth_estb.xlsx' , index = False )