# Cleaning Covid data for Mapbox: municipal

**Background**: We use Covid-19 cases data in the Philippines from data from the health department and shapefile processed through geopandas to create an interactive map. 

**Tools**: pandas, geopandas, Mapbox

Updated as of May 6, 2023

# Do your imports

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandas as pd
import fuzzy_pandas as fpd

import shapely

pd.set_option('display.max_columns', None)

# Read your CSV

In [2]:
df= pd.read_csv('../data/municipalities.csv')
df

Unnamed: 0,ProvRes,CityMunRes,CityMunRes.1
0,Abra,Bangued (Capital),1231
1,Abra,Boliney,199
2,Abra,Bucay,350
3,Abra,Bucloc,120
4,Abra,Daguioman,92
...,...,...,...
1631,Zamboanga Sibugay,Roseller Lim,198
1632,Zamboanga Sibugay,Siay,456
1633,Zamboanga Sibugay,Talusan,58
1634,Zamboanga Sibugay,Titay,491


# Cleaning the data

## Lowercase column headers

In [3]:
df.columns = df.columns.str.lower()
df.head(10)

Unnamed: 0,provres,citymunres,citymunres.1
0,Abra,Bangued (Capital),1231
1,Abra,Boliney,199
2,Abra,Bucay,350
3,Abra,Bucloc,120
4,Abra,Daguioman,92
5,Abra,Danglas,132
6,Abra,Dolores,419
7,Abra,La Paz,313
8,Abra,Lacub,43
9,Abra,Lagangilang,314


## Change names

In [4]:
df.citymunres = df.citymunres.str.replace('(Capital)', "", regex=False)
df.citymunres = df.citymunres.str.replace('Of', "of", regex=False)
df.provres = df.provres.str.replace('Samar (Western Samar)', 'Samar', regex=False)
df.provres = df.provres.str.replace('Cotabato (North Cotabato)', 'Cotabato', regex=False)
df.provres = df.provres.str.replace('Ncr', 'NCR', regex=False)
df.provres = df.provres.str.replace('Cotabato City (Not A Province)', 'Cotabato City', regex=False)
df.provres = df.provres.str.replace('City Of Isabela (Not A Province)', 'City of Isabela', regex=False)
df.citymunres = df.citymunres.str.replace(r'[(].*$', "", regex=True)

## Look for NaN values

In [5]:
df.isna().sum()

provres         0
citymunres      0
citymunres.1    0
dtype: int64

## Rename columns

This is again to match the shapefile column containing the provinces' names which is 'adm2_en'. We are also renaming the column containing the Covid-19 cases tally.

In [6]:
df= df.rename(columns={"provres": "province"})
df= df.rename(columns={"citymunres": "municipality"})
df= df.rename(columns={"citymunres.1": "covid_cases"})
df.head()

Unnamed: 0,province,municipality,covid_cases
0,Abra,Bangued,1231
1,Abra,Boliney,199
2,Abra,Bucay,350
3,Abra,Bucloc,120
4,Abra,Daguioman,92


## Merge province and municipality columns

Make a new column containing both

In [7]:
df['territory'] = df.municipality + df.province
df

Unnamed: 0,province,municipality,covid_cases,territory
0,Abra,Bangued,1231,Bangued Abra
1,Abra,Boliney,199,BolineyAbra
2,Abra,Bucay,350,BucayAbra
3,Abra,Bucloc,120,BuclocAbra
4,Abra,Daguioman,92,DaguiomanAbra
...,...,...,...,...
1631,Zamboanga Sibugay,Roseller Lim,198,Roseller LimZamboanga Sibugay
1632,Zamboanga Sibugay,Siay,456,SiayZamboanga Sibugay
1633,Zamboanga Sibugay,Talusan,58,TalusanZamboanga Sibugay
1634,Zamboanga Sibugay,Titay,491,TitayZamboanga Sibugay


# Geopandas

## Read through file

In [8]:
municipal = gpd.read_file('phl_adminboundaries_candidate_adm3.gdb.zip')
municipal.head()

Unnamed: 0,Shape_Length,Shape_Area,admin3Name_en,admin3Pcode,admin3RefName,admin3AltName1_en,admin3AltName2_en,admin2Name_en,admin2Pcode,admin1Name_en,admin1Pcode,admin0Name_en,admin0Pcode,date,validOn,validTo,geometry
0,1.601219,0.063496,Aborlan,PH175301000,,,,Palawan,PH175300000,Region IV-B,PH170000000,Philippines (the),PH,2016-06-30 00:00:00+00:00,2020-05-29 00:00:00+00:00,NaT,"MULTIPOLYGON (((118.58350 9.37700, 118.58398 9..."
1,1.078749,0.050232,Abra de Ilog,PH175101000,,,,Occidental Mindoro,PH175100000,Region IV-B,PH170000000,Philippines (the),PH,2016-06-30 00:00:00+00:00,2020-05-29 00:00:00+00:00,NaT,"MULTIPOLYGON (((120.58412 13.50198, 120.58420 ..."
2,0.424301,0.006453,Abucay,PH030801000,,,,Bataan,PH030800000,Region III,PH030000000,Philippines (the),PH,2016-06-30 00:00:00+00:00,2020-05-29 00:00:00+00:00,NaT,"MULTIPOLYGON (((120.49873 14.75614, 120.49891 ..."
3,0.566053,0.011343,Abulug,PH021501000,,,,Cagayan,PH021500000,Region II,PH020000000,Philippines (the),PH,2016-06-30 00:00:00+00:00,2020-05-29 00:00:00+00:00,NaT,"MULTIPOLYGON (((121.43455 18.46651, 121.43502 ..."
4,1.013649,0.026124,Abuyog,PH083701000,,,,Leyte,PH083700000,Region VIII,PH080000000,Philippines (the),PH,2016-06-30 00:00:00+00:00,2020-05-29 00:00:00+00:00,NaT,"MULTIPOLYGON (((125.02684 10.73500, 125.02683 ..."


## Drop unncessary columns and lower headers

In [9]:
municipal = municipal.drop(['admin3Pcode', 'admin3RefName','admin3AltName1_en', 'admin3AltName2_en', 'admin2Pcode', 'admin1Pcode', 'Shape_Length', 'Shape_Area', 'admin0Name_en', 'admin0Pcode', 'date', 'validOn', 'validTo'], axis=1)
municipal.head()

Unnamed: 0,admin3Name_en,admin2Name_en,admin1Name_en,geometry
0,Aborlan,Palawan,Region IV-B,"MULTIPOLYGON (((118.58350 9.37700, 118.58398 9..."
1,Abra de Ilog,Occidental Mindoro,Region IV-B,"MULTIPOLYGON (((120.58412 13.50198, 120.58420 ..."
2,Abucay,Bataan,Region III,"MULTIPOLYGON (((120.49873 14.75614, 120.49891 ..."
3,Abulug,Cagayan,Region II,"MULTIPOLYGON (((121.43455 18.46651, 121.43502 ..."
4,Abuyog,Leyte,Region VIII,"MULTIPOLYGON (((125.02684 10.73500, 125.02683 ..."


In [10]:
municipal.columns = municipal.columns.str.lower()
municipal

Unnamed: 0,admin3name_en,admin2name_en,admin1name_en,geometry
0,Aborlan,Palawan,Region IV-B,"MULTIPOLYGON (((118.58350 9.37700, 118.58398 9..."
1,Abra de Ilog,Occidental Mindoro,Region IV-B,"MULTIPOLYGON (((120.58412 13.50198, 120.58420 ..."
2,Abucay,Bataan,Region III,"MULTIPOLYGON (((120.49873 14.75614, 120.49891 ..."
3,Abulug,Cagayan,Region II,"MULTIPOLYGON (((121.43455 18.46651, 121.43502 ..."
4,Abuyog,Leyte,Region VIII,"MULTIPOLYGON (((125.02684 10.73500, 125.02683 ..."
...,...,...,...,...
1642,Zamboanga City,Zamboanga del Sur,Region IX,"MULTIPOLYGON (((122.05710 6.87274, 122.05724 6..."
1643,Zamboanguita,Negros Oriental,Region VII,"MULTIPOLYGON (((123.17078 9.22988, 123.18679 9..."
1644,Zaragoza,Nueva Ecija,Region III,"MULTIPOLYGON (((120.79780 15.48471, 120.80280 ..."
1645,Zarraga,Iloilo,Region VI,"MULTIPOLYGON (((122.63365 10.85890, 122.63857 ..."


## Sorting

Because of large number of cities and municipalities, we need to sort to make it easier for us to match with those in the dataset. We will be using the province column for this.

In [11]:
municipal = municipal.sort_values('admin2name_en')
municipal.head()

Unnamed: 0,admin3name_en,admin2name_en,admin1name_en,geometry
481,Danglas,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.65654 17.64573, 120.65638 ..."
525,Dolores,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.72760 17.66081, 120.72909 ..."
814,Luba,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.68143 17.39224, 120.68160 ..."
723,La Paz,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.70876 17.71055, 120.70932 ..."
150,Bangued,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.60875 17.62596, 120.61649 ..."


## Replacing names

This is so the names of provinces and municipalities are consistent with that of our dataset which we will merge with this file. We did this for the dataset as well.

In [12]:
municipal.admin2name_en = municipal.admin2name_en.str.replace('Compostela Valley', "Davao de Oro", regex=False)
municipal.admin2name_en = municipal.admin2name_en.str.replace('NCR, City of Manila, First District', "NCR", regex=False)
municipal.admin2name_en = municipal.admin2name_en.str.replace('NCR, Second District', "NCR", regex=False)
municipal.admin2name_en = municipal.admin2name_en.str.replace('NCR, Third District', "NCR", regex=False)
municipal.admin2name_en = municipal.admin2name_en.str.replace('NCR, Fourth District', "NCR", regex=False)
municipal.admin3name_en = municipal.admin3name_en.str.replace('Ozamis City', "Ozamiz City", regex=False)
municipal.admin3name_en = municipal.admin3name_en.str.replace('Quiapo', "City of Manila", regex=False) #dummy for entire Manila.

## Merge province and municipal columns

In [13]:
municipal['territory_2']= municipal.admin3name_en + municipal.admin2name_en
municipal.head()

Unnamed: 0,admin3name_en,admin2name_en,admin1name_en,geometry,territory_2
481,Danglas,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.65654 17.64573, 120.65638 ...",DanglasAbra
525,Dolores,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.72760 17.66081, 120.72909 ...",DoloresAbra
814,Luba,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.68143 17.39224, 120.68160 ...",LubaAbra
723,La Paz,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.70876 17.71055, 120.70932 ...",La PazAbra
150,Bangued,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.60875 17.62596, 120.61649 ...",BanguedAbra


In [14]:
municipal.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [15]:
municipal.dtypes

admin3name_en      object
admin2name_en      object
admin1name_en      object
geometry         geometry
territory_2        object
dtype: object

In [16]:
municipal.territory_2= municipal.territory_2.astype(str)
df.territory= df.territory.astype(str)

## Fuzzy pandas

In [17]:
#pd.set_option('display.max_rows', None)
final_df = fpd.fuzzy_merge(municipal, df,
                left_on=['territory_2'],
                right_on=['territory'],
                ignore_case=True,
                ignore_nonalpha=True,
                ignore_nonlatin=True,
                #join='full-outer',
                keep='all')
final_df

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,admin3name_en,admin2name_en,admin1name_en,geometry,territory_2,province,municipality,covid_cases,territory
0,Danglas,Abra,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,DanglasAbra,Abra,Danglas,132,DanglasAbra
1,Dolores,Abra,Cordillera Administrative Region,(POLYGON ((120.72760314700008 17.6608078750000...,DoloresAbra,Abra,Dolores,419,DoloresAbra
2,Luba,Abra,Cordillera Administrative Region,(POLYGON ((120.6814333860001 17.39223621900004...,LubaAbra,Abra,Luba,240,LubaAbra
3,La Paz,Abra,Cordillera Administrative Region,(POLYGON ((120.70875978400011 17.7105503800000...,La PazAbra,Abra,La Paz,313,La PazAbra
4,Bangued,Abra,Cordillera Administrative Region,(POLYGON ((120.6087521930001 17.62596122600007...,BanguedAbra,Abra,Bangued,1231,Bangued Abra
...,...,...,...,...,...,...,...,...,...
1628,Pitogo,Zamboanga del Sur,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,PitogoZamboanga del Sur,Zamboanga Del Sur,Pitogo,144,PitogoZamboanga Del Sur
1629,Aurora,Zamboanga del Sur,Region IX,(POLYGON ((123.54540352700008 7.99904253200003...,AuroraZamboanga del Sur,Zamboanga Del Sur,Aurora,670,AuroraZamboanga Del Sur
1630,San Pablo,Zamboanga del Sur,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,San PabloZamboanga del Sur,Zamboanga Del Sur,San Pablo,139,San PabloZamboanga Del Sur
1631,Dumingag,Zamboanga del Sur,Region IX,(POLYGON ((123.27956563500004 8.23652138300002...,DumingagZamboanga del Sur,Zamboanga Del Sur,Dumingag,328,DumingagZamboanga Del Sur


#### Note: 9,163 cases from different provinces did not have municipal locations.

## Read population data

In [18]:
df2 = pd.read_excel('population.xlsx', sheet_name="municipality")
df2

Unnamed: 0,municipality,province,population
0,CITY OF MANILA,NCR,1846513
1,CITY OF MANDALUYONG,NCR,425758
2,CITY OF MARIKINA,NCR,456059
3,CITY OF PASIG,NCR,803159
4,QUEZON CITY,NCR,2960048
...,...,...,...
1629,TAGBINA,Surigao del Sur,41051
1630,TAGO,Surigao del Sur,39831
1631,CITY OF TANDAG,Surigao del Sur,62669
1632,CITY OF ISABELA,City of Isabela,130379


## Clean data a bit

In [19]:
df2.municipality = df2.municipality.str.title() #capitalizes and lower case the rest of the name an
df2.municipality = df2.municipality.str.replace('Of', "of", regex=True)
df2.municipality = df2.municipality.str.replace('Del', "del", regex=True)
df2.municipality = df2.municipality.str.replace(r'[(].*$', "", regex=True) #removes all characters after (

## Merge 'province' and 'municipal' columns

In [20]:
df2['territory_3']= df2.municipality + df2.province
df2

Unnamed: 0,municipality,province,population,territory_3
0,City of Manila,NCR,1846513,City of ManilaNCR
1,City of Mandaluyong,NCR,425758,City of MandaluyongNCR
2,City of Marikina,NCR,456059,City of MarikinaNCR
3,City of Pasig,NCR,803159,City of PasigNCR
4,Quezon City,NCR,2960048,Quezon CityNCR
...,...,...,...,...
1629,Tagbina,Surigao del Sur,41051,TagbinaSurigao del Sur
1630,Tago,Surigao del Sur,39831,TagoSurigao del Sur
1631,City of Tandag,Surigao del Sur,62669,City of Tandag Surigao del Sur
1632,City of Isabela,City of Isabela,130379,City of IsabelaCity of Isabela


In [21]:
df2.territory_3= df2.territory_3.astype(str)

## Fuzzy pandas... again

This time to combine existing df with our population data.

In [22]:
final_df2 = fpd.fuzzy_merge(final_df, df2,
                left_on=['territory'],
                right_on=['territory_3'],
                ignore_case=True,
                ignore_nonalpha=True,
                ignore_nonlatin=True,
                #join='full-outer',
                keep='all')
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,admin3name_en,admin2name_en,admin1name_en,geometry,territory_2,province,municipality,covid_cases,territory,municipality.1,province.1,population,territory_3
0,Danglas,Abra,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,DanglasAbra,Abra,Danglas,132,DanglasAbra,Danglas,Abra,4074,DanglasAbra
1,Dolores,Abra,Cordillera Administrative Region,(POLYGON ((120.72760314700008 17.6608078750000...,DoloresAbra,Abra,Dolores,419,DoloresAbra,Dolores,Abra,11512,DoloresAbra
2,Luba,Abra,Cordillera Administrative Region,(POLYGON ((120.6814333860001 17.39223621900004...,LubaAbra,Abra,Luba,240,LubaAbra,Luba,Abra,6518,LubaAbra
3,La Paz,Abra,Cordillera Administrative Region,(POLYGON ((120.70875978400011 17.7105503800000...,La PazAbra,Abra,La Paz,313,La PazAbra,La Paz,Abra,16493,La PazAbra
4,Bangued,Abra,Cordillera Administrative Region,(POLYGON ((120.6087521930001 17.62596122600007...,BanguedAbra,Abra,Bangued,1231,Bangued Abra,Bangued,Abra,50382,BanguedAbra
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,Pitogo,Zamboanga del Sur,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,PitogoZamboanga del Sur,Zamboanga Del Sur,Pitogo,144,PitogoZamboanga Del Sur,Pitogo,Zamboanga del Sur,27516,PitogoZamboanga del Sur
1629,Aurora,Zamboanga del Sur,Region IX,(POLYGON ((123.54540352700008 7.99904253200003...,AuroraZamboanga del Sur,Zamboanga Del Sur,Aurora,670,AuroraZamboanga Del Sur,Aurora,Zamboanga del Sur,52995,AuroraZamboanga del Sur
1630,San Pablo,Zamboanga del Sur,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,San PabloZamboanga del Sur,Zamboanga Del Sur,San Pablo,139,San PabloZamboanga Del Sur,San Pablo,Zamboanga del Sur,26648,San PabloZamboanga del Sur
1631,Dumingag,Zamboanga del Sur,Region IX,(POLYGON ((123.27956563500004 8.23652138300002...,DumingagZamboanga del Sur,Zamboanga Del Sur,Dumingag,328,DumingagZamboanga Del Sur,Dumingag,Zamboanga del Sur,48881,DumingagZamboanga del Sur


## Drop extra columns

In [23]:
final_df2 = final_df2.drop(['admin2name_en', 'admin3name_en', 'territory_2', 'territory', 'territory_3'], axis=1)
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,admin1name_en,geometry,province,municipality,covid_cases,municipality.1,province.1,population
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,132,Danglas,Abra,4074
1,Cordillera Administrative Region,(POLYGON ((120.72760314700008 17.6608078750000...,Abra,Dolores,419,Dolores,Abra,11512
2,Cordillera Administrative Region,(POLYGON ((120.6814333860001 17.39223621900004...,Abra,Luba,240,Luba,Abra,6518
3,Cordillera Administrative Region,(POLYGON ((120.70875978400011 17.7105503800000...,Abra,La Paz,313,La Paz,Abra,16493
4,Cordillera Administrative Region,(POLYGON ((120.6087521930001 17.62596122600007...,Abra,Bangued,1231,Bangued,Abra,50382
...,...,...,...,...,...,...,...,...
1628,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,144,Pitogo,Zamboanga del Sur,27516
1629,Region IX,(POLYGON ((123.54540352700008 7.99904253200003...,Zamboanga Del Sur,Aurora,670,Aurora,Zamboanga del Sur,52995
1630,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,139,San Pablo,Zamboanga del Sur,26648
1631,Region IX,(POLYGON ((123.27956563500004 8.23652138300002...,Zamboanga Del Sur,Dumingag,328,Dumingag,Zamboanga del Sur,48881


## Rename columns

We are renaming the columns so that we can drop redundant ones with the same names.

In [24]:
final_df2.columns= ['admin1name_en', 'geometry', 'province', 'municipality', 'covid_cases', 'municipality2', 'province2', 'population']
final_df2 = final_df2.drop(['municipality2', 'province2'], axis=1)
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,admin1name_en,geometry,province,municipality,covid_cases,population
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,132,4074
1,Cordillera Administrative Region,(POLYGON ((120.72760314700008 17.6608078750000...,Abra,Dolores,419,11512
2,Cordillera Administrative Region,(POLYGON ((120.6814333860001 17.39223621900004...,Abra,Luba,240,6518
3,Cordillera Administrative Region,(POLYGON ((120.70875978400011 17.7105503800000...,Abra,La Paz,313,16493
4,Cordillera Administrative Region,(POLYGON ((120.6087521930001 17.62596122600007...,Abra,Bangued,1231,50382
...,...,...,...,...,...,...
1628,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,144,27516
1629,Region IX,(POLYGON ((123.54540352700008 7.99904253200003...,Zamboanga Del Sur,Aurora,670,52995
1630,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,139,26648
1631,Region IX,(POLYGON ((123.27956563500004 8.23652138300002...,Zamboanga Del Sur,Dumingag,328,48881


## Compute for population ratio

We do this by dividing the number of Covid-19 cases to total population per municipality and then multiply by 10,000. That would give us cases per 10,000 people in the area.

In [25]:
final_df2 ['case_per_pop'] = final_df2.covid_cases / final_df2.population * 10000
final_df2 = final_df2.round(1)
final_df2.head()

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,admin1name_en,geometry,province,municipality,covid_cases,population,case_per_pop
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,132,4074,324.0
1,Cordillera Administrative Region,(POLYGON ((120.72760314700008 17.6608078750000...,Abra,Dolores,419,11512,364.0
2,Cordillera Administrative Region,(POLYGON ((120.6814333860001 17.39223621900004...,Abra,Luba,240,6518,368.2
3,Cordillera Administrative Region,(POLYGON ((120.70875978400011 17.7105503800000...,Abra,La Paz,313,16493,189.8
4,Cordillera Administrative Region,(POLYGON ((120.6087521930001 17.62596122600007...,Abra,Bangued,1231,50382,244.3


In [26]:
final_df2.sort_values('case_per_pop', ascending=False)

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,admin1name_en,geometry,province,municipality,covid_cases,population,case_per_pop
312,Region II,(POLYGON ((121.0356007790001 18.62039078100002...,Cagayan,Santa Praxedes,864,4434,1948.6
1029,National Capital Region,(POLYGON ((121.02626267700009 14.6122737330000...,NCR,City of San Juan,24211,126347,1916.2
1025,National Capital Region,(POLYGON ((121.0699916430001 14.55227279900003...,NCR,Pateros,10864,65227,1665.6
1023,National Capital Region,(POLYGON ((121.03467973800002 14.5672476350000...,NCR,City of Makati,96342,629616,1530.2
1027,National Capital Region,(POLYGON ((120.98042364000003 14.5615638960000...,NCR,Pasay City,58925,440656,1337.2
...,...,...,...,...,...,...,...
129,Autonomous Region in Muslim Mindanao,(POLYGON ((121.99379767100004 6.28772321100007...,Basilan,Tabuan-Lasa,5,29327,1.7
829,Autonomous Region in Muslim Mindanao,(POLYGON ((124.62352208900006 7.88802177200005...,Lanao Del Sur,Bumbaran,2,12124,1.6
1478,Autonomous Region in Muslim Mindanao,(POLYGON ((121.01278876900005 5.68534327400004...,Sulu,Tapul,3,20799,1.4
1538,Autonomous Region in Muslim Mindanao,(POLYGON ((118.4009098890001 6.878896047000069...,Tawi-Tawi,Mapun,3,30038,1.0


## Create bins for cases

The bins will allow us to categorize the number of cases, necessary for mapping later.

In [27]:
final_df2['percentiles'] = pd.cut(np.array(final_df2['case_per_pop']),
       [0, 201, 401, 601, 801, 1001, 1201, 1401, 1601, 1801, 2000], labels=["0-200", "201-400", "401-600", "601-800", "801-1000", "1001-1200", "1201-1400", "1401-1600", "1601-1800", "1801-2000"])
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,admin1name_en,geometry,province,municipality,covid_cases,population,case_per_pop,percentiles
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,132,4074,324.0,201-400
1,Cordillera Administrative Region,(POLYGON ((120.72760314700008 17.6608078750000...,Abra,Dolores,419,11512,364.0,201-400
2,Cordillera Administrative Region,(POLYGON ((120.6814333860001 17.39223621900004...,Abra,Luba,240,6518,368.2,201-400
3,Cordillera Administrative Region,(POLYGON ((120.70875978400011 17.7105503800000...,Abra,La Paz,313,16493,189.8,0-200
4,Cordillera Administrative Region,(POLYGON ((120.6087521930001 17.62596122600007...,Abra,Bangued,1231,50382,244.3,201-400
...,...,...,...,...,...,...,...,...
1628,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,144,27516,52.3,0-200
1629,Region IX,(POLYGON ((123.54540352700008 7.99904253200003...,Zamboanga Del Sur,Aurora,670,52995,126.4,0-200
1630,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,139,26648,52.2,0-200
1631,Region IX,(POLYGON ((123.27956563500004 8.23652138300002...,Zamboanga Del Sur,Dumingag,328,48881,67.1,0-200


**Additional step**: Convert the contents of the percentiles into string. Not doing so will not be read by the GEOJSON file.

In [28]:
final_df2.percentiles = final_df2.percentiles.astype(str)
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,admin1name_en,geometry,province,municipality,covid_cases,population,case_per_pop,percentiles
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,132,4074,324.0,201-400
1,Cordillera Administrative Region,(POLYGON ((120.72760314700008 17.6608078750000...,Abra,Dolores,419,11512,364.0,201-400
2,Cordillera Administrative Region,(POLYGON ((120.6814333860001 17.39223621900004...,Abra,Luba,240,6518,368.2,201-400
3,Cordillera Administrative Region,(POLYGON ((120.70875978400011 17.7105503800000...,Abra,La Paz,313,16493,189.8,0-200
4,Cordillera Administrative Region,(POLYGON ((120.6087521930001 17.62596122600007...,Abra,Bangued,1231,50382,244.3,201-400
...,...,...,...,...,...,...,...,...
1628,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,144,27516,52.3,0-200
1629,Region IX,(POLYGON ((123.54540352700008 7.99904253200003...,Zamboanga Del Sur,Aurora,670,52995,126.4,0-200
1630,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,139,26648,52.2,0-200
1631,Region IX,(POLYGON ((123.27956563500004 8.23652138300002...,Zamboanga Del Sur,Dumingag,328,48881,67.1,0-200


# Convert 'geometry' column back to a geopandas df

We lost that when we ran fuzzy pandas.

In [29]:
final_df2 = gpd.GeoDataFrame(final_df2, crs="EPSG:4326", geometry=final_df2.geometry)

In [30]:
final_df2

Unnamed: 0,admin1name_en,geometry,province,municipality,covid_cases,population,case_per_pop,percentiles
0,Cordillera Administrative Region,"MULTIPOLYGON (((120.65654 17.64573, 120.65638 ...",Abra,Danglas,132,4074,324.0,201-400
1,Cordillera Administrative Region,"MULTIPOLYGON (((120.72760 17.66081, 120.72909 ...",Abra,Dolores,419,11512,364.0,201-400
2,Cordillera Administrative Region,"MULTIPOLYGON (((120.68143 17.39224, 120.68160 ...",Abra,Luba,240,6518,368.2,201-400
3,Cordillera Administrative Region,"MULTIPOLYGON (((120.70876 17.71055, 120.70932 ...",Abra,La Paz,313,16493,189.8,0-200
4,Cordillera Administrative Region,"MULTIPOLYGON (((120.60875 17.62596, 120.61649 ...",Abra,Bangued,1231,50382,244.3,201-400
...,...,...,...,...,...,...,...,...
1628,Region IX,"MULTIPOLYGON (((123.32953 7.36853, 123.32910 7...",Zamboanga Del Sur,Pitogo,144,27516,52.3,0-200
1629,Region IX,"MULTIPOLYGON (((123.54540 7.99904, 123.54673 7...",Zamboanga Del Sur,Aurora,670,52995,126.4,0-200
1630,Region IX,"MULTIPOLYGON (((123.45763 7.61576, 123.45765 7...",Zamboanga Del Sur,San Pablo,139,26648,52.2,0-200
1631,Region IX,"MULTIPOLYGON (((123.27957 8.23652, 123.30633 8...",Zamboanga Del Sur,Dumingag,328,48881,67.1,0-200


# Save as GEOJSON file

In [31]:
# final_df2.to_file('municipal.geojson', driver='GeoJSON')

# Simplified file

So we were successful in combining geometry files with our dataset, but the file is too big. We, therefore, use [mapshaper](https://mapshaper.org/) to simplify the precision of the map so that we have a smaller map size.

Below is the simplified json file. 

In [46]:
simplified = gpd.read_file('municipal.json')
simplified

Unnamed: 0,admin1name_en,province,municipality,covid_cases,population,case_per_pop,percentiles,geometry
0,Cordillera Administrative Region,Abra,Danglas,132,4074,324.0,201-400,"MULTIPOLYGON (((120.65654 17.64573, 120.65656 ..."
1,Cordillera Administrative Region,Abra,Dolores,419,11512,364.0,201-400,"POLYGON ((120.77803 17.66674, 120.76612 17.666..."
2,Cordillera Administrative Region,Abra,Luba,240,6518,368.2,201-400,"POLYGON ((120.68189 17.39219, 120.68134 17.392..."
3,Cordillera Administrative Region,Abra,La Paz,313,16493,189.8,0-200,"POLYGON ((120.70876 17.71055, 120.70783 17.710..."
4,Cordillera Administrative Region,Abra,Bangued,1231,50382,244.3,201-400,"POLYGON ((120.61921 17.62751, 120.61748 17.626..."
...,...,...,...,...,...,...,...,...
1628,Region IX,Zamboanga Del Sur,Pitogo,144,27516,52.3,0-200,"MULTIPOLYGON (((123.32953 7.36853, 123.32953 7..."
1629,Region IX,Zamboanga Del Sur,Aurora,670,52995,126.4,0-200,"POLYGON ((123.64055 7.98886, 123.63926 7.98924..."
1630,Region IX,Zamboanga Del Sur,San Pablo,139,26648,52.2,0-200,"MULTIPOLYGON (((123.45763 7.61576, 123.45751 7..."
1631,Region IX,Zamboanga Del Sur,Dumingag,328,48881,67.1,0-200,"POLYGON ((123.38109 8.22467, 123.37773 8.22477..."


## Convert to GEOJSON

In [47]:
simplified.to_file('simplified_municipalities.geojson', driver='GeoJSON')

In [48]:
simplified

Unnamed: 0,admin1name_en,province,municipality,covid_cases,population,case_per_pop,percentiles,geometry
0,Cordillera Administrative Region,Abra,Danglas,132,4074,324.0,201-400,"MULTIPOLYGON (((120.65654 17.64573, 120.65656 ..."
1,Cordillera Administrative Region,Abra,Dolores,419,11512,364.0,201-400,"POLYGON ((120.77803 17.66674, 120.76612 17.666..."
2,Cordillera Administrative Region,Abra,Luba,240,6518,368.2,201-400,"POLYGON ((120.68189 17.39219, 120.68134 17.392..."
3,Cordillera Administrative Region,Abra,La Paz,313,16493,189.8,0-200,"POLYGON ((120.70876 17.71055, 120.70783 17.710..."
4,Cordillera Administrative Region,Abra,Bangued,1231,50382,244.3,201-400,"POLYGON ((120.61921 17.62751, 120.61748 17.626..."
...,...,...,...,...,...,...,...,...
1628,Region IX,Zamboanga Del Sur,Pitogo,144,27516,52.3,0-200,"MULTIPOLYGON (((123.32953 7.36853, 123.32953 7..."
1629,Region IX,Zamboanga Del Sur,Aurora,670,52995,126.4,0-200,"POLYGON ((123.64055 7.98886, 123.63926 7.98924..."
1630,Region IX,Zamboanga Del Sur,San Pablo,139,26648,52.2,0-200,"MULTIPOLYGON (((123.45763 7.61576, 123.45751 7..."
1631,Region IX,Zamboanga Del Sur,Dumingag,328,48881,67.1,0-200,"POLYGON ((123.38109 8.22467, 123.37773 8.22477..."


## Clean CSV for uploading to website

In [49]:
simplified = simplified.drop(['population', 'geometry', 'percentiles'], axis=1)
simplified.columns = ['Region', 'Province', 'Municipality', 'Covid-19 cases', 'Case per 10,000 population']
simplified

Unnamed: 0,Region,Province,Municipality,Covid-19 cases,"Case per 10,000 population"
0,Cordillera Administrative Region,Abra,Danglas,132,324.0
1,Cordillera Administrative Region,Abra,Dolores,419,364.0
2,Cordillera Administrative Region,Abra,Luba,240,368.2
3,Cordillera Administrative Region,Abra,La Paz,313,189.8
4,Cordillera Administrative Region,Abra,Bangued,1231,244.3
...,...,...,...,...,...
1628,Region IX,Zamboanga Del Sur,Pitogo,144,52.3
1629,Region IX,Zamboanga Del Sur,Aurora,670,126.4
1630,Region IX,Zamboanga Del Sur,San Pablo,139,52.2
1631,Region IX,Zamboanga Del Sur,Dumingag,328,67.1


In [50]:
simplified = simplified[['Municipality', 'Province', 'Region', 'Covid-19 cases', 'Case per 10,000 population']]
simplified

Unnamed: 0,Municipality,Province,Region,Covid-19 cases,"Case per 10,000 population"
0,Danglas,Abra,Cordillera Administrative Region,132,324.0
1,Dolores,Abra,Cordillera Administrative Region,419,364.0
2,Luba,Abra,Cordillera Administrative Region,240,368.2
3,La Paz,Abra,Cordillera Administrative Region,313,189.8
4,Bangued,Abra,Cordillera Administrative Region,1231,244.3
...,...,...,...,...,...
1628,Pitogo,Zamboanga Del Sur,Region IX,144,52.3
1629,Aurora,Zamboanga Del Sur,Region IX,670,126.4
1630,San Pablo,Zamboanga Del Sur,Region IX,139,52.2
1631,Dumingag,Zamboanga Del Sur,Region IX,328,67.1


In [51]:
simplified.to_csv('../data/municipalities.csv', index=False)