# Cleaning Covid data for Mapbox: municipal

**Background**: We use Covid-19 cases data in the Philippines from data from the health department and shapefile processed through geopandas to create an interactive map. 

**Tools**: pandas, geopandas, Mapbox

# Do your imports

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandas as pd
import fuzzy_pandas as fpd

import shapely

pd.set_option('display.max_columns', None)

# Read your CSV

In [2]:
df= pd.read_csv('municipalities.csv')
df

Unnamed: 0,ProvRes,CityMunRes,CityMunRes.1
0,Abra,Bangued (Capital),1082
1,Abra,Dolores,399
2,Abra,Sallapadan,389
3,Abra,Bucay,317
4,Abra,Pidigan,299
...,...,...,...
1709,Zamboanga Sibugay,Olutanga,107
1710,Zamboanga Sibugay,Payao,96
1711,Zamboanga Sibugay,Mabuhay,95
1712,Zamboanga Sibugay,Talusan,50


# Cleaning the data

## Lowercase column headers

In [3]:
df.columns = df.columns.str.lower()
df.head(10)

Unnamed: 0,provres,citymunres,citymunres.1
0,Abra,Bangued (Capital),1082
1,Abra,Dolores,399
2,Abra,Sallapadan,389
3,Abra,Bucay,317
4,Abra,Pidigan,299
5,Abra,Lagangilang,270
6,Abra,Lagayan,266
7,Abra,La Paz,261
8,Abra,Villaviciosa,258
9,Abra,San Juan,251


## Change names

In [4]:
df.citymunres = df.citymunres.str.replace('(Capital)', "", regex=False)
df.citymunres = df.citymunres.str.replace('Of', "of", regex=False)
df.provres = df.provres.str.replace('Samar (Western Samar)', 'Samar', regex=False)
df.provres = df.provres.str.replace('Cotabato (North Cotabato)', 'Cotabato', regex=False)
df.provres = df.provres.str.replace('Ncr', 'NCR', regex=False)
df.provres = df.provres.str.replace('Cotabato City (Not A Province)', 'Cotabato City', regex=False)
df.provres = df.provres.str.replace('City Of Isabela (Not A Province)', 'City of Isabela', regex=False)
df.citymunres = df.citymunres.str.replace(r'[(].*$', "", regex=True)

## Look for NaN values

In [5]:
df.isna().sum()

provres          0
citymunres      79
citymunres.1     0
dtype: int64

## Rename columns

This is again to match the shapefile column containing the provinces' names which is 'adm2_en'. We are also renaming the column containing the Covid-19 cases tally.

In [6]:
df= df.rename(columns={"provres": "province"})
df= df.rename(columns={"citymunres": "municipality"})
df= df.rename(columns={"citymunres.1": "covid_cases"})
df.head()

Unnamed: 0,province,municipality,covid_cases
0,Abra,Bangued,1082
1,Abra,Dolores,399
2,Abra,Sallapadan,389
3,Abra,Bucay,317
4,Abra,Pidigan,299


## Merge province and municipality columns

Make a new column containing both

In [7]:
df['territory'] = df.municipality + df.province
df

Unnamed: 0,province,municipality,covid_cases,territory
0,Abra,Bangued,1082,Bangued Abra
1,Abra,Dolores,399,DoloresAbra
2,Abra,Sallapadan,389,SallapadanAbra
3,Abra,Bucay,317,BucayAbra
4,Abra,Pidigan,299,PidiganAbra
...,...,...,...,...
1709,Zamboanga Sibugay,Olutanga,107,OlutangaZamboanga Sibugay
1710,Zamboanga Sibugay,Payao,96,PayaoZamboanga Sibugay
1711,Zamboanga Sibugay,Mabuhay,95,MabuhayZamboanga Sibugay
1712,Zamboanga Sibugay,Talusan,50,TalusanZamboanga Sibugay


# Geopandas

## Read through file

In [8]:
municipal = gpd.read_file('municipal_map.zip')
municipal.head()

Unnamed: 0,Shape_Leng,Shape_Area,ADM3_EN,ADM3_PCODE,ADM3_REF,ADM3ALT1EN,ADM3ALT2EN,ADM2_EN,ADM2_PCODE,ADM1_EN,ADM1_PCODE,ADM0_EN,ADM0_PCODE,date,validOn,validTo,geometry
0,1.601219,0.063496,Aborlan,PH175301000,,,,Palawan,PH175300000,Region IV-B,PH170000000,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((118.58350 9.37700, 118.58398 9..."
1,1.078749,0.050232,Abra de Ilog,PH175101000,,,,Occidental Mindoro,PH175100000,Region IV-B,PH170000000,Philippines (the),PH,2016-06-30,2020-05-29,,"POLYGON ((120.58412 13.50198, 120.58420 13.501..."
2,0.424301,0.006453,Abucay,PH030801000,,,,Bataan,PH030800000,Region III,PH030000000,Philippines (the),PH,2016-06-30,2020-05-29,,"POLYGON ((120.49873 14.75614, 120.49891 14.755..."
3,0.566053,0.011343,Abulug,PH021501000,,,,Cagayan,PH021500000,Region II,PH020000000,Philippines (the),PH,2016-06-30,2020-05-29,,"POLYGON ((121.43455 18.46651, 121.43502 18.466..."
4,1.013649,0.026124,Abuyog,PH083701000,,,,Leyte,PH083700000,Region VIII,PH080000000,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((125.02684 10.73500, 125.02683 ..."


## Drop unncessary columns and lower headers

In [9]:
municipal = municipal.drop(['ADM3_PCODE', 'ADM3_REF','ADM3ALT1EN', 'ADM3ALT2EN', 'ADM2_PCODE', 'ADM1_PCODE', 'Shape_Leng', 'Shape_Area', 'ADM0_EN', 'date', 'validOn', 'validTo'], axis=1)
municipal.head()

Unnamed: 0,ADM3_EN,ADM2_EN,ADM1_EN,ADM0_PCODE,geometry
0,Aborlan,Palawan,Region IV-B,PH,"MULTIPOLYGON (((118.58350 9.37700, 118.58398 9..."
1,Abra de Ilog,Occidental Mindoro,Region IV-B,PH,"POLYGON ((120.58412 13.50198, 120.58420 13.501..."
2,Abucay,Bataan,Region III,PH,"POLYGON ((120.49873 14.75614, 120.49891 14.755..."
3,Abulug,Cagayan,Region II,PH,"POLYGON ((121.43455 18.46651, 121.43502 18.466..."
4,Abuyog,Leyte,Region VIII,PH,"MULTIPOLYGON (((125.02684 10.73500, 125.02683 ..."


In [10]:
municipal = municipal.drop('ADM0_PCODE', axis=1)

In [11]:
municipal.columns = municipal.columns.str.lower()
municipal

Unnamed: 0,adm3_en,adm2_en,adm1_en,geometry
0,Aborlan,Palawan,Region IV-B,"MULTIPOLYGON (((118.58350 9.37700, 118.58398 9..."
1,Abra de Ilog,Occidental Mindoro,Region IV-B,"POLYGON ((120.58412 13.50198, 120.58420 13.501..."
2,Abucay,Bataan,Region III,"POLYGON ((120.49873 14.75614, 120.49891 14.755..."
3,Abulug,Cagayan,Region II,"POLYGON ((121.43455 18.46651, 121.43502 18.466..."
4,Abuyog,Leyte,Region VIII,"MULTIPOLYGON (((125.02684 10.73500, 125.02683 ..."
...,...,...,...,...
1642,Zamboanga City,Zamboanga del Sur,Region IX,"MULTIPOLYGON (((122.05710 6.87274, 122.05724 6..."
1643,Zamboanguita,Negros Oriental,Region VII,"POLYGON ((123.17078 9.22988, 123.18679 9.21804..."
1644,Zaragoza,Nueva Ecija,Region III,"POLYGON ((120.79780 15.48471, 120.80280 15.480..."
1645,Zarraga,Iloilo,Region VI,"POLYGON ((122.63365 10.85890, 122.63857 10.857..."


## Sorting

Because of large number of cities and municipalities, we need to sort to make it easier for us to match with those in the dataset. We will be using the province column for this.

In [12]:
municipal = municipal.sort_values('adm2_en')
municipal.head()

Unnamed: 0,adm3_en,adm2_en,adm1_en,geometry
481,Danglas,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.65654 17.64573, 120.65638 ..."
525,Dolores,Abra,Cordillera Administrative Region,"POLYGON ((120.72760 17.66081, 120.72909 17.659..."
814,Luba,Abra,Cordillera Administrative Region,"POLYGON ((120.68143 17.39224, 120.68160 17.392..."
723,La Paz,Abra,Cordillera Administrative Region,"POLYGON ((120.70876 17.71055, 120.70932 17.709..."
150,Bangued,Abra,Cordillera Administrative Region,"POLYGON ((120.60875 17.62596, 120.61649 17.625..."


## Replacing names

This is so the names of provinces and municipalities are consistent with that of our dataset which we will merge with this file. We did this for the dataset as well.

In [13]:
municipal.adm2_en = municipal.adm2_en.str.replace('Compostela Valley', "Davao de Oro", regex=False)
municipal.adm2_en = municipal.adm2_en.str.replace('NCR, City of Manila, First District', "NCR", regex=False)
municipal.adm2_en = municipal.adm2_en.str.replace('NCR, Second District', "NCR", regex=False)
municipal.adm2_en = municipal.adm2_en.str.replace('NCR, Third District', "NCR", regex=False)
municipal.adm2_en = municipal.adm2_en.str.replace('NCR, Fourth District', "NCR", regex=False)
municipal.adm3_en = municipal.adm3_en.str.replace('Ozamis City', "Ozamiz City", regex=False)
municipal.adm3_en = municipal.adm3_en.str.replace('Quiapo', "City of Manila", regex=False) #dummy for entire Manila.

## Merge province and municipal columns

In [14]:
municipal['territory_2']= municipal.adm3_en + municipal.adm2_en
municipal.head()

Unnamed: 0,adm3_en,adm2_en,adm1_en,geometry,territory_2
481,Danglas,Abra,Cordillera Administrative Region,"MULTIPOLYGON (((120.65654 17.64573, 120.65638 ...",DanglasAbra
525,Dolores,Abra,Cordillera Administrative Region,"POLYGON ((120.72760 17.66081, 120.72909 17.659...",DoloresAbra
814,Luba,Abra,Cordillera Administrative Region,"POLYGON ((120.68143 17.39224, 120.68160 17.392...",LubaAbra
723,La Paz,Abra,Cordillera Administrative Region,"POLYGON ((120.70876 17.71055, 120.70932 17.709...",La PazAbra
150,Bangued,Abra,Cordillera Administrative Region,"POLYGON ((120.60875 17.62596, 120.61649 17.625...",BanguedAbra


In [15]:
municipal.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [16]:
municipal.dtypes

adm3_en          object
adm2_en          object
adm1_en          object
geometry       geometry
territory_2      object
dtype: object

In [17]:
municipal.territory_2= municipal.territory_2.astype(str)
df.territory= df.territory.astype(str)

## Fuzzy pandas

In [18]:
#pd.set_option('display.max_rows', None)
final_df = fpd.fuzzy_merge(municipal, df,
                left_on=['territory_2'],
                right_on=['territory'],
                ignore_case=True,
                ignore_nonalpha=True,
                ignore_nonlatin=True,
                #join='full-outer',
                keep='all')
final_df

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,adm3_en,adm2_en,adm1_en,geometry,territory_2,province,municipality,covid_cases,territory
0,Danglas,Abra,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,DanglasAbra,Abra,Danglas,126,DanglasAbra
1,Dolores,Abra,Cordillera Administrative Region,POLYGON ((120.72760314700008 17.66080787500004...,DoloresAbra,Abra,Dolores,399,DoloresAbra
2,Luba,Abra,Cordillera Administrative Region,"POLYGON ((120.6814333860001 17.39223621900004,...",LubaAbra,Abra,Luba,232,LubaAbra
3,La Paz,Abra,Cordillera Administrative Region,POLYGON ((120.70875978400011 17.71055038000003...,La PazAbra,Abra,La Paz,261,La PazAbra
4,Bangued,Abra,Cordillera Administrative Region,POLYGON ((120.6087521930001 17.625961226000072...,BanguedAbra,Abra,Bangued,1082,Bangued Abra
...,...,...,...,...,...,...,...,...,...
1627,Pitogo,Zamboanga del Sur,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,PitogoZamboanga del Sur,Zamboanga Del Sur,Pitogo,122,PitogoZamboanga Del Sur
1628,Aurora,Zamboanga del Sur,Region IX,POLYGON ((123.54540352700008 7.999042532000033...,AuroraZamboanga del Sur,Zamboanga Del Sur,Aurora,620,AuroraZamboanga Del Sur
1629,San Pablo,Zamboanga del Sur,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,San PabloZamboanga del Sur,Zamboanga Del Sur,San Pablo,123,San PabloZamboanga Del Sur
1630,Dumingag,Zamboanga del Sur,Region IX,POLYGON ((123.27956563500004 8.236521383000024...,DumingagZamboanga del Sur,Zamboanga Del Sur,Dumingag,298,DumingagZamboanga Del Sur


#### Note: 9,163 cases from different provinces did not have municipal locations.

## Read population data

In [19]:
df2 = pd.read_excel('population.xlsx', sheet_name="municipality")
df2

Unnamed: 0,municipality,province,population
0,CITY OF MANILA,NCR,1846513
1,CITY OF MANDALUYONG,NCR,425758
2,CITY OF MARIKINA,NCR,456059
3,CITY OF PASIG,NCR,803159
4,QUEZON CITY,NCR,2960048
...,...,...,...
1629,TAGBINA,Surigao del Sur,41051
1630,TAGO,Surigao del Sur,39831
1631,CITY OF TANDAG,Surigao del Sur,62669
1632,CITY OF ISABELA,City of Isabela,130379


## Clean data a bit

In [20]:
df2.municipality = df2.municipality.str.title() #capitalizes and lower case the rest of the name an
df2.municipality = df2.municipality.str.replace('Of', "of", regex=True)
df2.municipality = df2.municipality.str.replace('Del', "del", regex=True)
df2.municipality = df2.municipality.str.replace(r'[(].*$', "", regex=True) #removes all characters after (

## Merge 'province' and 'municipal' columns

In [21]:
df2['territory_3']= df2.municipality + df2.province
df2

Unnamed: 0,municipality,province,population,territory_3
0,City of Manila,NCR,1846513,City of ManilaNCR
1,City of Mandaluyong,NCR,425758,City of MandaluyongNCR
2,City of Marikina,NCR,456059,City of MarikinaNCR
3,City of Pasig,NCR,803159,City of PasigNCR
4,Quezon City,NCR,2960048,Quezon CityNCR
...,...,...,...,...
1629,Tagbina,Surigao del Sur,41051,TagbinaSurigao del Sur
1630,Tago,Surigao del Sur,39831,TagoSurigao del Sur
1631,City of Tandag,Surigao del Sur,62669,City of Tandag Surigao del Sur
1632,City of Isabela,City of Isabela,130379,City of IsabelaCity of Isabela


In [22]:
df2.territory_3= df2.territory_3.astype(str)

## Fuzzy pandas... again

This time to combine existing df with our population data.

In [23]:
final_df2 = fpd.fuzzy_merge(final_df, df2,
                left_on=['territory'],
                right_on=['territory_3'],
                ignore_case=True,
                ignore_nonalpha=True,
                ignore_nonlatin=True,
                #join='full-outer',
                keep='all')
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,adm3_en,adm2_en,adm1_en,geometry,territory_2,province,municipality,covid_cases,territory,municipality.1,province.1,population,territory_3
0,Danglas,Abra,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,DanglasAbra,Abra,Danglas,126,DanglasAbra,Danglas,Abra,4074,DanglasAbra
1,Dolores,Abra,Cordillera Administrative Region,POLYGON ((120.72760314700008 17.66080787500004...,DoloresAbra,Abra,Dolores,399,DoloresAbra,Dolores,Abra,11512,DoloresAbra
2,Luba,Abra,Cordillera Administrative Region,"POLYGON ((120.6814333860001 17.39223621900004,...",LubaAbra,Abra,Luba,232,LubaAbra,Luba,Abra,6518,LubaAbra
3,La Paz,Abra,Cordillera Administrative Region,POLYGON ((120.70875978400011 17.71055038000003...,La PazAbra,Abra,La Paz,261,La PazAbra,La Paz,Abra,16493,La PazAbra
4,Bangued,Abra,Cordillera Administrative Region,POLYGON ((120.6087521930001 17.625961226000072...,BanguedAbra,Abra,Bangued,1082,Bangued Abra,Bangued,Abra,50382,BanguedAbra
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1627,Pitogo,Zamboanga del Sur,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,PitogoZamboanga del Sur,Zamboanga Del Sur,Pitogo,122,PitogoZamboanga Del Sur,Pitogo,Zamboanga del Sur,27516,PitogoZamboanga del Sur
1628,Aurora,Zamboanga del Sur,Region IX,POLYGON ((123.54540352700008 7.999042532000033...,AuroraZamboanga del Sur,Zamboanga Del Sur,Aurora,620,AuroraZamboanga Del Sur,Aurora,Zamboanga del Sur,52995,AuroraZamboanga del Sur
1629,San Pablo,Zamboanga del Sur,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,San PabloZamboanga del Sur,Zamboanga Del Sur,San Pablo,123,San PabloZamboanga Del Sur,San Pablo,Zamboanga del Sur,26648,San PabloZamboanga del Sur
1630,Dumingag,Zamboanga del Sur,Region IX,POLYGON ((123.27956563500004 8.236521383000024...,DumingagZamboanga del Sur,Zamboanga Del Sur,Dumingag,298,DumingagZamboanga Del Sur,Dumingag,Zamboanga del Sur,48881,DumingagZamboanga del Sur


## Drop extra columns

In [24]:
final_df2 = final_df2.drop(['adm2_en', 'adm3_en', 'territory_2', 'territory', 'territory_3'], axis=1)
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,adm1_en,geometry,province,municipality,covid_cases,municipality.1,province.1,population
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,126,Danglas,Abra,4074
1,Cordillera Administrative Region,POLYGON ((120.72760314700008 17.66080787500004...,Abra,Dolores,399,Dolores,Abra,11512
2,Cordillera Administrative Region,"POLYGON ((120.6814333860001 17.39223621900004,...",Abra,Luba,232,Luba,Abra,6518
3,Cordillera Administrative Region,POLYGON ((120.70875978400011 17.71055038000003...,Abra,La Paz,261,La Paz,Abra,16493
4,Cordillera Administrative Region,POLYGON ((120.6087521930001 17.625961226000072...,Abra,Bangued,1082,Bangued,Abra,50382
...,...,...,...,...,...,...,...,...
1627,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,122,Pitogo,Zamboanga del Sur,27516
1628,Region IX,POLYGON ((123.54540352700008 7.999042532000033...,Zamboanga Del Sur,Aurora,620,Aurora,Zamboanga del Sur,52995
1629,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,123,San Pablo,Zamboanga del Sur,26648
1630,Region IX,POLYGON ((123.27956563500004 8.236521383000024...,Zamboanga Del Sur,Dumingag,298,Dumingag,Zamboanga del Sur,48881


## Rename columns

We are renaming the columns so that we can drop redundant ones with the same names.

In [25]:
final_df2.columns= ['adm1_en', 'geometry', 'province', 'municipality', 'covid_cases', 'municipality2', 'province2', 'population']
final_df2 = final_df2.drop(['municipality2', 'province2'], axis=1)
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,adm1_en,geometry,province,municipality,covid_cases,population
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,126,4074
1,Cordillera Administrative Region,POLYGON ((120.72760314700008 17.66080787500004...,Abra,Dolores,399,11512
2,Cordillera Administrative Region,"POLYGON ((120.6814333860001 17.39223621900004,...",Abra,Luba,232,6518
3,Cordillera Administrative Region,POLYGON ((120.70875978400011 17.71055038000003...,Abra,La Paz,261,16493
4,Cordillera Administrative Region,POLYGON ((120.6087521930001 17.625961226000072...,Abra,Bangued,1082,50382
...,...,...,...,...,...,...
1627,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,122,27516
1628,Region IX,POLYGON ((123.54540352700008 7.999042532000033...,Zamboanga Del Sur,Aurora,620,52995
1629,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,123,26648
1630,Region IX,POLYGON ((123.27956563500004 8.236521383000024...,Zamboanga Del Sur,Dumingag,298,48881


## Compute for population ratio

We do this by dividing the number of Covid-19 cases to total population per municipality and then multiply by 10,000. That would give us cases per 10,000 people in the area.

In [26]:
final_df2 ['case_per_pop'] = final_df2.covid_cases / final_df2.population * 10000
final_df2 = final_df2.round(1)
final_df2.head()

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,adm1_en,geometry,province,municipality,covid_cases,population,case_per_pop
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,126,4074,309.3
1,Cordillera Administrative Region,POLYGON ((120.72760314700008 17.66080787500004...,Abra,Dolores,399,11512,346.6
2,Cordillera Administrative Region,"POLYGON ((120.6814333860001 17.39223621900004,...",Abra,Luba,232,6518,355.9
3,Cordillera Administrative Region,POLYGON ((120.70875978400011 17.71055038000003...,Abra,La Paz,261,16493,158.2
4,Cordillera Administrative Region,POLYGON ((120.6087521930001 17.625961226000072...,Abra,Bangued,1082,50382,214.8
...,...,...,...,...,...,...,...
1627,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,122,27516,44.3
1628,Region IX,POLYGON ((123.54540352700008 7.999042532000033...,Zamboanga Del Sur,Aurora,620,52995,117.0
1629,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,123,26648,46.2
1630,Region IX,POLYGON ((123.27956563500004 8.236521383000024...,Zamboanga Del Sur,Dumingag,298,48881,61.0


## Create bins for cases

The bins will allow us to categorize the number of cases, necessary for mapping later.

In [34]:
final_df2['percentiles'] = pd.cut(np.array(final_df2['case_per_pop']),
       [0, 201, 401, 601, 801, 1001, 1201, 1401, 1601, 1801, 2000], labels=["0-200", "201-400", "401-600", "601-800", "801-1000", "1001-1200", "1201-1400", "1401-1600", "1601-1800", "1801-2000"])
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,adm1_en,geometry,province,municipality,covid_cases,population,case_per_pop,percentiles
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,126,4074,309.3,201-400
1,Cordillera Administrative Region,POLYGON ((120.72760314700008 17.66080787500004...,Abra,Dolores,399,11512,346.6,201-400
2,Cordillera Administrative Region,"POLYGON ((120.6814333860001 17.39223621900004,...",Abra,Luba,232,6518,355.9,201-400
3,Cordillera Administrative Region,POLYGON ((120.70875978400011 17.71055038000003...,Abra,La Paz,261,16493,158.2,0-200
4,Cordillera Administrative Region,POLYGON ((120.6087521930001 17.625961226000072...,Abra,Bangued,1082,50382,214.8,201-400
...,...,...,...,...,...,...,...,...
1627,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,122,27516,44.3,0-200
1628,Region IX,POLYGON ((123.54540352700008 7.999042532000033...,Zamboanga Del Sur,Aurora,620,52995,117.0,0-200
1629,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,123,26648,46.2,0-200
1630,Region IX,POLYGON ((123.27956563500004 8.236521383000024...,Zamboanga Del Sur,Dumingag,298,48881,61.0,0-200


**Additional step**: Convert the contents of the percentiles into string. Not doing so will not be read by the GEOJSON file.

In [35]:
final_df2.percentiles = final_df2.percentiles.astype(str)
final_df2

  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):
  iter(obj)  # Can iterate over it.
  len(obj)  # Has a length associated with it.
  s = iter(seq)
  for i in range(min(nitems, len(seq)))
  if nitems < len(seq):


Unnamed: 0,adm1_en,geometry,province,municipality,covid_cases,population,case_per_pop,percentiles
0,Cordillera Administrative Region,(POLYGON ((120.65653987700011 17.6457320440000...,Abra,Danglas,126,4074,309.3,201-400
1,Cordillera Administrative Region,POLYGON ((120.72760314700008 17.66080787500004...,Abra,Dolores,399,11512,346.6,201-400
2,Cordillera Administrative Region,"POLYGON ((120.6814333860001 17.39223621900004,...",Abra,Luba,232,6518,355.9,201-400
3,Cordillera Administrative Region,POLYGON ((120.70875978400011 17.71055038000003...,Abra,La Paz,261,16493,158.2,0-200
4,Cordillera Administrative Region,POLYGON ((120.6087521930001 17.625961226000072...,Abra,Bangued,1082,50382,214.8,201-400
...,...,...,...,...,...,...,...,...
1627,Region IX,(POLYGON ((123.32952880800008 7.36853027300003...,Zamboanga Del Sur,Pitogo,122,27516,44.3,0-200
1628,Region IX,POLYGON ((123.54540352700008 7.999042532000033...,Zamboanga Del Sur,Aurora,620,52995,117.0,0-200
1629,Region IX,(POLYGON ((123.45762937900008 7.61576258600007...,Zamboanga Del Sur,San Pablo,123,26648,46.2,0-200
1630,Region IX,POLYGON ((123.27956563500004 8.236521383000024...,Zamboanga Del Sur,Dumingag,298,48881,61.0,0-200


# Convert 'geometry' column back to a geopandas df

We lost that when we ran fuzzy pandas.

In [36]:
final_df2 = gpd.GeoDataFrame(final_df2, crs="EPSG:4326", geometry=final_df2.geometry)

# Save as GEOJSON file

In [37]:
final_df2.to_file('municipal.geojson', driver='GeoJSON')

  pd.Int64Index,


# Simplified file

So we were successful in combining geometry files with our dataset, but the file is too big. We, therefore, use [mapshaper](https://mapshaper.org/) to simplify the precision of the map so that we have a smaller map size.

Below is the simplified json file. 

In [38]:
simplified = gpd.read_file('municipal.json')
simplified

Unnamed: 0,adm1_en,province,municipality,covid_cases,population,case_per_pop,percentiles,geometry
0,Cordillera Administrative Region,Abra,Danglas,126,4074,309.3,201-400,"POLYGON ((120.59757 17.67373, 120.60570 17.676..."
1,Cordillera Administrative Region,Abra,Dolores,399,11512,346.6,201-400,"POLYGON ((120.77803 17.66674, 120.76612 17.666..."
2,Cordillera Administrative Region,Abra,Luba,232,6518,355.9,201-400,"POLYGON ((120.68189 17.39219, 120.67767 17.391..."
3,Cordillera Administrative Region,Abra,La Paz,261,16493,158.2,0-200,"POLYGON ((120.70876 17.71055, 120.70434 17.710..."
4,Cordillera Administrative Region,Abra,Bangued,1082,50382,214.8,201-400,"POLYGON ((120.61921 17.62751, 120.61649 17.625..."
...,...,...,...,...,...,...,...,...
1627,Region IX,Zamboanga Del Sur,Pitogo,122,27516,44.3,0-200,"MULTIPOLYGON (((123.32876 7.37492, 123.32776 7..."
1628,Region IX,Zamboanga Del Sur,Aurora,620,52995,117.0,0-200,"POLYGON ((123.64055 7.98886, 123.63926 7.98924..."
1629,Region IX,Zamboanga Del Sur,San Pablo,123,26648,46.2,0-200,"MULTIPOLYGON (((123.45763 7.61576, 123.45682 7..."
1630,Region IX,Zamboanga Del Sur,Dumingag,298,48881,61.0,0-200,"POLYGON ((123.38109 8.22467, 123.37773 8.22477..."


## Convert to GEOJSON

In [39]:
simplified.to_file('simplified_municipalities.geojson', driver='GeoJSON')

  pd.Int64Index,


In [40]:
simplified

Unnamed: 0,adm1_en,province,municipality,covid_cases,population,case_per_pop,percentiles,geometry
0,Cordillera Administrative Region,Abra,Danglas,126,4074,309.3,201-400,"POLYGON ((120.59757 17.67373, 120.60570 17.676..."
1,Cordillera Administrative Region,Abra,Dolores,399,11512,346.6,201-400,"POLYGON ((120.77803 17.66674, 120.76612 17.666..."
2,Cordillera Administrative Region,Abra,Luba,232,6518,355.9,201-400,"POLYGON ((120.68189 17.39219, 120.67767 17.391..."
3,Cordillera Administrative Region,Abra,La Paz,261,16493,158.2,0-200,"POLYGON ((120.70876 17.71055, 120.70434 17.710..."
4,Cordillera Administrative Region,Abra,Bangued,1082,50382,214.8,201-400,"POLYGON ((120.61921 17.62751, 120.61649 17.625..."
...,...,...,...,...,...,...,...,...
1627,Region IX,Zamboanga Del Sur,Pitogo,122,27516,44.3,0-200,"MULTIPOLYGON (((123.32876 7.37492, 123.32776 7..."
1628,Region IX,Zamboanga Del Sur,Aurora,620,52995,117.0,0-200,"POLYGON ((123.64055 7.98886, 123.63926 7.98924..."
1629,Region IX,Zamboanga Del Sur,San Pablo,123,26648,46.2,0-200,"MULTIPOLYGON (((123.45763 7.61576, 123.45682 7..."
1630,Region IX,Zamboanga Del Sur,Dumingag,298,48881,61.0,0-200,"POLYGON ((123.38109 8.22467, 123.37773 8.22477..."
