# Cleaning Covid data for Mapbox: regions

**Background**: We use Covid-19 cases data in the Philippines from data from the health department and shapefile processed through geopandas to create an interactive map. 

**Tools**: pandas, geopandas, Mapbox

Updated: January 21, 2023

# Do your imports

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import re
pd.set_option('display.max_columns', None)

# Read your CSV

In [2]:
df= pd.read_csv('regions.csv')
df

Unnamed: 0,Regions,Covid-19 cases
0,National Capital Region,1288026
1,Region IV-A,716741
2,Region III,394854
3,Region VI,212638
4,Region VII,205389
5,Region II,171107
6,Region XI,151754
7,Region I,143630
8,Cordillera Administrative Region,127074
9,Region X,111610


## Lowercase column headers

In [3]:
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,regions,covid-19 cases
0,National Capital Region,1288026
1,Region IV-A,716741
2,Region III,394854
3,Region VI,212638
4,Region VII,205389


In [4]:
df= df.rename(columns={"covid-19 cases": "covid_cases"})
df.head(18)

Unnamed: 0,regions,covid_cases
0,National Capital Region,1288026
1,Region IV-A,716741
2,Region III,394854
3,Region VI,212638
4,Region VII,205389
5,Region II,171107
6,Region XI,151754
7,Region I,143630
8,Cordillera Administrative Region,127074
9,Region X,111610


# Geopandas

## Read through file

In [5]:
region_shape = gpd.read_file('maps/regions.zip')
region_shape

Unnamed: 0,Shape_Leng,Shape_Area,ADM1_EN,ADM1_PCODE,ADM1_REF,ADM1ALT1EN,ADM1ALT2EN,ADM0_EN,ADM0_PCODE,date,validOn,validTo,geometry
0,53.623497,1.050272,Bangsamoro Autonomous Region in Muslim Mindanao,PH150000000,,BARMM,,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4..."
1,8.027454,1.546712,Cordillera Administrative Region,PH140000000,,CAR,,Philippines (the),PH,2016-06-30,2020-05-29,,"POLYGON ((121.22208 18.50058, 121.22086 18.483..."
2,2.320234,0.050216,National Capital Region,PH130000000,,NCR,,Philippines (the),PH,2016-06-30,2020-05-29,,"POLYGON ((121.03842 14.78525, 121.03876 14.785..."
3,14.995101,1.043983,Region I,PH010000000,,Ilocos Region,,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((119.86596 15.81539, 119.86597 ..."
4,19.139048,2.241812,Region II,PH020000000,,Cagayan Valley,,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((122.46667 16.92135, 122.46674 ..."
5,15.949563,1.793513,Region III,PH030000000,,Central Luzon,,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((120.11687 14.76309, 120.11689 ..."
6,27.624115,1.32671,Region IV-A,PH040000000,,Calabarzon,,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((122.72165 13.36485, 122.72181 ..."
7,78.804542,2.220374,Region IV-B,PH170000000,,Mimaropa,,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((117.31260 7.50671, 117.31249 7..."
8,23.181441,1.196677,Region IX,PH090000000,,Zamboanga Peninsula,,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((121.88379 6.69138, 121.88380 6..."
9,44.923243,1.446324,Region V,PH050000000,,Bicol Region,,Philippines (the),PH,2016-06-30,2020-05-29,,"MULTIPOLYGON (((122.98823 11.73079, 122.98824 ..."


## Clean regional names in the dataset

This is so they would match the names in the shapefile for merging later.

In [17]:
df.regions = df.regions.str.replace("NCR", "National Capital Region", regex=False)
df.regions = df.regions.str.replace("BARMM", "Bangsamoro Autonomous Region in Muslim Mindanao", regex=False)
df.regions = df.regions.str.replace("CAR", "Cordillera Administrative Region", regex=False)
df.regions = df.regions.str.replace("Cordillera Administrative RegionAGA", "Region XIII", regex=False)
df.regions = df.regions.str.replace(r'[:].*$', "", regex=True)
df.regions

0                             National Capital Region
1                                         Region IV-A
2                                          Region III
3                                           Region VI
4                                          Region VII
5                                           Region II
6                                           Region XI
7                                            Region I
8                    Cordillera Administrative Region
9                                            Region X
10                                         Region XII
11                                           Region V
12                                          Region IX
13                                        Region VIII
14                                        Region XIII
15                                        Region IV-B
16                                                ROF
17    Bangsamoro Autonomous Region in Muslim Mindanao
18                          

## Clean the shapefile

### Drop unnecessary columns from shapefile

In [6]:
region_shape = region_shape.drop(['ADM0_PCODE', 'ADM0_EN','date','validOn','validTo', 'ADM1_PCODE', 'ADM1ALT2EN', 'ADM1_REF'], axis=1)
region_shape

Unnamed: 0,Shape_Leng,Shape_Area,ADM1_EN,ADM1ALT1EN,geometry
0,53.623497,1.050272,Bangsamoro Autonomous Region in Muslim Mindanao,BARMM,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4..."
1,8.027454,1.546712,Cordillera Administrative Region,CAR,"POLYGON ((121.22208 18.50058, 121.22086 18.483..."
2,2.320234,0.050216,National Capital Region,NCR,"POLYGON ((121.03842 14.78525, 121.03876 14.785..."
3,14.995101,1.043983,Region I,Ilocos Region,"MULTIPOLYGON (((119.86596 15.81539, 119.86597 ..."
4,19.139048,2.241812,Region II,Cagayan Valley,"MULTIPOLYGON (((122.46667 16.92135, 122.46674 ..."
5,15.949563,1.793513,Region III,Central Luzon,"MULTIPOLYGON (((120.11687 14.76309, 120.11689 ..."
6,27.624115,1.32671,Region IV-A,Calabarzon,"MULTIPOLYGON (((122.72165 13.36485, 122.72181 ..."
7,78.804542,2.220374,Region IV-B,Mimaropa,"MULTIPOLYGON (((117.31260 7.50671, 117.31249 7..."
8,23.181441,1.196677,Region IX,Zamboanga Peninsula,"MULTIPOLYGON (((121.88379 6.69138, 121.88380 6..."
9,44.923243,1.446324,Region V,Bicol Region,"MULTIPOLYGON (((122.98823 11.73079, 122.98824 ..."


In [7]:
region_shape= region_shape.rename(columns={"ADM1_EN": "regions"})

## Merge data 

In [8]:
regions_cases = region_shape.merge(df, on='regions')
regions_cases.head()

Unnamed: 0,Shape_Leng,Shape_Area,regions,ADM1ALT1EN,geometry,covid_cases
0,53.623497,1.050272,Bangsamoro Autonomous Region in Muslim Mindanao,BARMM,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4...",27488
1,8.027454,1.546712,Cordillera Administrative Region,CAR,"POLYGON ((121.22208 18.50058, 121.22086 18.483...",127074
2,2.320234,0.050216,National Capital Region,NCR,"POLYGON ((121.03842 14.78525, 121.03876 14.785...",1288026
3,14.995101,1.043983,Region I,Ilocos Region,"MULTIPOLYGON (((119.86596 15.81539, 119.86597 ...",143630
4,19.139048,2.241812,Region II,Cagayan Valley,"MULTIPOLYGON (((122.46667 16.92135, 122.46674 ...",171107


## Read and merge with population data

In [9]:
df2 = pd.read_excel('population.xlsx')
df2.head()

Unnamed: 0,regions,population
0,National Capital Region,12877253
1,Cordillera Administrative Region,1797660
2,Region I,5026128
3,Region II,3685744
4,Region III,12422172


In [10]:
regions_final = regions_cases.merge(df2, on='regions')
regions_final.head()

Unnamed: 0,Shape_Leng,Shape_Area,regions,ADM1ALT1EN,geometry,covid_cases,population
0,53.623497,1.050272,Bangsamoro Autonomous Region in Muslim Mindanao,BARMM,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4...",27488,4404288
1,8.027454,1.546712,Cordillera Administrative Region,CAR,"POLYGON ((121.22208 18.50058, 121.22086 18.483...",127074,1797660
2,2.320234,0.050216,National Capital Region,NCR,"POLYGON ((121.03842 14.78525, 121.03876 14.785...",1288026,12877253
3,14.995101,1.043983,Region I,Ilocos Region,"MULTIPOLYGON (((119.86596 15.81539, 119.86597 ...",143630,5026128
4,19.139048,2.241812,Region II,Cagayan Valley,"MULTIPOLYGON (((122.46667 16.92135, 122.46674 ...",171107,3685744


## Compute for population ratio

We do this by dividing the number of Covid-19 cases to total population per region and then multiply by 100,000. That would give us cases per 100,000 people in the area.

In [11]:
regions_final ['case_per_pop'] = regions_final.covid_cases / regions_final.population * 100000
regions_final = regions_final.round(1)

## Create bins for cases

The bins will allow us to categorize the number of cases, necessary for mapping later.

In [12]:
regions_final['percentiles'] = pd.cut(np.array(regions_final['case_per_pop']),
   [0, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001, 10001, 11000], labels=["0-1000", "1001-2000", "2001-3000", "3001-4000", "4001-5000", "5001-6000", "6001-7000", "7001-8000", "8001-9000", "9001-10000", "10001-11000"])

regions_final.head()

Unnamed: 0,Shape_Leng,Shape_Area,regions,ADM1ALT1EN,geometry,covid_cases,population,case_per_pop,percentiles
0,53.6,1.1,Bangsamoro Autonomous Region in Muslim Mindanao,BARMM,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4...",27488,4404288,624.1,0-1000
1,8.0,1.5,Cordillera Administrative Region,CAR,"POLYGON ((121.22208 18.50058, 121.22086 18.483...",127074,1797660,7068.9,7001-8000
2,2.3,0.1,National Capital Region,NCR,"POLYGON ((121.03842 14.78525, 121.03876 14.785...",1288026,12877253,10002.3,10001-11000
3,15.0,1.0,Region I,Ilocos Region,"MULTIPOLYGON (((119.86596 15.81539, 119.86597 ...",143630,5026128,2857.7,2001-3000
4,19.1,2.2,Region II,Cagayan Valley,"MULTIPOLYGON (((122.46667 16.92135, 122.46674 ...",171107,3685744,4642.4,4001-5000


In [13]:
regions_final.dtypes

Shape_Leng       float64
Shape_Area       float64
regions           object
ADM1ALT1EN        object
geometry        geometry
covid_cases        int64
population         int64
case_per_pop     float64
percentiles     category
dtype: object

**Additional step**: Convert the contents of the percentiles into string. Not doing so will not be read by the GEOJSON file.

In [14]:
regions_final.percentiles = regions_final.percentiles.astype(str)
regions_final.dtypes

Shape_Leng       float64
Shape_Area       float64
regions           object
ADM1ALT1EN        object
geometry        geometry
covid_cases        int64
population         int64
case_per_pop     float64
percentiles       object
dtype: object

# Save as GEOJSON file

In [15]:
regions_final.to_file('regions_cases.geojson', driver='GeoJSON')

# Simplified file

So we were successful in combining geometry files with our dataset, but the file is too big. We, therefore, use [mapshaper](https://mapshaper.org/) to simplify the precision of the map so that we have a smaller map size.

Below is the simplified json file. 

In [16]:
simplified_regions = gpd.read_file('regions_cases.json')
simplified_regions

Unnamed: 0,Shape_Leng,Shape_Area,regions,ADM1ALT1EN,covid_cases,population,case_per_pop,percentiles,geometry
0,53.6,1.1,Bangsamoro Autonomous Region in Muslim Mindanao,BARMM,27488,4404288,624.1,0-1000,"MULTIPOLYGON (((119.46876 4.59360, 119.46807 4..."
1,8.0,1.5,Cordillera Administrative Region,CAR,127074,1797660,7068.9,7001-8000,"POLYGON ((120.76897 16.19803, 120.77405 16.214..."
2,2.3,0.1,National Capital Region,NCR,1288026,12877253,10002.3,10001-11000,"POLYGON ((121.09951 14.76921, 121.09934 14.770..."
3,15.0,1.0,Region I,Ilocos Region,143630,5026128,2857.7,2001-3000,"MULTIPOLYGON (((119.86610 15.81258, 119.86652 ..."
4,19.1,2.2,Region II,Cagayan Valley,171107,3685744,4642.4,4001-5000,"MULTIPOLYGON (((122.46667 16.92135, 122.46617 ..."
5,15.9,1.8,Region III,Central Luzon,394854,12422172,3178.6,3001-4000,"MULTIPOLYGON (((120.11687 14.76309, 120.11656 ..."
6,27.6,1.3,Region IV-A,Calabarzon,716741,16195042,4425.7,4001-5000,"MULTIPOLYGON (((122.72165 13.36485, 122.72125 ..."
7,78.8,2.2,Region IV-B,Mimaropa,47357,3228558,1466.8,1001-2000,"MULTIPOLYGON (((117.31260 7.50671, 117.31306 7..."
8,23.2,1.2,Region IX,Zamboanga Peninsula,69546,3875576,1794.5,1001-2000,"MULTIPOLYGON (((121.88379 6.69138, 121.88295 6..."
9,44.9,1.4,Region V,Bicol Region,71735,6082165,1179.4,1001-2000,"MULTIPOLYGON (((122.98823 11.73079, 122.98763 ..."


## Convert to GEOJSON

In [17]:
simplified_regions.to_file('simplified_regions.geojson', driver='GeoJSON')

## Fix the CSV file of cases for uploading to site

In [22]:
simplified_regions = simplified_regions.drop(['Shape_Leng', 'Shape_Area', 'population', 'percentiles', 'geometry'], axis=1)

KeyError: "['Shape_Leng', 'Shape_Area', 'population', 'percentiles', 'geometry'] not found in axis"

In [23]:
simplified_regions

Unnamed: 0,regions,ADM1ALT1EN,covid_cases,case_per_pop
0,Bangsamoro Autonomous Region in Muslim Mindanao,BARMM,27488,624.1
1,Cordillera Administrative Region,CAR,127074,7068.9
2,National Capital Region,NCR,1288026,10002.3
3,Region I,Ilocos Region,143630,2857.7
4,Region II,Cagayan Valley,171107,4642.4
5,Region III,Central Luzon,394854,3178.6
6,Region IV-A,Calabarzon,716741,4425.7
7,Region IV-B,Mimaropa,47357,1466.8
8,Region IX,Zamboanga Peninsula,69546,1794.5
9,Region V,Bicol Region,71735,1179.4


In [24]:
simplified_regions.columns =['Regions', 'Region name', 'Covid-19 cases', 'Case per population']
simplified_regions

Unnamed: 0,Regions,Region name,Covid-19 cases,Case per population
0,Bangsamoro Autonomous Region in Muslim Mindanao,BARMM,27488,624.1
1,Cordillera Administrative Region,CAR,127074,7068.9
2,National Capital Region,NCR,1288026,10002.3
3,Region I,Ilocos Region,143630,2857.7
4,Region II,Cagayan Valley,171107,4642.4
5,Region III,Central Luzon,394854,3178.6
6,Region IV-A,Calabarzon,716741,4425.7
7,Region IV-B,Mimaropa,47357,1466.8
8,Region IX,Zamboanga Peninsula,69546,1794.5
9,Region V,Bicol Region,71735,1179.4


In [25]:
simplified_regions.to_csv('regions.csv', index=False)