# Cleaning Covid data for Mapbox: regions

**Background**: We combine Covid-19 case data for the Philippines from the health department with a shapefile processed through geopandas to create an interactive map.

**Tools**: pandas, geopandas, Mapbox

Updated: June 18, 2022

# Do your imports

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import re
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Read your CSV

In [2]:
# Load the per-region Covid-19 case counts into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='regions.csv')
df

Unnamed: 0.1,Unnamed: 0,RegionRes
0,NCR,1176864
1,Region IV-A: CALABARZON,657098
2,Region III: Central Luzon,362611
3,Region VI: Western Visayas,195390
4,Region VII: Central Visayas,193507
5,Region II: Cagayan Valley,162455
6,Region XI: Davao Region,140815
7,Region I: Ilocos Region,133881
8,CAR,119251
9,Region X: Northern Mindanao,106284


## Lowercase column headers

In [3]:
# Normalise the header names to lowercase so later cells can refer to them consistently.
df.columns = df.columns.map(str.lower)
df.head()

Unnamed: 0,unnamed: 0,regionres
0,NCR,1176864
1,Region IV-A: CALABARZON,657098
2,Region III: Central Luzon,362611
3,Region VI: Western Visayas,195390
4,Region VII: Central Visayas,193507


In [4]:
# Give both columns descriptive names in a single rename call.
df = df.rename(
    columns={
        "unnamed: 0": "regions",
        "regionres": "covid_cases",
    }
)
df.head()

Unnamed: 0,regions,covid_cases
0,NCR,1176864
1,Region IV-A: CALABARZON,657098
2,Region III: Central Luzon,362611
3,Region VI: Western Visayas,195390
4,Region VII: Central Visayas,193507


# Geopandas

## Read through file

In [5]:
# Read the regional boundaries from regions.zip into a GeoDataFrame and display it.
region_shape = gpd.read_file(filename='regions.zip')
region_shape

Unnamed: 0,ADM1_EN,ADM1_PCODE,ADM_ID,UPDATED,PERIMETER,AREA,LONGITUDE,LATITUDE,geometry
0,Region VII,PH070000000,70000000,2019-12-31,3205930.229,14293660000.0,123.615,9.921,"MULTIPOLYGON (((123.11764 9.64950, 123.11765 9..."
1,Region VI,PH060000000,60000000,2019-12-31,3033678.117,20042120000.0,122.651,10.844,"MULTIPOLYGON (((122.36748 9.83003, 122.36750 9..."
2,Bangsamoro Autonomous Region in Muslim Mindanao,PH150000000,150000000,2019-12-31,5932722.5,12834190000.0,123.367,6.947,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4..."
3,Cordillera Administrative Region,PH140000000,140000000,2019-12-31,873012.878,18192640000.0,121.039,17.355,"POLYGON ((121.22208 18.50058, 121.22086 18.483..."
4,National Capital Region,PH130000000,130000000,2019-12-31,253627.782,598650100.0,121.032,14.606,"POLYGON ((121.03842 14.78525, 121.03876 14.785..."
5,Region I,PH010000000,10000000,2019-12-31,1632853.395,12307350000.0,120.484,16.907,"MULTIPOLYGON (((119.86596 15.81539, 119.86597 ..."
6,Region II,PH020000000,20000000,2019-12-31,2071974.767,26387730000.0,121.732,17.207,"MULTIPOLYGON (((122.46667 16.92135, 122.46674 ..."
7,Region III,PH030000000,30000000,2019-12-31,1739247.236,21304160000.0,120.823,15.392,"MULTIPOLYGON (((120.11687 14.76309, 120.11689 ..."
8,Region IV-A,PH040000000,40000000,2019-12-31,3019032.066,15846520000.0,121.567,14.162,"MULTIPOLYGON (((122.72165 13.36485, 122.72181 ..."
9,Region IV-B,PH170000000,170000000,2019-12-31,8661995.346,26797820000.0,119.906,11.359,"MULTIPOLYGON (((117.31260 7.50671, 117.31249 7..."


## Clean regional names in the dataset

This is so that they match the names in the shapefile when we merge later.

In [6]:
# Short codes used in the case data, mapped to the names used in the shapefile.
# Matching is done on the whole value (not on substrings), so "CAR" can never
# clobber "CARAGA" and no follow-up patch for a corrupted name is needed.
region_name_fixes = {
    "NCR": "National Capital Region",
    "BARMM": "Bangsamoro Autonomous Region in Muslim Mindanao",
    "CAR": "Cordillera Administrative Region",
    "CARAGA": "Region XIII",
}

df['regions'] = (
    df['regions']
    .str.replace(r'[:].*$', "", regex=True)  # "Region IV-A: CALABARZON" -> "Region IV-A"
    .str.strip()                             # stray whitespace would break the merge key
    .replace(region_name_fixes)              # exact, whole-value replacements only
)
df['regions']

0                             National Capital Region
1                                         Region IV-A
2                                          Region III
3                                           Region VI
4                                          Region VII
5                                           Region II
6                                           Region XI
7                                            Region I
8                    Cordillera Administrative Region
9                                            Region X
10                                         Region XII
11                                          Region IX
12                                           Region V
13                                        Region VIII
14                                        Region XIII
15                                        Region IV-B
16                                                ROF
17    Bangsamoro Autonomous Region in Muslim Mindanao
18                          

## Clean the shapefile data

### Lowercase headers

In [7]:
# Lowercase every column name of the shapefile frame.
region_shape.columns = [column_name.lower() for column_name in region_shape.columns]

### Drop unnecessary columns from shapefile

In [8]:
# Keep only the region name and geometry; the remaining attributes are dropped.
unused_columns = ['updated', 'adm_id', 'adm1_pcode', 'area', 'perimeter', 'latitude', 'longitude']
region_shape = region_shape.drop(columns=unused_columns)
region_shape

Unnamed: 0,adm1_en,geometry
0,Region VII,"MULTIPOLYGON (((123.11764 9.64950, 123.11765 9..."
1,Region VI,"MULTIPOLYGON (((122.36748 9.83003, 122.36750 9..."
2,Bangsamoro Autonomous Region in Muslim Mindanao,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4..."
3,Cordillera Administrative Region,"POLYGON ((121.22208 18.50058, 121.22086 18.483..."
4,National Capital Region,"POLYGON ((121.03842 14.78525, 121.03876 14.785..."
5,Region I,"MULTIPOLYGON (((119.86596 15.81539, 119.86597 ..."
6,Region II,"MULTIPOLYGON (((122.46667 16.92135, 122.46674 ..."
7,Region III,"MULTIPOLYGON (((120.11687 14.76309, 120.11689 ..."
8,Region IV-A,"MULTIPOLYGON (((122.72165 13.36485, 122.72181 ..."
9,Region IV-B,"MULTIPOLYGON (((117.31260 7.50671, 117.31249 7..."


In [9]:
# Use the same key column name as the case data so both frames can be merged on it.
region_shape = region_shape.rename({"adm1_en": "regions"}, axis="columns")

## Merge data 

In [10]:
# Attach the case counts to each region's geometry. This is an inner join (the
# pandas default), so rows whose region name appears in only one frame are dropped.
regions_cases = region_shape.merge(df, how='inner', on='regions')
regions_cases.head()

Unnamed: 0,regions,geometry,covid_cases
0,Region VII,"MULTIPOLYGON (((123.11764 9.64950, 123.11765 9...",193507
1,Region VI,"MULTIPOLYGON (((122.36748 9.83003, 122.36750 9...",195390
2,Bangsamoro Autonomous Region in Muslim Mindanao,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4...",25895
3,Cordillera Administrative Region,"POLYGON ((121.22208 18.50058, 121.22086 18.483...",119251
4,National Capital Region,"POLYGON ((121.03842 14.78525, 121.03876 14.785...",1176864


## Read and merge with population data

In [11]:
# Load the per-region population figures from the Excel workbook.
df2 = pd.read_excel(io='population.xlsx')
df2.head()

Unnamed: 0,regions,population
0,National Capital Region,12877253
1,Cordillera Administrative Region,1797660
2,Region I,5026128
3,Region II,3685744
4,Region III,12422172


In [12]:
# Add the population column to the merged cases/geometry frame (inner join on the
# region name, which is the pandas default).
regions_final = regions_cases.merge(df2, how='inner', on='regions')
regions_final.head()

Unnamed: 0,regions,geometry,covid_cases,population
0,Region VII,"MULTIPOLYGON (((123.11764 9.64950, 123.11765 9...",193507,8081988
1,Region VI,"MULTIPOLYGON (((122.36748 9.83003, 122.36750 9...",195390,7536383
2,Bangsamoro Autonomous Region in Muslim Mindanao,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4...",25895,4404288
3,Cordillera Administrative Region,"POLYGON ((121.22208 18.50058, 121.22086 18.483...",119251,1797660
4,National Capital Region,"POLYGON ((121.03842 14.78525, 121.03876 14.785...",1176864,12877253


## Compute for population ratio

We do this by dividing the number of Covid-19 cases by the total population of each region and then multiplying by 100,000. That gives us the number of cases per 100,000 people in each region.

In [13]:
# Cases per 100,000 residents: (cases / population) * 100000.
cases_share = regions_final['covid_cases'] / regions_final['population']
regions_final['case_per_pop'] = cases_share * 100000
# Round the frame's values to one decimal place.
regions_final = regions_final.round(decimals=1)

## Create bins for cases

The bins will allow us to categorize the number of cases, necessary for mapping later.

In [14]:
# Bin edges sit on exact thousands and pd.cut's intervals are closed on the right,
# so every label matches its interval: "1001-2000" covers (1000, 2000], and so on.
# (Edges at 1001, 2001, ... would put a rate of exactly 1001 into "0-1000".)
# include_lowest=True keeps a rate of exactly 0 in the first bin instead of NaN.
# Rates above 10,000 fall outside the last edge and are still left as NaN.
bin_edges = list(range(0, 10001, 1000))
bin_labels = ["0-1000"] + [
    f"{low + 1}-{high}" for low, high in zip(bin_edges[1:-1], bin_edges[2:])
]

regions_final['percentiles'] = pd.cut(
    regions_final['case_per_pop'],
    bins=bin_edges,
    labels=bin_labels,
    include_lowest=True,
)

regions_final.head()

Unnamed: 0,regions,geometry,covid_cases,population,case_per_pop,percentiles
0,Region VII,"MULTIPOLYGON (((123.11764 9.64950, 123.11765 9...",193507,8081988,2394.3,2001-3000
1,Region VI,"MULTIPOLYGON (((122.36748 9.83003, 122.36750 9...",195390,7536383,2592.6,2001-3000
2,Bangsamoro Autonomous Region in Muslim Mindanao,"MULTIPOLYGON (((119.46876 4.59360, 119.46881 4...",25895,4404288,587.9,0-1000
3,Cordillera Administrative Region,"POLYGON ((121.22208 18.50058, 121.22086 18.483...",119251,1797660,6633.7,6001-7000
4,National Capital Region,"POLYGON ((121.03842 14.78525, 121.03876 14.785...",1176864,12877253,9139.1,9001-10000


In [15]:
# Inspect the column dtypes of the merged frame.
regions_final.dtypes

regions           object
geometry        geometry
covid_cases        int64
population         int64
case_per_pop     float64
percentiles     category
dtype: object

**Additional step**: Convert the contents of the `percentiles` column into strings. If we skip this step, the column will not be read in the GeoJSON file.

In [16]:
# Cast the categorical bin labels to plain strings, then re-check the dtypes.
regions_final['percentiles'] = regions_final['percentiles'].astype(str)
regions_final.dtypes

regions           object
geometry        geometry
covid_cases        int64
population         int64
case_per_pop     float64
percentiles       object
dtype: object

# Save as GEOJSON file

In [17]:
# Write the merged frame (geometry plus case data) to disk as GeoJSON.
regions_final.to_file(filename='regions_cases.geojson', driver='GeoJSON')

  pd.Int64Index,


# Simplified file

We successfully combined the geometry file with our dataset, but the resulting file is too big. We therefore use [mapshaper](https://mapshaper.org/) to reduce the precision of the map geometry so that we end up with a smaller file.

Below is the simplified json file. 

In [21]:
# Load the simplified JSON file back into a GeoDataFrame and display it.
simplified_regions = gpd.read_file(filename='regions_cases.json')
simplified_regions

Unnamed: 0,regions,covid_cases,population,case_per_pop,percentiles,geometry
0,Region VII,193507,8081988,2394.3,2001-3000,"MULTIPOLYGON (((123.27039 9.08460, 123.26843 9..."
1,Region VI,195390,7536383,2592.6,2001-3000,"MULTIPOLYGON (((122.36748 9.83003, 122.36521 9..."
2,Bangsamoro Autonomous Region in Muslim Mindanao,25895,4404288,587.9,0-1000,"MULTIPOLYGON (((119.46876 4.59360, 119.46798 4..."
3,Cordillera Administrative Region,119251,1797660,6633.7,6001-7000,"POLYGON ((120.76897 16.19803, 120.77448 16.216..."
4,National Capital Region,1176864,12877253,9139.1,9001-10000,"POLYGON ((121.09951 14.76921, 121.09936 14.771..."
5,Region I,133881,5026128,2663.7,2001-3000,"MULTIPOLYGON (((119.86610 15.81258, 119.86682 ..."
6,Region II,162455,3685744,4407.7,4001-5000,"MULTIPOLYGON (((122.46667 16.92135, 122.46723 ..."
7,Region III,362611,12422172,2919.1,2001-3000,"MULTIPOLYGON (((120.11687 14.76309, 120.11450 ..."
8,Region IV-A,657098,16195042,4057.4,4001-5000,"MULTIPOLYGON (((122.72165 13.36485, 122.71908 ..."
9,Region IV-B,44305,3228558,1372.3,1001-2000,"MULTIPOLYGON (((117.31260 7.50671, 117.31386 7..."


## Convert to GEOJSON

In [22]:
# Save the simplified frame as a GeoJSON file.
simplified_regions.to_file(filename='simplified_regions.geojson', driver='GeoJSON')

  pd.Int64Index,


In [23]:
# Display the simplified frame.
simplified_regions

Unnamed: 0,regions,covid_cases,population,case_per_pop,percentiles,geometry
0,Region VII,193507,8081988,2394.3,2001-3000,"MULTIPOLYGON (((123.27039 9.08460, 123.26843 9..."
1,Region VI,195390,7536383,2592.6,2001-3000,"MULTIPOLYGON (((122.36748 9.83003, 122.36521 9..."
2,Bangsamoro Autonomous Region in Muslim Mindanao,25895,4404288,587.9,0-1000,"MULTIPOLYGON (((119.46876 4.59360, 119.46798 4..."
3,Cordillera Administrative Region,119251,1797660,6633.7,6001-7000,"POLYGON ((120.76897 16.19803, 120.77448 16.216..."
4,National Capital Region,1176864,12877253,9139.1,9001-10000,"POLYGON ((121.09951 14.76921, 121.09936 14.771..."
5,Region I,133881,5026128,2663.7,2001-3000,"MULTIPOLYGON (((119.86610 15.81258, 119.86682 ..."
6,Region II,162455,3685744,4407.7,4001-5000,"MULTIPOLYGON (((122.46667 16.92135, 122.46723 ..."
7,Region III,362611,12422172,2919.1,2001-3000,"MULTIPOLYGON (((120.11687 14.76309, 120.11450 ..."
8,Region IV-A,657098,16195042,4057.4,4001-5000,"MULTIPOLYGON (((122.72165 13.36485, 122.71908 ..."
9,Region IV-B,44305,3228558,1372.3,1001-2000,"MULTIPOLYGON (((117.31260 7.50671, 117.31386 7..."
