# Data Wrangling — ISPU Series

In [193]:
import pandas as pd
import numpy as np

## ISPU 2020

### Setup

In [218]:
# Read the raw 2020 ISPU export; the file is semicolon-delimited.
ispu_2020 = pd.read_csv('ispu_2020.csv', delimiter=';')
# Bare last expression so the notebook renders the frame.
ispu_2020

Unnamed: 0,periode_data,tanggal,pm10,so2,co,o3,no2,max,critical,categori,lokasi_spku
0,202005,01/05/20,55,26,21,83,10,83,O3,SEDANG,DKI5
1,202005,02/05/20,36,22,12,80,9,80,O3,SEDANG,DKI5
2,202005,03/05/20,65,34,35,49,11,65,PM10,SEDANG,DKI5
3,202005,04/05/20,53,21,14,75,8,75,O3,SEDANG,DKI5
4,202005,05/05/20,53,22,18,73,9,73,O3,SEDANG,DKI5
...,...,...,...,...,...,...,...,...,...,...,...
361,202010,27/10/20,52,54,17,34,30,70,PM25,SEDANG,DKI4
362,202010,28/10/20,66,54,29,32,33,89,PM25,SEDANG,DKI5
363,202010,29/10/20,57,55,15,35,26,87,PM25,SEDANG,DKI4
364,202010,30/10/20,63,55,18,34,19,108,PM25,TIDAK SEHAT,DKI4


### EDAs

In [219]:
# Check the dtype and non-null count of each column.
# Columns reported as `object` hold non-numeric values.

ispu_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   periode_data  366 non-null    int64 
 1   tanggal       366 non-null    object
 2   pm10          366 non-null    int64 
 3   so2           366 non-null    object
 4   co            366 non-null    object
 5   o3            366 non-null    int64 
 6   no2           366 non-null    int64 
 7   max           366 non-null    int64 
 8   critical      365 non-null    object
 9   categori      366 non-null    object
 10  lokasi_spku   366 non-null    object
dtypes: int64(5), object(6)
memory usage: 31.6+ KB


In [220]:
# Number of missing (NaN) values per column, listed in column order.

ispu_2020.isnull().sum()

periode_data    0
tanggal         0
pm10            0
so2             0
co              0
o3              0
no2             0
max             0
critical        1
categori        0
lokasi_spku     0
dtype: int64

In [221]:
# Distinct raw values in `co`, to find what keeps the column from being numeric.
pd.unique(ispu_2020['co'])

array(['21', '12', '35', '14', '18', '22', '27', '20', '17', '15', '16',
       '26', '36', '28', '19', '29', '10', '13', '9', '11', '25', '39',
       '58', '41', '40', '71', '43', '31', '34', '51', '42', '---', '32',
       '52', '38', '46', '59', '63', '62', '33', '5', '6', '23', '37',
       '24', '53', '48', '50', '57', '30', '55', '75', '8', '7'],
      dtype=object)

In [222]:
# Rows in which at least one column holds the '---' placeholder.
ispu_2020.loc[(ispu_2020 == '---').any(axis='columns')]

Unnamed: 0,periode_data,tanggal,pm10,so2,co,o3,no2,max,critical,categori,lokasi_spku
84,202003,02/03/20,22,---,35,213,11,213,4,O3,SANGAT TIDAK SEHAT
85,202003,03/03/20,22,---,---,201,15,201,4,O3,SANGAT TIDAK SEHAT
86,202003,04/03/20,54,---,39,152,13,152,4,O3,TIDAK SEHAT
87,202003,05/03/20,59,---,32,206,5,206,4,O3,SANGAT TIDAK SEHAT
102,202003,20/03/20,69,---,33,99,7,99,4,O3,SEDANG
103,202003,21/03/20,51,---,11,83,8,83,4,O3,SEDANG
104,202003,22/03/20,42,---,15,84,6,84,4,O3,SEDANG
105,202003,23/03/20,27,---,5,71,7,71,4,O3,SEDANG
106,202003,24/03/20,52,---,14,105,7,105,4,O3,TIDAK SEHAT
107,202003,25/03/20,70,---,25,93,11,93,4,O3,SEDANG


In [223]:
# Distinct values in `critical` (expected to be pollutant names).
pd.unique(ispu_2020['critical'])

array(['O3', 'PM10', 'PM25', '4', 'CO', 'SO2', nan], dtype=object)

In [224]:
# Distinct values in `categori` (expected to be air-quality labels).
pd.unique(ispu_2020['categori'])

array(['SEDANG', 'TIDAK SEHAT', 'BAIK', 'O3'], dtype=object)

In [225]:
# Rows labelled BAIK, as a reference for what well-formed rows look like.
ispu_2020.loc[ispu_2020['categori'].eq('BAIK')]

Unnamed: 0,periode_data,tanggal,pm10,so2,co,o3,no2,max,critical,categori,lokasi_spku
24,202005,25/05/20,50,25,15,44,9,50,PM10,BAIK,DKI4
53,202001,01/01/20,38,36,25,46,9,46,O3,BAIK,DKI5
62,202001,10/01/20,44,37,27,47,9,47,O3,BAIK,DKI5
64,202001,12/01/20,43,38,17,46,5,46,O3,BAIK,DKI5
66,202001,14/01/20,40,32,17,48,9,48,O3,BAIK,DKI4
163,202006,03/06/20,47,21,14,48,11,48,O3,BAIK,DKI5
223,202002,09/02/20,48,30,35,50,12,50,O3,BAIK,DKI5


In [226]:
# Rows whose `categori` holds the pollutant name 'O3' instead of a quality label.
ispu_2020.loc[ispu_2020['categori'].eq('O3')]

Unnamed: 0,periode_data,tanggal,pm10,so2,co,o3,no2,max,critical,categori,lokasi_spku
84,202003,02/03/20,22,---,35,213,11,213,4,O3,SANGAT TIDAK SEHAT
85,202003,03/03/20,22,---,---,201,15,201,4,O3,SANGAT TIDAK SEHAT
86,202003,04/03/20,54,---,39,152,13,152,4,O3,TIDAK SEHAT
87,202003,05/03/20,59,---,32,206,5,206,4,O3,SANGAT TIDAK SEHAT
88,202003,06/03/20,24,17,52,95,3,95,4,O3,SEDANG
89,202003,07/03/20,34,27,40,54,7,54,4,O3,SEDANG
90,202003,08/03/20,30,24,40,58,5,58,4,O3,SEDANG
91,202003,09/03/20,52,22,38,59,7,59,4,O3,SEDANG
92,202003,10/03/20,32,21,39,43,4,43,4,O3,BAIK
93,202003,11/03/20,25,20,36,41,4,41,4,O3,BAIK


In [227]:
# Sanity check: are there rows with a valid `categori` (SEDANG) whose
# `lokasi_spku` nevertheless holds an air-quality label?
ispu_2020.loc[
    ispu_2020['categori'].eq('SEDANG')
    & ispu_2020['lokasi_spku'].isin(['TIDAK SEHAT', 'SEDANG', 'BAIK', 'SANGAT TIDAK SEHAT'])
]

Unnamed: 0,periode_data,tanggal,pm10,so2,co,o3,no2,max,critical,categori,lokasi_spku


In [228]:
# Does 'SANGAT TIDAK SEHAT' ever appear in `categori` itself?
ispu_2020.loc[ispu_2020['categori'].eq('SANGAT TIDAK SEHAT')]

Unnamed: 0,periode_data,tanggal,pm10,so2,co,o3,no2,max,critical,categori,lokasi_spku


In [229]:
# Fully duplicated rows (every occurrence after the first).
ispu_2020.loc[ispu_2020.duplicated(keep='first')]

Unnamed: 0,periode_data,tanggal,pm10,so2,co,o3,no2,max,critical,categori,lokasi_spku


### Steps

The exploratory analysis above surfaced a few anomalies:

- `periode_data` can be deleted since we have `tanggal`
- `tanggal` should be in datetime format
- `so2` and `co` are not in numeric format
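- Several rows have shifted columns: `critical` holds `4`, `categori` holds `O3`, and `lokasi_spku` holds an air-quality label (e.g. `SEDANG`) instead of a station code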
- There is a single `NaN` in the `critical` column
- Several `so2` rows (and at least one `co` row) contain `---`, which prevents those columns from being numeric

Changes needed:

- Delete `periode_data`
- Convert `tanggal` to datetime format
- Remove rows with anomalous values in `lokasi_spku` since they are not recoverable (including rows with `---` values in the `so2` column)
- Remove the row with the `NaN` value in the `critical` column
- Convert `so2` and `co` to integers
- Rename `categori` to `category`

In [230]:
# Clean the raw frame. Every step is written so that this cell can be re-run on
# an already-cleaned `ispu_2020` without raising.

# Rename `categori` to `category` for aesthetic purposes. This happens first so
# the rest of the cell uses a single column name; `rename` ignores keys that are
# absent, so a second run is a no-op instead of a KeyError further down.
ispu_2020 = ispu_2020.rename(columns={'categori': 'category'})

# Delete `periode_data` (redundant with `tanggal`); errors='ignore' skips the
# drop when the column is already gone.
ispu_2020 = ispu_2020.drop(columns='periode_data', errors='ignore')

# Convert `tanggal` to datetime. The explicit day/month/2-digit-year format
# fully determines parsing, so no `dayfirst` hint is needed.
if not pd.api.types.is_datetime64_any_dtype(ispu_2020['tanggal']):
    ispu_2020['tanggal'] = pd.to_datetime(ispu_2020['tanggal'], format='%d/%m/%y')

# Rows that cannot be recovered:
# - shifted rows, recognisable by the pollutant name 'O3' in the category column
#   (their `lokasi_spku` holds an air-quality label instead of a station code);
# - rows with the '---' placeholder in `so2` or `co`;
# - rows with a missing `critical` value (only this column is checked, so a NaN
#   in an unrelated column no longer drops the row).
is_shifted = ispu_2020['category'].eq('O3')
has_placeholder = ispu_2020[['so2', 'co']].eq('---').any(axis=1)
is_missing_critical = ispu_2020['critical'].isna()
ispu_2020 = ispu_2020.loc[
    ~(is_shifted | has_placeholder | is_missing_critical)
].reset_index(drop=True)

# Convert `so2` and `co` to int64 (explicit width, independent of the platform
# default for `int`). Any other non-numeric token still raises here on purpose.
ispu_2020 = ispu_2020.astype({'so2': 'int64', 'co': 'int64'})

ispu_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334 entries, 0 to 333
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   tanggal      334 non-null    datetime64[ns]
 1   pm10         334 non-null    int64         
 2   so2          334 non-null    int64         
 3   co           334 non-null    int64         
 4   o3           334 non-null    int64         
 5   no2          334 non-null    int64         
 6   max          334 non-null    int64         
 7   critical     334 non-null    object        
 8   category     334 non-null    object        
 9   lokasi_spku  334 non-null    object        
dtypes: datetime64[ns](1), int64(6), object(3)
memory usage: 26.2+ KB


In [232]:
# Persist the cleaned frame as a comma-separated file.
# NOTE(review): `index` is left at its default, so the row index is written as an
# unnamed first column; pass index=False if downstream readers do not expect it.
ispu_2020.to_csv(path_or_buf='ispu_2020_cleaned.csv')