# Exploratory Data Analysis

In [1]:
import numpy as np
import pandas as pd
import os
import time

from scipy import stats

## Ensuring reproducibility

In [2]:
# Single seed shared by the notebook; it is fed to NumPy's global random generator.
CUSTOM_SEED = 42
np.random.seed(seed=CUSTOM_SEED)

## Read Dataset

In [3]:
# Folder holding the raw CSV, relative to the notebook.
path = '../dataset/Air quality/'

# Column names are supplied explicitly, so every line of the file is read as data.
names = [
    "No", "year", "month", "day", "hour",
    "PM2.5", "PM10", "SO2", "NO2", "CO", "O3",
    "TEMP", "PRES", "DEWP", "RAIN", "wd", "WSPM", "station",
]

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference is what produces the
# "mixed types" DtypeWarning and leaves columns holding a mix of str and
# numeric objects.
dataset = pd.read_csv(path + 'Air quality.csv', names=names, low_memory=False)
dataset

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,5844,2013,10,30,11,41,49,19,51.1197,700,6,13.2,1022.4,-0.1,0,NE,1.4,Wanliu
1,27824,2016,5,3,7,15,26,2,11,300,72,15.5,993.5,-1.1,0,NW,3.7,Dingling
2,25841,2016,2,10,16,95,95,59,46,3100,61,9.3,1012.4,-12.6,0,ESE,1.4,Shunyi
3,26986,2016,3,29,9,10,36,12,34,500,45,15.7,1014.6,-6,0,WNW,0.8,Aotizhongxin
4,23848,2015,11,19,15,49,49,2,40,1700,2,4.6,1019.2,4,0.3,W,0.9,Wanliu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378697,7768,2014,1,18,15,105,112,38,77,1600,43,4.6,1027.4,-13.8,0,SSE,2.3,Nongzhanguan
378698,19377,2015,5,17,8,123,139,15,33,900,68,20.4,996.6,15,0,S,2.2,Dongsi
378699,17453,2015,2,26,4,8,12,2,11,400,,-1.1,1021.1,-20.2,0,N,2.2,Aotizhongxin
378700,4995,2013,9,25,2,3,8,2,24,200,46,9.2,1019,2.5,0,W,0.9,Nongzhanguan


## Dataset Preparation

### Convert Column Types

In [4]:
# Columns that must contain numbers, grouped by the Python type used to validate them.

int_columns = [
    "No",
    "year",
    "month",
    "day",
    "hour",
]
float_columns = [
    "PM2.5",
    "PM10",
    "SO2",
    "NO2",
    "CO",
    "O3",
    "TEMP",
    "PRES",
    "DEWP",
    "RAIN",
    "WSPM",
]


# utility function

def get_corrupted_rows(rows, expected_type):
    """Return the positions of the values that ``expected_type`` cannot convert.

    Parameters
    ----------
    rows : iterable
        Values to validate, e.g. the ``.values`` array of one DataFrame column.
    expected_type : callable
        Converter such as ``int`` or ``float``. It is called on every value
        and its result is discarded; only success or failure matters.

    Returns
    -------
    list of int
        Zero-based positions (not index labels) of the values whose conversion
        raised ``ValueError``, ``TypeError`` or ``OverflowError``, in the order
        they were encountered.
    """
    corrupted_rows = []
    for position, value in enumerate(rows):
        try:
            expected_type(value)
        except (ValueError, TypeError, OverflowError):
            # ValueError: malformed text or int(nan); TypeError: None or other
            # non-convertible objects; OverflowError: int(inf). All of them
            # mean the cell is unusable, so none should abort the scan.
            corrupted_rows.append(position)
    return corrupted_rows

In [5]:
# Drop every row that holds at least one value which cannot be parsed as a number.

# A set de-duplicates rows that are corrupted in more than one column.
problematic_rows = set()

for col in int_columns:
    problematic_rows.update(get_corrupted_rows(dataset[col].values, int))

for col in float_columns:
    problematic_rows.update(get_corrupted_rows(dataset[col].values, float))

problematic_rows = sorted(problematic_rows)

# get_corrupted_rows reports positions, whereas DataFrame.drop expects index
# labels. Translating through dataset.index keeps this correct even when the
# index is not the default 0..n-1 range.
temp = dataset.drop(dataset.index[problematic_rows])

In [6]:
# Cast the validated columns to numeric dtypes, then show the resulting schema.
numeric_columns = int_columns + float_columns
temp[numeric_columns] = temp[numeric_columns].apply(pd.to_numeric)
temp.dtypes

No           int64
year         int64
month        int64
day          int64
hour         int64
PM2.5      float64
PM10       float64
SO2        float64
NO2        float64
CO         float64
O3         float64
TEMP       float64
PRES       float64
DEWP       float64
RAIN       float64
wd          object
WSPM       float64
station     object
dtype: object

### Drop NaN Rows

In [7]:
# Keep only the rows without any missing value, then count what is left per column.
df = temp.dropna(axis=0, how="any")
df.isnull().sum()

No         0
year       0
month      0
day        0
hour       0
PM2.5      0
PM10       0
SO2        0
NO2        0
CO         0
O3         0
TEMP       0
PRES       0
DEWP       0
RAIN       0
wd         0
WSPM       0
station    0
dtype: int64

## Statistical Analysis

### Data Summary

In [14]:
# Descriptive statistics for the numeric measurement columns.
# NOTE(review): "PM2.5" is not in this list; confirm whether that is intentional.

num_rel_cols = [
    "PM10",
    "SO2",
    "NO2",
    "CO",
    "O3",
    "TEMP",
    "PRES",
    "DEWP",
    "RAIN",
    "WSPM",
]

df.loc[:, num_rel_cols].describe()

Unnamed: 0,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM
count,343946.0,343946.0,343946.0,343946.0,343946.0,343946.0,343946.0,343946.0,343946.0,343946.0
mean,104.555073,15.62887,50.559578,1229.810223,57.381166,13.511584,1010.823904,2.40822,0.064669,1.738904
std,91.402798,21.277936,35.054046,1157.956986,56.750535,11.427348,10.450925,13.796311,0.817904,1.241104
min,2.0,0.2856,2.0,100.0,0.2142,-19.5,982.4,-36.0,0.0,0.0
25%,36.0,2.0,23.0,500.0,10.71,3.1,1002.4,-9.0,0.0,0.9
50%,82.0,7.0,43.0,900.0,45.0,14.4,1010.4,2.9,0.0,1.4
75%,145.0,19.0,71.0,1500.0,82.0,23.2,1019.0,15.1,0.0,2.2
max,999.0,500.0,290.0,10000.0,1071.0,41.6,1042.8,29.1,72.5,13.2


### Outliers

In [23]:
# Detect PM10 outliers from the absolute z-score of each observation.

z = np.abs(stats.zscore(df["PM10"]))
print(z)
# Cut-off, in standard deviations from the mean, above which a value counts as an outlier.
threshold = 3
# Positions of the observations whose absolute z-score exceeds the cut-off.
print(np.where(z > threshold))
# np.min reduces the array in one vectorised call instead of iterating in Python.
print(np.min(z))

[0.60780584 0.85943963 0.10453824 ... 0.08145196 0.37684816 1.05637043]
(array([    24,    109,    172, ..., 343747, 343935, 343942], dtype=int64),)
0.001696596810619773


### Correlation Matrix

In [8]:
# Correlation matrix of the numeric columns.
# `wd` and `station` have object dtype. DataFrame.corr() silently skipped such
# columns in older pandas but raises on pandas >= 2.0, so the numeric columns
# are selected explicitly; this gives the same matrix on every version.
corr = df.select_dtypes(include="number").corr()
# A font size of 0pt hides the cell text, leaving only the colour gradient visible.
corr.style.background_gradient(cmap='coolwarm').set_properties(**{'font-size': '0pt'})

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM
No,1.0,0.968621,0.0577453,0.0172404,-0.00420439,-0.0258543,-0.0492989,-0.227335,-0.0416524,0.0265146,-0.0488205,-0.133749,0.193144,-0.109812,0.0056649,0.05614
year,0.968621,1.0,-0.191324,-0.00801635,-0.00418246,-0.0293123,-0.0440205,-0.168341,-0.0518295,0.0135783,-0.022777,-0.16758,0.19441,-0.178349,0.00126286,0.0905682
month,0.0577453,-0.191324,1.0,0.0171395,-0.0023292,0.0149168,-0.0201996,-0.223596,0.0419795,0.0509066,-0.100647,0.146925,-0.0225611,0.284126,0.017794,-0.141689
day,0.0172404,-0.00801635,0.0171395,1.0,-0.00285356,0.00389196,0.0248852,-0.0068697,0.0135993,-0.0173503,-0.00121741,0.0124026,0.0221729,0.0242707,-0.00302744,-0.0136815
hour,-0.00420439,-0.00418246,-0.0023292,-0.00285356,1.0,0.0131569,0.0517783,0.0328289,-0.00288918,-0.0301269,0.281898,0.139899,-0.0390663,-0.0119977,0.0139432,0.137741
PM2.5,-0.0258543,-0.0293123,0.0149168,0.00389196,0.0131569,1.0,0.883953,0.482351,0.671518,0.792335,-0.15089,-0.128647,0.0137102,0.118416,-0.0155953,-0.275413
PM10,-0.0492989,-0.0440205,-0.0201996,0.0248852,0.0517783,0.883953,1.0,0.466033,0.653632,0.703076,-0.113694,-0.0939004,-0.022269,0.0733386,-0.0271223,-0.185566
SO2,-0.227335,-0.168341,-0.223596,-0.0068697,0.0328289,0.482351,0.466033,1.0,0.501735,0.53413,-0.166322,-0.320291,0.218457,-0.265807,-0.040686,-0.109034
NO2,-0.0416524,-0.0518295,0.0419795,0.0135993,-0.00288918,0.671518,0.653632,0.501735,1.0,0.705817,-0.476151,-0.276049,0.171903,-0.027953,-0.044238,-0.402517
CO,0.0265146,0.0135783,0.0509066,-0.0173503,-0.0301269,0.792335,0.703076,0.53413,0.705817,1.0,-0.314488,-0.322884,0.183296,-0.0536824,-0.0136076,-0.297464
