In [1]:
import pandas as pd

In [2]:
def zscore(s, window, thresh, return_all=False):
    """Screen a Series for outliers with a centered rolling z-score.

    For every point the mean and the population standard deviation
    (``ddof=0``) are taken over a centered rolling window; windows are
    truncated at the ends of the series (``min_periods=1``). A point is
    considered valid when its z-score lies within ``[-thresh, thresh]``
    (bounds inclusive).

    Parameters
    ----------
    s : pandas.Series
        Values to screen.
    window : int
        Rolling window size, passed through to ``Series.rolling``.
    thresh : float
        Maximum absolute z-score for a point to count as valid.
    return_all : bool, default False
        If True, return the intermediate results instead of the cleaned series.

    Returns
    -------
    pandas.Series or tuple
        By default, ``s`` with every flagged point replaced by the rolling
        mean at that position. With ``return_all=True``, the tuple
        ``(z, avg, std, m)``: z-scores, rolling mean, rolling std and the
        boolean mask of valid points (``~m`` marks the outliers).

    Notes
    -----
    When the rolling std is exactly zero the z-score is defined as 0 for any
    non-missing observation, so flat stretches are not reported as outliers
    (a plain division would give 0/0 = NaN, which ``between`` rejects).
    Missing values in ``s`` still produce a NaN z-score, are flagged in the
    mask, and are filled with the rolling mean in the cleaned output.
    """
    roll = s.rolling(window=window, min_periods=1, center=True)
    avg = roll.mean()
    std = roll.std(ddof=0)

    z = s.sub(avg).div(std)
    # Zero spread means the window holds a single repeated value, so a valid
    # observation there cannot deviate from its neighbours: pin z to 0.
    z = z.mask(std.eq(0) & s.notna(), 0.0)
    m = z.between(-thresh, thresh)

    if return_all:
        return z, avg, std, m
    return s.where(m, avg)

In [3]:
# Read the combined, pre-processed dataset; the relative path is resolved
# against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="../data/processed/combined.csv")
# Show the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0,YYYYMMDD,HH,MWD,MWS,MWS10,WG,T,TD,SD,GR,P,NO2,O3
0,20160101,1,200,40,40,70,74,44,0,0,10212,36.53,20.24
1,20160101,2,200,40,30,70,65,44,0,0,10219,22.74,32.12
2,20160101,3,210,30,30,50,55,42,0,0,10225,24.28,29.87
3,20160101,4,210,30,30,40,55,46,0,0,10228,22.56,27.62
4,20160101,5,110,10,10,40,21,15,0,0,10233,23.67,25.62


### Visualisation of outliers for the period Jan 1st 2016 - Dec 31st 2018

In [4]:
# Hourly timestamps for the data: Jan 1, 2016 00:00 through Jan 1, 2019 00:00,
# both endpoints included (all of 2016-2018 plus the closing midnight).
start_time = pd.Timestamp('2016-01-01 00:00')
end_time = pd.Timestamp('2019-01-01 00:00')
date_range = pd.date_range(start=start_time, end=end_time, freq='h')

# Take one row per timestamp (row count derived from the range rather than
# hard-coded) and copy, so re-indexing works on an independent frame.
# The index assignment raises if `df` has fewer rows than `date_range`.
# NOTE(review): the name `two_years` is kept in case later cells use it, but
# the range above spans three years.
two_years = df.iloc[:len(date_range)].copy()
two_years.index = date_range

# Columns screened for outliers.
# NOTE(review): `vars` shadows the Python builtin of the same name; kept
# unchanged in case later cells refer to it.
vars = ['O3', 'NO2']

# Rolling window size in observations (24 hourly rows = one day)
window = 24
# Threshold on |z| - the higher the threshold, the fewer outliers considered
thresh = 3

outlier_counts = {}

for var in vars:
    # m is True for points within the threshold, so ~m marks the outliers
    z, avg, std, m = zscore(two_years[var], window=window, thresh=thresh, return_all=True)

    # count the number of outliers
    num_outliers = (~m).sum()
    outlier_counts[var] = num_outliers
    print(f'Number of outliers in {var}: {num_outliers}')

Number of outliers in O3: 8
Number of outliers in NO2: 74
