## AFRICA Air Quality Archive Vanderbijlpark 

In [4]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:

def wrangle(filename):
    """Load one sensor archive CSV and return its hourly mean P2 readings.

    Parameters
    ----------
    filename : str or path-like
        Path to a ';'-delimited CSV that contains at least the columns
        ``timestamp``, ``value_type`` and ``value``.

    Returns
    -------
    pandas.Series
        Series named ``value`` holding the mean of the ``P2`` readings for
        every hour, indexed by a timezone-aware ``DatetimeIndex`` expressed
        in 'Africa/Johannesburg'. Hours without any reading are NaN.
    """
    # The archive files use ';' as the field separator.
    df = pd.read_csv(filename, delimiter=';')

    # utc=True always produces a single tz-aware datetime column, even when
    # the stamps are naive (then interpreted as UTC) or carry mixed offsets;
    # tz_convert below would raise on anything that is not tz-aware.
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
    df = df.set_index('timestamp')

    # Express the index in local time for Vanderbijlpark.
    df.index = df.index.tz_convert('Africa/Johannesburg')

    # Keep only the P2 readings and only the column that is actually returned.
    # The previous "drop low-cardinality columns" step was removed on purpose:
    # it could drop 'value' itself when all P2 readings in a file were equal
    # (or there was a single row), which made the lookup below raise KeyError.
    p2_values = df.loc[df['value_type'] == 'P2', 'value']

    # Hourly mean; the lowercase 'h' alias replaces the deprecated 'H'.
    return p2_values.resample('1h').mean()
    

In [12]:
# Collect every 2021 archive CSV under data/. glob yields paths in an
# arbitrary, OS-dependent order, so sort them to get a reproducible file list.
files = sorted(glob('data/*_2021_sensor_data_archive.csv'))

In [13]:
# Run wrangle on every archive file; each call returns one hourly-mean Series.
list_of_df = [wrangle(file) for file in files]
# Stack the per-file Series into a single Series (wrangle returns Series, not
# DataFrames). The files arrive in no guaranteed order, so sort by timestamp
# to make the combined series chronological.
df = pd.concat(list_of_df).sort_index()

In [15]:
# Preview the first five entries of the combined hourly series.
df.head()

timestamp
2021-07-01 02:00:00+02:00    45.566296
2021-07-01 03:00:00+02:00    44.161481
2021-07-01 04:00:00+02:00    38.362778
2021-07-01 05:00:00+02:00    41.146667
2021-07-01 06:00:00+02:00    44.614717
Name: value, dtype: float64

In [17]:
# Print a summary of the combined Series: index range, entry count,
# non-null count and dtype.
df.info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 2989 entries, 2021-07-01 02:00:00+02:00 to 2021-09-01 01:00:00+02:00
Series name: value
Non-Null Count  Dtype  
--------------  -----  
2545 non-null   float64
dtypes: float64(1)
memory usage: 46.7 KB
