In [None]:
import pandas as pd

In [None]:
def clean_dataframe(df, registry, start, end, verbose=False):
    """Return the per-station mean of the measurement column over a date window.

    Rows of ``df`` are kept when their ``Date`` lies strictly between
    ``start`` and ``end`` and their ``IDStation`` belongs to a station that
    is still active, i.e. whose ``DateStop`` in ``registry`` is missing or
    strictly later than ``end``. The remaining rows are grouped by
    ``IDStation`` and averaged (missing values are skipped).

    Neither ``df`` nor ``registry`` is modified.

    Args:
        df: Measurements with at least the columns ``IDStation`` and
            ``Date`` plus one numeric column to average. The last numeric
            column (other than ``IDStation``) is used as the measurement.
        registry: Station registry with the columns ``IDStation`` and
            ``DateStop``.
        start: Exclusive lower bound of the window; anything accepted by
            ``pd.to_datetime``.
        end: Exclusive upper bound of the window; anything accepted by
            ``pd.to_datetime``.
        verbose: When true, print the shape of the result followed by its
            ``DataFrame.info()`` report.

    Returns:
        A new single-column DataFrame indexed by ``IDStation`` holding the
        mean of the measurement column for every selected station.

    Raises:
        ValueError: If ``df`` has no numeric column besides ``IDStation``.
    """
    window_start = pd.to_datetime(start)
    window_end = pd.to_datetime(end)

    # Parse into local Series instead of assigning back onto the arguments,
    # so the caller's frames keep their original columns.
    dates = pd.to_datetime(df['Date'])
    date_stop = pd.to_datetime(registry['DateStop'])

    # NOTE(review): both bounds are strict, so a reading stamped exactly at
    # `start` or `end` is excluded -- confirm this is intended.
    in_window = (dates > window_start) & (dates < window_end)

    still_active = date_stop.isna() | (date_stop > window_end)
    active_ids = registry.loc[still_active, 'IDStation'].unique()

    selected = df[in_window & df['IDStation'].isin(active_ids)]

    # numeric_only=True keeps date/text columns out of the average; averaging
    # every column raises TypeError on recent pandas when text is present.
    means = selected.groupby('IDStation').mean(numeric_only=True)
    if means.shape[1] == 0:
        raise ValueError(
            "df has no numeric column besides 'IDStation' to average"
        )

    # Keep only the last aggregated column, as a one-column frame.
    clean_df = means.iloc[:, [-1]].copy()

    if verbose:
        print(clean_df.shape)
        # info() prints its report itself and returns None, so it must not
        # be wrapped in print().
        clean_df.info()

    return clean_df

In [None]:
# Pollutants to process; each name is substituted into the input and output
# CSV paths used by the processing loop.
pollutants = ['NO2', 'NOx', 'PM2.5', 'PM10']

In [None]:
# Station registry; clean_dataframe reads its 'IDStation' and 'DateStop'
# columns to decide which stations are still active.
registry = pd.read_csv('./data/registry.csv')

In [None]:
for pollutant in pollutants:
    # Raw monthly readings for this pollutant.
    df = pd.read_csv(f'./data/raw/{pollutant}_sit_monthly.csv')

    # Per-station means for the same March-July window in each year; every
    # yearly table is written to its own CSV right after it is computed.
    clean_df_2019 = clean_dataframe(
        df, registry, 'March 2019', 'July 2019', verbose=True
    )
    clean_df_2019.to_csv(f'./data/2019/{pollutant}_2019.csv')

    clean_df_2020 = clean_dataframe(
        df, registry, 'March 2020', 'July 2020', verbose=True
    )
    clean_df_2020.to_csv(f'./data/2020/{pollutant}_2020.csv')

    # Place the two years side by side, matched on station id; the suffixes
    # tell the yearly columns apart when their names collide.
    diff_df = clean_df_2019.join(
        clean_df_2020,
        on='IDStation',
        lsuffix='_2019',
        rsuffix='_2020',
    )

    # Delta is the first column (2019) minus the second column (2020).
    diff_df[f'Delta_{pollutant}'] = diff_df.iloc[:, 0] - diff_df.iloc[:, 1]

    # Drop stations that lack a value in either year.
    diff_df = diff_df.dropna()

    diff_df.to_csv(f'./data/{pollutant}.csv')

    print(diff_df.head())