In [1]:
import pandas as pd
import numpy as np

In [None]:
def clean_dataframe(df, registry, start, end, verbose=False, pollutant=None):
    """Average each station's measurements over a date window and attach its coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements with at least ``IDStation`` and ``Date`` columns. Every
        numeric column is averaged per station; non-numeric columns
        (including ``Date``) are not carried into the result.
    registry : pandas.DataFrame
        Station registry with ``IDStation``, ``Pollutant``, ``DateStop``,
        ``Latitude``, ``Longitude`` and ``Altitude`` columns.
    start, end : str or datetime-like
        Exclusive bounds of the window, parsed with ``pd.to_datetime``.
    verbose : bool, default False
        When True, print the shape and ``info()`` of the result.
    pollutant : str, optional
        Value matched against ``registry['Pollutant']``. When omitted, the
        module-level variable ``pollutant`` is used, which is how this
        function selected the pollutant before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per station that has measurements inside the window and whose
        registry entry for ``pollutant`` has no ``DateStop`` or a ``DateStop``
        later than ``end``. Columns: ``IDStation``, the averaged numeric
        columns of ``df``, ``Latitude``, ``Longitude``, ``Altitude``.

    Raises
    ------
    NameError
        If ``pollutant`` is not passed and no module-level ``pollutant`` exists.

    Notes
    -----
    Neither ``df`` nor ``registry`` is modified.
    """
    if pollutant is None:
        # Backward compatibility: earlier callers relied on a notebook-level
        # ``pollutant`` variable (typically the loop variable of the caller).
        try:
            pollutant = globals()['pollutant']
        except KeyError:
            raise NameError("name 'pollutant' is not defined") from None

    start_ts = pd.to_datetime(start)
    end_ts = pd.to_datetime(end)

    # Parse into local Series instead of writing converted columns back into
    # the caller's frames.
    date_stop = pd.to_datetime(registry['DateStop'])
    is_active = date_stop.isna() | (date_stop > end_ts)
    active = registry[(registry['Pollutant'] == pollutant) & is_active]

    # NOTE(review): both bounds are strict, so a row dated exactly at ``start``
    # or ``end`` is excluded -- confirm this matches the date stamps in the data.
    dates = pd.to_datetime(df['Date'])
    data = df[(dates > start_ts) & (dates < end_ts)]

    # A direct numeric mean avoids groupby.apply, whose handling of the
    # grouping column and of datetime columns differs between pandas versions.
    data = data.groupby('IDStation', as_index=False).mean(numeric_only=True)
    data['IDStation'] = data['IDStation'].astype(int)

    clean_df = pd.merge(
        data,
        active[['IDStation', 'Latitude', 'Longitude', 'Altitude']],
        on='IDStation',
        how='inner',
    )

    if verbose:
        print(clean_df.shape)
        # info() prints its report itself and returns None.
        clean_df.info()

    return clean_df

In [None]:
def normalize_and_shift(df, feature, verbose=False, output=None):
    """Add a ``<feature>_Norm`` column holding ``feature`` divided by its maximum.

    When the column minimum is negative, every value and the maximum used as
    divisor are first shifted up by the absolute value of that minimum, so the
    smallest value maps to 0. Otherwise the values are only divided by the
    column maximum.

    The column's min/max and a short note about its sign are printed when
    ``verbose`` is true and written to ``output`` when ``output`` is truthy.
    ``df`` is modified in place and also returned.
    """
    lowest = df[feature].min()
    highest = df[feature].max()

    title = '\n' + feature
    bounds = 'Min: {}, Max: {}'.format(lowest, highest)

    if verbose:
        print(title)
        print(bounds)

    if output:
        output.write(title + '\n')
        output.write(bounds + '\n')

    shifted = df[feature].copy()

    if lowest < 0:
        offset = np.abs(lowest)
        shifted += offset
        highest += offset

        note = None
        if verbose or output:
            # Share of the shifted range that lies below the original zero, in percent.
            note = 'Some negative values found, with ratio: {}\n'.format(
                round(100 * (offset / highest), 2)
            )
    else:
        note = 'Only positive values found.\n'

    if verbose:
        print(note)

    if output:
        output.write(note)

    df[feature + '_Norm'] = shifted / highest

    if verbose:
        print(df.head(3))

    return df

In [None]:
def normalize(df, feature1, feature2, verbose=False, output=None):
    """Add ``<feature1>_Norm`` and ``<feature2>_Norm``, both scaled by one shared maximum.

    The divisor is the larger of the two column maxima, so the two normalized
    columns stay directly comparable with each other. No shifting is applied.

    Each column's min/max is printed when ``verbose`` is true and written to
    ``output`` when ``output`` is truthy. ``df`` is modified in place and also
    returned.
    """
    low_1 = df[feature1].min()
    high_1 = df[feature1].max()
    low_2 = df[feature2].min()
    high_2 = df[feature2].max()

    # Shared divisor: the larger of the two maxima.
    if high_1 > high_2:
        scale = high_1
    else:
        scale = high_2

    # The same text goes to stdout and to the report file.
    lines = (
        '\n' + feature1 + '\n',
        'Min: {}, Max: {}\n'.format(low_1, high_1),
        '\n' + feature2 + '\n',
        'Min: {}, Max: {}\n'.format(low_2, high_2),
    )

    if verbose:
        for line in lines:
            print(line)

    if output:
        for line in lines:
            output.write(line)

    # Compute both results before assigning so the second column is derived
    # from the frame as it was on entry.
    scaled_1 = df[feature1] / scale
    scaled_2 = df[feature2] / scale

    df[feature1 + '_Norm'] = scaled_1
    df[feature2 + '_Norm'] = scaled_2

    if verbose:
        print(df.head(3))

    return df

In [None]:
# Pollutant identifiers to process. Each one is used to build the raw-file
# path, to name the output files and columns, and (through the loop variable
# `pollutant`) to filter the registry's 'Pollutant' column.
pollutants = ['NO2', 'NOx', 'PM2.5', 'PM10']

In [None]:
# Station registry. clean_dataframe reads its IDStation, Pollutant, DateStop,
# Latitude, Longitude and Altitude columns.
registry = pd.read_csv('./data/registry.csv')

In [None]:
# When True, the processing loop writes the per-year and combined tables to
# CSV files under ./data/ (nothing is fetched from a remote source here).
download = True

In [None]:
# For every pollutant: average the March-July window of 2019 and of 2020 per
# station, join the two years, compute the 2019-2020 difference, normalize the
# columns, and log the value ranges to ./data/report.txt.
with open('./data/report.txt', 'w') as f:

    for pollutant in pollutants:

        # NOTE: clean_dataframe picks up the loop variable `pollutant` from the
        # notebook namespace to filter the registry.
        # 'Unnamed: 0' is presumably the index column written by an earlier
        # to_csv call -- confirm against the raw files.
        df = pd.read_csv('./data/raw/{}_sit_monthly.csv'.format(pollutant)).drop('Unnamed: 0', axis=1)

        clean_df_2019 = clean_dataframe(df, registry, 'March 2019', 'July 2019', verbose=False)

        if download: clean_df_2019.to_csv('./data/2019/{}_2019.csv'.format(pollutant))

        clean_df_2020 = clean_dataframe(df, registry, 'March 2020', 'July 2020', verbose=False)

        if download: clean_df_2020.to_csv('./data/2020/{}_2020.csv'.format(pollutant))

        # Keep only stations present in both years; shared measurement columns
        # get a _2019 / _2020 suffix.
        df = pd.merge(clean_df_2019, clean_df_2020, on=['IDStation', 'Latitude', 'Longitude', 'Altitude'], suffixes=['_2019', '_2020'])

        # Positive delta means the 2019 average was higher than the 2020 one.
        df['{}_Delta'.format(pollutant)] = df[pollutant+ '_2019'] - df[pollutant + '_2020']

        df = df[['IDStation', pollutant+ '_2019', pollutant+ '_2020', pollutant+ '_Delta', 'Latitude', 'Longitude']]

        # Explicit copy: the column selection and dropna can leave pandas
        # tracking the frame as a slice of its parent, which makes the column
        # assignments below (and those inside the normalize helpers) emit
        # SettingWithCopyWarning.
        df = df.dropna().copy()

        df['{}_Abs'.format(pollutant)] = np.abs(df['{}_Delta'.format(pollutant)])

        df = normalize_and_shift(df, pollutant + '_Delta', verbose=True, output=f)

        df = normalize(df, pollutant + '_2019', pollutant + '_2020', verbose=True, output=f)

        if download: df.to_csv('./data/{}.csv'.format(pollutant))