In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read in CSV files with pollution data and clean up

In [2]:
# Load the pollution exports for the three measurement stations.
# Every file is read the same way: semicolon-separated, first 11 lines skipped.
df_torkel_knutssong, df_hornsgatan, df_norr_malma = (
    pd.read_csv(csv_path, sep=';', skiprows=11)
    for csv_path in (
        '../air_weather_data/torkel_knutssongatan.csv',
        '../air_weather_data/hornsgatan.csv',
        '../air_weather_data/norr_malma.csv',
    )
)

In [3]:
# Give the timestamp column ('Slut') and the pollutant columns descriptive,
# station-specific names. rename() returns a new frame, so each result is
# bound back to the same variable.
df_torkel_knutssong = df_torkel_knutssong.rename(columns={
    'Slut': 'Date',
    'PM10 (107)': 'PM$_{10}$, Torkel Knutssonsgatan',
    'PM2.5 (102)': 'PM$_{2.5}$, Torkel Knutssonsgatan',
    'NO2 (105)': 'NO$_2$, Torkel Knutssonsgatan',
})

df_hornsgatan = df_hornsgatan.rename(columns={
    'Slut': 'Date',
    'PM10 (116)': 'PM$_{10}$, Hornsgatan',
    'PM2.5 (111)': 'PM$_{2.5}$, Hornsgatan',
})

df_norr_malma = df_norr_malma.rename(columns={
    'Slut': 'Date',
    'PM10 (291)': 'PM$_{10}$, Norr Malma',
    'PM2.5 (295)': 'PM$_{2.5}$, Norr Malma',
})

In [4]:
# Remove the columns that are not used downstream: 'Start' for every station,
# plus the NO2 series for Hornsgatan and Norr Malma.
df_torkel_knutssong = df_torkel_knutssong.drop(columns='Start')
df_hornsgatan = df_hornsgatan.drop(columns=['Start', 'NO2 (115)'])
df_norr_malma = df_norr_malma.drop(columns=['Start', 'NO2 (293)'])

In [5]:
# Parse each frame's 'Date' column into pandas datetimes and promote it to the
# index, so the frames can later be joined on their timestamps.
df_torkel_knutssong = (
    df_torkel_knutssong
    .assign(Date=pd.to_datetime(df_torkel_knutssong['Date'], format='%Y-%m-%d %H:%M'))
    .set_index('Date')
)

df_hornsgatan = (
    df_hornsgatan
    .assign(Date=pd.to_datetime(df_hornsgatan['Date'], format='%Y-%m-%d %H:%M'))
    .set_index('Date')
)

df_norr_malma = (
    df_norr_malma
    .assign(Date=pd.to_datetime(df_norr_malma['Date'], format='%Y-%m-%d %H:%M'))
    .set_index('Date')
)

In [6]:
# Combine the three station frames on their shared 'Date' index, one merge at
# a time. merge() returns a new frame, so df_torkel_knutssong itself is left
# untouched. No `how=` is passed, so merge's default join type applies.
df_pollution = (
    df_torkel_knutssong
    .merge(df_hornsgatan, on='Date')
    .merge(df_norr_malma, on='Date')
)

In [7]:
# Evaluate the merged pollution frame; the trailing semicolon suppresses the cell output.
df_pollution;

# Read in weather data and clean up

In [8]:
path = '../air_weather_data/'

# Every weather file is read with the same options: semicolon-separated,
# header taken from row index 2, latin encoding.
temperature = pd.read_csv(path + 'temperature.csv', sep=';', header = 2, encoding='latin')
pressure = pd.read_csv(path + 'pressure.csv', sep=';', header = 2, encoding='latin')
relative_humidity = pd.read_csv(path + 'relative_humidity.csv', sep=';', header = 2, encoding='latin')
precipitation = pd.read_csv(path + 'precipitation.csv', sep=';', header = 2, encoding='latin')
solar_radiation = pd.read_csv(path + 'solar_radiation.csv', sep=';', header = 2, encoding='latin')
wind_speed = pd.read_csv(path + 'wind_speed.csv', sep=';', header = 2, encoding='latin')

# Weather frames and, in the same order, the name to give each frame's
# measurement column.
dframes = [temperature, pressure, relative_humidity, precipitation, solar_radiation, wind_speed]
column_names = ['Temperature',
                'Atmospheric pressure',
                'Relative humidity',
                'Precipitation',
                'Solar radiation',
                'Wind speed']

# Clean every frame in place: drop the unused columns, build a datetime index
# and rename the measurement column.
for df, column_name in zip(dframes, column_names):
    # Positional drop of columns 3-6; this leaves 'Datum', 'Kl' and the measurement.
    df.drop(df.columns[[3,4,5,6]], axis=1, inplace=True)
    # Combine date ('Datum') and time of day ('Kl') into one 'Date' column.
    df.insert(0, 'Date', df['Datum'] + ' ' + df['Kl'])
    df.drop(['Datum', 'Kl'], axis=1, inplace=True)
    # NOTE(review): '%y' parses a two-digit year — confirm against the files.
    df['Date'] = pd.to_datetime(df['Date'], format = '%y-%m-%d %H:%M')
    # Rename through the public API. Writing into `df.columns.values[1]`
    # mutates the Index's backing array behind pandas' back, which can leave
    # cached label lookups stale and fails if that array is read-only.
    df.rename(columns={df.columns[1]: column_name}, inplace=True)
    df.set_index('Date', inplace=True)

In [9]:
# Start from the first weather frame and merge every remaining frame onto it
# via the shared 'Date' index. Looping over the list (instead of hard-coding
# indices 1-5) keeps this cell in sync with whatever `dframes` contains.
df_weather = dframes[0]
for weather_frame in dframes[1:]:
    df_weather = df_weather.merge(weather_frame, on='Date')

# Trailing semicolon suppresses the cell output.
df_weather;

# Merge pollution and weather dataframes

In [10]:
# Join the pollution and weather frames on their shared 'Date' index.
df_final = pd.merge(df_pollution, df_weather, on='Date')

# Make cosine and sine signals for day, week, and year

In [11]:
import matplotlib.dates as mdates

# sine and cosine of day, week, and year to capture seasonality of pollution data
def add_sine_cosine(df):
    """Add sine/cosine columns for daily, weekly and yearly cycles.

    Each index entry is converted to seconds with ``pd.Timestamp.timestamp``,
    so the index must hold pandas timestamps. For every period the timestamp
    is scaled to an angle (one full turn per period) and its sine and cosine
    are stored as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds pandas timestamps. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns 'Sine day', 'Cosine day',
        'Sine week', 'Cosine week', 'Sine year' and 'Cosine year' added.
    """
    seconds_per_day = 24*60*60
    # (sine column, cosine column, period length in seconds)
    cycles = (
        ('Sine day', 'Cosine day', seconds_per_day),
        ('Sine week', 'Cosine week', 7*seconds_per_day),
        ('Sine year', 'Cosine year', (365.2425)*seconds_per_day),
    )

    epoch_seconds = df.index.map(pd.Timestamp.timestamp)
    for sine_name, cosine_name, period in cycles:
        angular_rate = 2 * np.pi / period
        df[sine_name] = np.sin(epoch_seconds * angular_rate)
        df[cosine_name] = np.cos(epoch_seconds * angular_rate)
    return df

# Add the six seasonality columns to df_final. add_sine_cosine writes the new
# columns onto the frame it receives and returns that same frame.
df_final = add_sine_cosine(df_final)

# Save final dataset to CSV

In [12]:
# Write the combined pollution, weather and seasonality frame to the data
# directory the inputs were read from.
df_final.to_csv('../air_weather_data/pollution_and_weather.csv')