# Exploratory Data Analysis

This notebook shows how to run a quick exploratory analysis of the available data.

## Data acquisition

#### Import libraries

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

#### Load data into pandas and summarize the data

In [None]:
# Read one CSV file per year (1983 through 2025) and combine them into a
# single dataframe.
data_path = Path.home() / 'work/zurich-air-quality-data'
data_list = {}  # NOTE(review): never populated in the cells shown here; kept in case other cells use it

# Collect the yearly frames first and concatenate once at the end: calling
# pd.concat inside the loop copies all previously loaded rows again on every
# iteration.
yearly_frames = []
for year in range(1983, 2026):
    file_name = f'ugz_ogd_air_d1_{year}.csv'
    df_year = pd.read_csv(data_path / file_name)
    yearly_frames.append(df_year)
df = pd.concat(yearly_frames, ignore_index=True)

# head is a method: it has to be called to show the first rows.
df.head()

In [None]:
# Collect the distinct values of the "Standort" (location) and "Parameter"
# columns; later cells iterate over both arrays.
locations = df["Standort"].unique()
parameters = df["Parameter"].unique()

# A notebook cell only echoes its last bare expression, so a bare `locations`
# followed by `parameters` would never show the locations. Print both.
print(locations)
print(parameters)

#### Convert the `Wert` column to numeric values and the `Datum` column to datetimes

In [None]:
# Parse the measurement values as numbers and the dates as timestamps.
# errors='coerce' turns entries that cannot be parsed into NaN / NaT instead
# of raising.
df["Wert"] = pd.to_numeric(df["Wert"], errors='coerce')
df["Datum"] = pd.to_datetime(df["Datum"], errors='coerce')

df.head()


## Data exploration

##### General summary of every column in the dataset

In [None]:
# Print the descriptive statistics of each column, one column at a time.
for column_name in df.columns:
    column_summary = df[column_name].describe(include='all')
    print(column_summary)


##### Create a nested dictionary of measurements, keyed by location and then by parameter

In [None]:
# Build a nested lookup: air_parameter_location[location][parameter] holds the
# "Datum", "Wert" and "Einheit" columns of the rows matching that pair.
# Pairs without any matching rows map to an empty dataframe.
air_parameter_location = {}
for location in locations:
    # Select the rows of this location once, instead of re-scanning the whole
    # dataframe for every (location, parameter) pair.
    location_rows = df.loc[df.Standort.isin([location])]
    air_parameter_location[location] = {}
    for parameter in parameters:
        parameter_mask = location_rows.Parameter.isin([parameter])
        # reset_index() renumbers the rows from 0 and keeps the previous row
        # labels in an "index" column.
        air_parameter_location[location][parameter] = location_rows.loc[
            parameter_mask, ["Datum", "Wert", "Einheit"]
        ].reset_index()


##### Plot time series for each location and parameter

In [None]:
# Draw one time-series figure per (location, parameter) pair.
for location in locations:
    for parameter in parameters:
        # Look the dataframe up once instead of repeating the nested indexing.
        measurements = air_parameter_location[location][parameter]

        # Only plot pairs with more than 10 rows.
        if len(measurements) <= 10:
            continue

        # Work on the axes object returned by subplots() rather than on
        # pyplot's implicit "current axes".
        fig1, ax1 = plt.subplots()
        ax1.plot(measurements.Datum, measurements.Wert)
        ax1.set_title(f'Concentration of {parameter} in {location} by Date')

        # Rotate the x-axis tick labels by 30 degrees, right-aligned.
        plt.setp(ax1.get_xticklabels(), rotation=30, ha='right')

        ax1.set_xlabel('Date')
        # Join the distinct units (ignoring missing entries) into plain text;
        # str() of the array itself would put brackets and quotes in the label.
        units = measurements.Einheit.dropna().unique()
        unit_text = ', '.join(str(unit) for unit in units)
        ax1.set_ylabel(f'{parameter} in {unit_text}')

        plt.show()