In [1]:
# importing pathlib (standard library), then numpy and pandas
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Download the 2010 hourly ozone csv from ECCC and load it into a data frame.
#   skiprows=5            skips the first five rows that include the header information
#   encoding='latin-1'    tells pandas how to read this particular csv file
#   parse_dates=['Date']  asks pandas to try and auto-read the dates in the Date column
O3_2010 = pd.read_csv(
    "http://data.ec.gc.ca/data/air/monitor/national-air-pollution-surveillance-naps-program/Data-Donnees/2010/ContinuousData-DonneesContinu/HourlyData-DonneesHoraires/O3_2010.csv",
    skiprows=5,
    encoding='latin-1',
    parse_dates=['Date'],
)



In [3]:
# this function creates a time series for a given station id number (e.g. 10102)
# requires a NAPSID for the station_id input and the O3_YEAR data frame as imported above (e.g. O3_2010)
def create_O3_timeseries(station_id, O3_YEAR):
    """Build an hourly ozone time series for a single station.

    Parameters
    ----------
    station_id
        Value matched against the ``NAPSID`` column (e.g. 10102).
    O3_YEAR : pandas.DataFrame
        Raw yearly frame containing the columns ``NAPSID``, ``Date`` and the
        24 hourly columns ``H01`` ... ``H24``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Date`` and ``Ozone``, with 24 rows for every input row
        whose ``NAPSID`` equals ``station_id``. Rows keep the input row order
        and, within each input row, run from ``H01`` to ``H24``. Readings that
        are not strictly between -999 and 9999 (the missing-data codes) are
        returned as NaN.

    Raises
    ------
    KeyError
        If ``NAPSID``, ``Date`` or any of ``H01`` ... ``H24`` is missing.

    Notes
    -----
    This assumes that hour 1 = 00:00 and hour 24 = 23:00, i.e. column ``Hnn``
    is stamped ``Date + (nn - 1) hours``.
    """
    hours_per_day = 24
    # Column names for each hour of the day: H01, H02, ..., H24.
    hour_columns = ["H{:02d}".format(hour) for hour in range(1, hours_per_day + 1)]

    # A boolean mask selects rows by position, so this works for any index
    # (e.g. a filtered frame), unlike label lookups with 0..len-1.
    station_rows = O3_YEAR.loc[O3_YEAR['NAPSID'] == station_id]

    # Keep only readings strictly inside (-999, 9999); everything else,
    # including the 9999 / -999 missing-data codes, becomes NaN.
    readings = station_rows[hour_columns]
    readings = readings.where((readings < 9999) & (readings > -999))

    # One timestamp per (row, hour): repeat each date 24 times and add the
    # matching 0..23 hour offset. Both sides get a fresh 0..n-1 index so the
    # addition pairs them up positionally.
    day_start = (
        pd.to_datetime(station_rows['Date'])
        .repeat(hours_per_day)
        .reset_index(drop=True)
    )
    hour_offset = pd.Series(
        pd.to_timedelta(np.tile(np.arange(hours_per_day), len(station_rows)), unit='h')
    )

    # ravel() flattens row by row, which matches the timestamp order above.
    return pd.DataFrame(
        {'Date': day_start + hour_offset, 'Ozone': readings.to_numpy().ravel()},
        columns=['Date', 'Ozone'],
    )


In [4]:
# Example: build the 2010 ozone time series for station 10102 and save it as a csv.
O3_2010_10102 = create_O3_timeseries(10102, O3_2010)
# Write into the current user's home directory instead of a hard-coded
# C:\Users\Sarah path, so the cell also runs on other machines.
# NOTE(review): assumes the original target folder was the author's home directory - confirm.
O3_2010_10102.to_csv(Path.home() / 'O3_2010_10102.csv', na_rep='NaN', index=False)


In [5]:
# Example: build the 2010 ozone time series for station 52001 and save it as a csv.
O3_2010_52001 = create_O3_timeseries(52001, O3_2010)
# Write into the current user's home directory instead of a hard-coded
# C:\Users\Sarah path, so the cell also runs on other machines.
# NOTE(review): assumes the original target folder was the author's home directory - confirm.
O3_2010_52001.to_csv(Path.home() / 'O3_2010_52001.csv', na_rep='NaN', index=False)


In [6]:
# Starting point for automating the export over every station:
# collect each unique station ID once, numbered 0..n-1 so the result no longer
# carries the row labels of the original data frame.
NAPSID = O3_2010['NAPSID'].drop_duplicates().reset_index(drop=True)
print(NAPSID)

0       10102
1       10301
2       10401
3       10501
4       10601
        ...  
208    129003
209    129102
210    129103
211    129401
212    129501
Name: NAPSID, Length: 213, dtype: int64
