### Set Global Variables

In [1]:
# Directories used by the helpers below; all paths are relative to the working directory.
# RAW_DATA: where generateFilename() places the downloaded monthly CSV files.
RAW_DATA = "./raw-data"
# PROCESSED_DATA: where saveDataFrame() writes the cleaned CSV output.
PROCESSED_DATA = "./processed-data"
# STATION: where getCoordinate() looks for the station inventory CSV.
STATION = "./station"
# MODEL: not referenced in this section — presumably where trained models are stored (confirm).
MODEL = "./model"

### Declare Helper Functions

In [2]:
import pandas as pd
from pandas import read_csv

# !pip install timezonefinder
from timezonefinder import TimezoneFinder
# Shared TimezoneFinder instance; getTimeAtStation() uses it to turn coordinates into a timezone name.
tf = TimezoneFinder(in_memory=True)

def getCoordinate(station="47267", filename = "Station-Inventory-EN.csv"):
    """Look up the coordinates of a weather station in the station inventory CSV.

    Reads ``{STATION}/{filename}`` and selects the rows whose column at
    position 3 equals ``int(station)``.

    Args:
        station: Station ID; anything ``int()`` accepts (default "47267").
        filename: Name of the inventory CSV inside the ``STATION`` directory.

    Returns:
        Tuple ``(latitude, longitude)`` read from the columns at positions 6
        and 7 of the first matching row.

    Raises:
        IndexError: If no row in the inventory matches the station ID.
    """
    path = "{STATION}/{filename}".format(STATION=STATION, filename=filename)
    # header=0 must be an argument of read_csv; passing it to str.format() is silently ignored.
    df = read_csv(path, header=0)

    #https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas
    stationInfo = df.loc[df[df.columns[3]] == int(station)]

    if stationInfo.empty:
        # Same exception type that .iat would raise on an empty frame, but with a usable message.
        raise IndexError("station {station} not found in {path}".format(station=station, path=path))

    #https://stackoverflow.com/questions/16729574/how-to-get-a-value-from-a-cell-of-a-dataframe
    latitude = stationInfo.iat[0, 6]
    longitude = stationInfo.iat[0, 7]

    return latitude, longitude
    
    
# Smoke test: look up the default station ("47267") and show its latitude and longitude.
latitude, longitude = getCoordinate()
print(latitude, longitude)

44.22 -76.6


In [3]:
from datetime import datetime
import pytz

def getTimeAtStation(station="47267"):
    """Return the current local time at a station.

    The station's coordinates are resolved to a timezone name, and "now" is
    evaluated in that timezone.

    Args:
        station: Station ID passed on to getCoordinate().

    Returns:
        List ``[year, month, day, hour, minute]`` of the current time at the station.
    """
    lat, lng = getCoordinate(station)
    # Timezone name for the coordinates, e.g. America/Toronto
    zone_name = tf.timezone_at(lng=lng, lat=lat)

    # https://docs.python.org/3/library/datetime.html
    local_now = datetime.now(pytz.timezone(zone_name))
    fields = ("year", "month", "day", "hour", "minute")
    return [getattr(local_now, field) for field in fields]

# Show the current [year, month, day, hour, minute] at the default station.
print(getTimeAtStation())

# print(station_now) #datetime object
# print(datetime(\
#     station_now.year, station_now.month, station_now.day, station_now.hour, station_now.minute)) #datetime object
# print(station_now.strftime("%Y-%m-%d %H:%M")) # string
# print(station_now.year)
# print(station_now.month)
# print(station_now.day)
# print(station_now.hour)
# print(station_now.minute)
# print(station_now.second)
# print(station_now.microsecond)

[2019, 8, 9, 7, 36]


In [4]:
import urllib.request

def download(year, month, station, filename):
    '''
    Download weather data for a specific month and year at the given station ID
    and save the response to a local file.

    The query string is built with timeframe=1 and a fixed Day=14.

    Arg : year, month,
          station (station ID, 47267 = Kingston Climate Station),
          filename (local path the downloaded CSV is written to)
    Return : none
    Output : the CSV file written to `filename`
    c.f.: https://stackoverflow.com/questions/50260574/wget-content-disposition-using-python
    '''
    url = "http://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID={station}&Year={year}&Month={month}&Day=14&timeframe=1&submit=Download+Data".format(year=year, month=month, station=station)
    urllib.request.urlretrieve(url, filename)

# download(2019,8)

In [5]:
def generateFilename(year, month, station):
    """Build the path of the raw CSV for one station and month.

    Returns:
        ``{RAW_DATA}/{station}-{year}-{month}.csv`` with the month zero-padded
        to two digits.
    """
    basename = "{0}-{1}-{2:02d}.csv".format(station, year, month)
    return "{0}/{1}".format(RAW_DATA, basename)

def downloadFromTo(fromWhen=[2015,1], toWhen=[2019,7], station="47267"):
    """Download every monthly CSV from fromWhen to toWhen (both inclusive).

    Args:
        fromWhen: ``[year, month]`` of the first month to download.
        toWhen: ``[year, month]`` of the last month to download.
        station: Station ID passed to generateFilename() and download().

    Returns:
        List of the file paths that were downloaded, in chronological order.
        If the start year is after the end year, a message is printed and an
        empty list is returned.
    """
    startYear, startMonth = fromWhen[0], fromWhen[1]
    endYear, endMonth = toWhen[0], toWhen[1]

    # Years are compared by value: identity ("is") is not reliable for ints,
    # so equal years held in different objects would be rejected as invalid.
    if startYear > endYear:
        print("use vaild years")
        return []

    filenames = []
    for year in range(startYear, endYear + 1):
        # Only the first and last year are partial; years in between run Jan..Dec.
        firstMonth = startMonth if year == startYear else 1
        lastMonth = endMonth if year == endYear else 12
        for month in range(firstMonth, lastMonth + 1):
            filename = generateFilename(year, month, station)
            filenames.append(filename)
            download(year, month, station, filename)

    return filenames

In [6]:
import pandas as pd
from pandas import read_csv

def createDataFrame(filenames):
    """Load the given CSV files and stack them into a single DataFrame.

    Each file is read with the first 15 lines skipped and the following line
    used as the header row. The frames are concatenated in the given order and
    each keeps its own row index.

    Args:
        filenames: Non-empty sequence of CSV paths.

    Returns:
        The concatenated DataFrame.

    Raises:
        IndexError: If ``filenames`` is empty.
    """
    if len(filenames) == 0:
        raise IndexError("createDataFrame needs at least one filename")

    # Read everything first and concatenate once; concatenating inside the loop
    # re-copies all previously accumulated rows on every iteration.
    frames = [read_csv(name, skiprows = 15, header = 0) for name in filenames]
    return pd.concat(frames)

# NOTE(review): this unpacks the list, so station == 2019 and toWhen == 8.
# The functions visible in this section take station/toWhen as parameters and do not
# read these globals, so this looks like leftover scratch code — confirm before relying on it.
station, toWhen = [2019,8]

# https://stackoverflow.com/questions/15891038/change-data-type-of-columns-in-pandas/44536326
# ex) 01:00 -> 1.0
def convertHourToFloat(df):
    """Replace the time strings in the column at position 4 with the hour as float64.

    Only the first two characters of each string are kept, e.g. "01:00" -> 1.0.
    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    hour_column = df.columns[4]
    hour_text = df[hour_column].str.slice(0, 2)
    df[hour_column] = hour_text.astype("float64")
    return df

def deleteColumns(df, to_be_deleted = [1, 3, 6, 8, 10, 12, 14, 15, 16, 18, 19, 20, 21, 22, 23] ):
    """Drop columns by position and index the result by its first remaining column.

    Args:
        df: Input DataFrame; it is not modified.
        to_be_deleted: Positions of the columns to drop.

    Returns:
        A new DataFrame without the dropped columns, whose index is the first
        column that is left.
    """
    unwanted = df.columns[to_be_deleted]
    trimmed = df.drop(columns=unwanted)
    index_label = trimmed.columns[0]
    return trimmed.set_index(index_label)
    
def interpolateNA(df, limit=500):
    """Fill missing values by linear interpolation in both directions.

    Prints the number of values that are still missing per column afterwards.

    Args:
        df: Input DataFrame.
        limit: Maximum number of consecutive missing values to fill
            (default 500).

    Returns:
        The interpolated DataFrame.
    """
    # Pass the caller's limit through instead of a hard-coded 500.
    df = df.interpolate(method='linear', limit_direction ='both', limit=limit)
    print(df.isnull().sum())
    return df

def temperatureFirst(df):
    """Move the first two columns to the end, so the third column comes first.

    Returns:
        A DataFrame with the same columns in rotated order.
    """
    names = df.columns.tolist()
    leading, remainder = names[:2], names[2:]
    return df[remainder + leading]

import calendar

def truncateBack(df, toWhen, now_at_station):
    '''
    If df contains dummy data on and after the current day, truncate it.
    If today is 2019-8-8, this function deletes the rows from 2019-8-8 to the
    last day of the month, assuming 24 rows per day at the end of df.
    Nothing is removed unless toWhen is the current year and month.

    argument
        df: pandas dataframe
        toWhen: array that contains [year, month]
        now_at_station: array that contains current [year, month, day, hour, minute]
    return
        truncated dataframe
    '''
    if toWhen[0] == now_at_station[0] and toWhen[1] == now_at_station[1]:
        lastDay = calendar.monthrange(now_at_station[0], now_at_station[1])[1]
        rowsToDrop = (lastDay - now_at_station[2] + 1) * 24
        # Clamp at 0: a negative stop would count from the end of df and keep
        # rows instead of dropping everything when df is shorter than rowsToDrop.
        df = df[:max(len(df) - rowsToDrop, 0)]
    return df

def saveDataFrame(df, filename="export.csv"):
    """Write df as a CSV file, with a header row, to ``{PROCESSED_DATA}/{filename}``.

    Args:
        df: DataFrame to save.
        filename: Name of the output file inside the ``PROCESSED_DATA`` directory.
    """
    path = r'{PROCESSED_DATA}/{filename}'.format(PROCESSED_DATA=PROCESSED_DATA, filename=filename)
    # header=True must be an argument of to_csv; passing it to str.format() is silently ignored.
    df.to_csv(path, header=True)


### Function to build and save dataframe to train model

In [7]:
def buildDataFrame(fromWhen = [2015,1], toWhen = [2019,8], station = "47267", save = True, saveFile = "example.csv"):
    """Download, clean and optionally save the data for one station.

    Steps, in order: download the monthly files, combine them, convert the hour
    column to float, drop unused columns, cut off rows from the current day
    onwards, interpolate missing values, and rotate the first two columns to
    the end.

    Args:
        fromWhen: ``[year, month]`` of the first month to include.
        toWhen: ``[year, month]`` of the last month to include.
        station: Station ID.
        save: When true, the result is written with saveDataFrame().
        saveFile: Output file name used when ``save`` is true.

    Returns:
        The processed DataFrame.
    """
    rawFiles = downloadFromTo(fromWhen, toWhen, station)
    frame = deleteColumns(convertHourToFloat(createDataFrame(rawFiles)))
    frame = truncateBack(frame, toWhen, getTimeAtStation(station))
    frame = temperatureFirst(interpolateNA(frame))
    if save:
        saveDataFrame(frame, saveFile)
    return frame

# print(buildDataFrame([2015,1], [2019,8], "47267", True, "modelSequence.csv")[:3])

### Function to build and save dataframe to predict temperature 24 hours later

In [8]:
import pandas as pd
from pandas import read_csv

def getLastestData(station = "47267", length = 480, save = True, saveFile = "latestSequence.csv"):
    """Build the most recent rows of processed data for a station.

    The last ``length + 23`` rows of the processed data are returned. The
    current month is always downloaded. The previous month is added when the
    completed days of the current month cannot supply that many rows. At most
    one previous month is fetched, so the result can be shorter than requested.

    Args:
        station: Station ID.
        length: Sequence length; must be below 1000.
        save: When true, the result is written with saveDataFrame().
        saveFile: Output file name used when ``save`` is true.

    Returns:
        DataFrame holding the last ``length + 23`` rows that are available.

    Raises:
        AssertionError: If ``length`` is 1000 or more.
    """
    # The message must be a string; print(...) would evaluate to None.
    assert length < 1000,\
        "don't use too long sequence, lengh should be lesser than 1000"

    year, month, day, _, _ = getTimeAtStation(station)
    rowsNeeded = length + 23

    # buildDataFrame drops the current day onwards, so the current month
    # contributes (day-1)*24 rows.
    if (day-1)*24 < rowsNeeded:
        # Integers are compared by value, not identity.
        if month != 1:
            fromWhen = [year, month-1]
        else:
            fromWhen = [year - 1, 12]
    else:
        # Enough rows in the current month alone; without this branch df was never assigned.
        fromWhen = [year, month]

    df = buildDataFrame(fromWhen, [year, month], station, False)

    df = df[-rowsNeeded:]
    if save:
        saveDataFrame(df, saveFile)

    return df
    
            
# Fetch the latest data for station 47267 with length=480, save it as latestSequence.csv
# via saveDataFrame(), and print the result.
df = getLastestData("47267", 480, True, "latestSequence.csv")
print(df)
        

Month                  0
Time                   0
Temp (°C)              0
Dew Point Temp (°C)    0
Rel Hum (%)            0
Wind Dir (10s deg)     0
Wind Spd (km/h)        0
Stn Press (kPa)        0
dtype: int64
                  Temp (°C)  Dew Point Temp (°C)  Rel Hum (%)  \
Date/Time                                                       
2019-07-19 01:00       21.1                 21.1        100.0   
2019-07-19 02:00       21.3                 21.3        100.0   
2019-07-19 03:00       21.8                 21.8        100.0   
2019-07-19 04:00       22.1                 22.1        100.0   
2019-07-19 05:00       22.2                 22.2        100.0   
2019-07-19 06:00       23.1                 23.1        100.0   
2019-07-19 07:00       24.3                 23.5         96.0   
2019-07-19 08:00       25.5                 23.4         88.0   
2019-07-19 09:00       27.2                 23.6         81.0   
2019-07-19 10:00       25.2                 24.1         94.0   
2019-07