## Fetch Weather Data

In [12]:
import os

import numpy as np
import pandas as pd
import csv
from pathlib import Path
import requests

import urllib.request
from pathlib import Path
import os

In [13]:
# NOAA Climate Data Online API token, sent in the `Token` header of every request.
# Set the NOAA_CDO_TOKEN environment variable to use your own token (preferred).
# The literal below is s anadkat's personal token, kept only as a fallback so the
# notebook keeps running unchanged; avoid committing real credentials.
weather_token = os.environ.get('NOAA_CDO_TOKEN') or 'LxKVcvGLVexqvtgBrqrlSJvUaURFdTsx'

## Determine stations for Wake County

In [14]:
def get_wakecounty_stations():
    """Return the GHCND station records NOAA lists for Wake County.

    Queries the NOAA CDO v2 ``stations`` endpoint for location ``FIPS:37183``,
    authenticating with the module-level ``weather_token``.

    Returns:
        list[dict]: The records found under the response's ``results`` key,
        or an empty list when the response carries no ``results``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status
            (e.g. an invalid token or rate limiting).
        requests.Timeout: If the server does not answer within the timeout.
    """
    endpoint = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/stations'

    payload = {
        'datasetid': 'GHCND',
        'locationid': 'FIPS:37183',
        # A single page of up to 1000 stations is requested; no paging is done.
        'limit': 1000,
    }
    headers = {'Token': weather_token}

    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url=endpoint, params=payload, headers=headers, timeout=30)
    # Surface HTTP errors here instead of a confusing KeyError further down.
    response.raise_for_status()

    # Treat a body without 'results' as "no stations" rather than raising KeyError.
    return response.json().get('results', [])

In [15]:
# Fetch the station records once; the next cell builds a DataFrame from them.
stations = get_wakecounty_stations()

In [16]:
# Tabulate the station records, then keep only stations whose reported
# `datacoverage` is at least 1.0 and display them.
WakeCounty_df = pd.DataFrame(stations)
coverage_df = WakeCounty_df.loc[WakeCounty_df['datacoverage'].ge(1.0)]
coverage_df

Unnamed: 0,elevation,mindate,maxdate,latitude,name,datacoverage,id,elevationUnit,longitude
10,103.9,2007-10-08,2021-11-05,35.862421,"RALEIGH 5.9 ENE, NC US",1.0,GHCND:US1NCWK0013,METERS,-78.566454
34,107.9,2007-12-31,2008-05-07,35.890322,"RALEIGH 5.3 NNW, NC US",1.0,GHCND:US1NCWK0055,METERS,-78.70159
103,121.6,2015-12-16,2021-11-05,35.871259,"RALEIGH 3.6 NNE, NC US",1.0,GHCND:US1NCWK0203,METERS,-78.638678
152,119.2,2020-03-31,2021-11-05,35.642376,"HOLLY SPRINGS 1.7 WSW, NC US",1.0,GHCND:US1NCWK0320,METERS,-78.861934
155,133.2,2020-05-31,2021-11-05,35.8703,"RALEIGH 5.8 NW, NC US",1.0,GHCND:US1NCWK0328,METERS,-78.74275
164,128.0,1937-12-01,1940-11-30,35.866667,"CRABTREE CREEK, NC US",1.0,GHCND:USC00312118,METERS,-78.75
176,126.8,1944-05-18,2021-11-05,35.8923,"RALEIGH AIRPORT, NC US",1.0,GHCND:USW00013722,METERS,-78.7819


## Fetch weather data from 2018-2021

In [26]:
def _fetch_daily_tavg(startdate, enddate):
    """Fetch TAVG records for station GHCND:USW00013722 between two dates.

    Shared implementation behind the per-year ``get_weather_YYYY`` functions,
    which differ only in the date window they request.

    Args:
        startdate (str): First day to request, formatted ``YYYY-MM-DD``.
        enddate (str): Last day to request, formatted ``YYYY-MM-DD``.

    Returns:
        list[dict]: The records found under the response's ``results`` key,
        or an empty list when the response carries no ``results``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    endpoint = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data'

    payload = {
        # Previously embedded in the URL's query string; sent as a normal param now.
        'datasetid': 'GHCND',
        'datatypeid': 'TAVG',
        'locationid': 'FIPS:37183',
        'startdate': startdate,
        'enddate': enddate,
        # One request returns at most this many rows; a one-year daily window fits.
        'limit': '1000',
        'stationid': 'GHCND:USW00013722',
        'units': 'standard',
    }
    headers = {'Token': weather_token}

    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url=endpoint, params=payload, headers=headers, timeout=30)
    # Surface HTTP errors here instead of a confusing KeyError further down.
    response.raise_for_status()

    # Treat a body without 'results' as "no data" rather than raising KeyError.
    return response.json().get('results', [])


def get_weather_2018():
    """Return the TAVG records for 2018-01-01 through 2018-12-31."""
    return _fetch_daily_tavg('2018-01-01', '2018-12-31')


def get_weather_2019():
    """Return the TAVG records for 2019-01-01 through 2019-12-31."""
    return _fetch_daily_tavg('2019-01-01', '2019-12-31')


def get_weather_2020():
    """Return the TAVG records for 2020-01-01 through 2020-12-31."""
    return _fetch_daily_tavg('2020-01-01', '2020-12-31')


def get_weather_2021():
    """Return the TAVG records for 2021-01-01 through 2021-10-21."""
    return _fetch_daily_tavg('2021-01-01', '2021-10-21')
    

## Sourcing weather data 

In [27]:
def getWeatherData(forceFetch=False):
    """Load the weather records, using a local CSV cache when available.

    Args:
        forceFetch (bool): When True, ignore any existing ``weatherdata.csv``
            and fetch from the API again, overwriting the cache.

    Returns:
        pandas.DataFrame: One row per record with a unique 0..n-1 index,
        whether the data came from the cache or from a fresh fetch.
    """
    # Relative path: the cache lives in the current working directory.
    cache_file = Path('weatherdata.csv')

    if cache_file.exists() and not forceFetch:
        print('Using pre-fetched weather data...')
        df = pd.read_csv(cache_file)
        print('weather df shape:', df.shape)
        return df

    # No usable cache (or a refresh was requested): fetch each year and combine.
    print('Fetching weather data...')
    yearly_fetchers = (get_weather_2018, get_weather_2019, get_weather_2020, get_weather_2021)
    yearly_frames = [pd.DataFrame(fetch()) for fetch in yearly_fetchers]

    # ignore_index gives the fresh frame the same unique RangeIndex the cached
    # CSV yields; without it each year's 0..k index would repeat.
    weather_df = pd.concat(yearly_frames, ignore_index=True)
    print('weather df shape:', weather_df.shape)
    weather_df.to_csv(cache_file, index=False)
    print('Done')
    return weather_df

## Preprocessing

In [28]:
def preprocess_weatherdata(weather_df):
    """Reshape raw weather records into a date-indexed table.

    The input frame is left untouched. Its ``date`` column is parsed to
    datetimes, the ``station`` and ``attributes`` columns are discarded, and
    the rows are pivoted so each ``datatype`` becomes a column of ``value``s.

    Args:
        weather_df (pandas.DataFrame): Frame with ``date``, ``datatype``,
            ``station``, ``attributes`` and ``value`` columns.

    Returns:
        pandas.DataFrame: Indexed by ``date`` with one column per datatype.
    """
    # assign() works on a copy, so the caller's frame keeps its original dates
    with_dates = weather_df.assign(date=pd.to_datetime(weather_df['date']))

    # station / attributes carry nothing the pivot needs
    trimmed = with_dates.drop(columns=['station', 'attributes'])

    return trimmed.pivot(index='date', columns='datatype', values='value')

In [30]:
# forceFetch=False reuses weatherdata.csv when it exists instead of calling the API.
weather_df_raw = getWeatherData(forceFetch=False)
weather_df_raw.head()

Using pre-fetched weather data...
weather df shape: (1390, 5)


Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,TAVG,GHCND:USW00013722,"H,,S,",22.0
1,2018-01-02T00:00:00,TAVG,GHCND:USW00013722,"H,,S,",20.0
2,2018-01-03T00:00:00,TAVG,GHCND:USW00013722,"H,,S,",21.0
3,2018-01-04T00:00:00,TAVG,GHCND:USW00013722,"H,,S,",26.0
4,2018-01-05T00:00:00,TAVG,GHCND:USW00013722,"H,,S,",21.0


## Validating data

In [31]:
# Pivot the raw records into a date-indexed frame, then show both ends of it.
weather_df = preprocess_weatherdata(weather_df_raw)
display(weather_df.head())
display(weather_df.tail())

datatype,TAVG
date,Unnamed: 1_level_1
2018-01-01,22.0
2018-01-02,20.0
2018-01-03,21.0
2018-01-04,26.0
2018-01-05,21.0


datatype,TAVG
date,Unnamed: 1_level_1
2021-10-17,58.0
2021-10-18,56.0
2021-10-19,57.0
2021-10-20,60.0
2021-10-21,62.0


In [32]:
# Count missing values per column.
weather_df.isna().sum()

datatype
TAVG    0
dtype: int64

In [33]:
# Summarise the index range, column dtypes and memory footprint.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1390 entries, 2018-01-01 to 2021-10-21
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TAVG    1390 non-null   float64
dtypes: float64(1)
memory usage: 21.7 KB
