In [130]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
import requests
import pathlib
import json
from datetime import datetime

In [33]:
# Load API keys into the process environment.
# NOTE(review): load_dotenv() presumably reads a local .env file (python-dotenv) so that
# os.getenv("VISUAL_CROSSING_API_KEY"), used by the Visual Crossing download below, resolves.
from dotenv import load_dotenv
import os
load_dotenv() 
# Disabled debug check that the key was picked up (prints its type, never its value):
# print(type(os.getenv("VISUAL_CROSSING_API_KEY")))

<class 'str'>


In [191]:
# Define some constants
# latitude, longitude and cities are parallel lists: index i in each describes the same
# location (the loops below index all three with the same i).
latitude = [40.79736, 41.78701, 30.1444, 25.7738]
longitude = [-73.97785, -87.77166, -97.66876, -80.1936]
# Short labels used in the saved file names (presumably US state abbreviations — confirm).
cities = ["ny", "il", "tx", "fl"]
# Date range passed to the download functions and embedded in the saved file names.
start_date = "2016-01-01"
end_date = "2024-03-12"

## API Calls to collect historical weather data

In [55]:
def getDataFromOpenMeteo(latitude, longitude, startDate, endDate, fileName):
  """Download daily Open-Meteo archive data for one location and save it as CSV.

  The request goes through a cached session wrapped with retries. Metadata of
  the first returned location is printed, the requested daily variables are
  assembled into a DataFrame, and that frame is written to
  ``openMeteo_<fileName>_<startDate>_to_<endDate>.csv`` in the working directory.

  Args:
    latitude: Latitude sent to the API.
    longitude: Longitude sent to the API.
    startDate: First requested day, as a string (also used in the file name).
    endDate: Last requested day, as a string (also used in the file name).
    fileName: Label embedded in the output CSV name.

  Returns:
    The DataFrame that was written: a "date" column followed by one column per
    requested daily variable.
  """
  # Data Source 1
  # Daily variables to request. The response exposes them by position in the
  # same order as requested, so this list drives both the query and the parsing.
  daily_variables = [
    "temperature_2m_max",
    "temperature_2m_min",
    "sunshine_duration",
    "precipitation_hours",
    "wind_speed_10m_max",
  ]

  # Open-Meteo client backed by an on-disk cache, with retry on error.
  session = retry(
    requests_cache.CachedSession('.cache', expire_after = -1),
    retries = 5,
    backoff_factor = 0.2,
  )
  client = openmeteo_requests.Client(session = session)

  responses = client.weather_api(
    "https://archive-api.open-meteo.com/v1/archive",
    params = {
      "latitude": latitude,
      "longitude": longitude,
      "start_date": startDate,
      "end_date": endDate,
      "daily": daily_variables,
    },
  )

  # Only the first location in the response is processed.
  response = responses[0]
  print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
  print(f"Elevation {response.Elevation()} m asl")
  print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
  print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

  daily = response.Daily()

  # One timestamp per reported interval, from Time() up to (not including) TimeEnd().
  columns = {"date": pd.date_range(
    start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
    end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = daily.Interval()),
    inclusive = "left"
  )}
  # Pull each variable out by its position in the request.
  for position, variable in enumerate(daily_variables):
    columns[variable] = daily.Variables(position).ValuesAsNumpy()

  frame = pd.DataFrame(data = columns)
  csv_name = "openMeteo_" + '_'.join([fileName, startDate, 'to', endDate]) +  ".csv"
  frame.to_csv(csv_name, index=False)
  return frame


In [101]:
# WARNING!!!
# This has already been run for 2016-01-01 to 2024-03-12
# Don't re-run and make repeated API calls unless needed
# Get data from OpenMeteo

# daily_data = []
# for i in range(len(latitude)):
#   daily_data.append(getDataFromOpenMeteo(latitude[i], longitude[i], start_date, end_date, cities[i]))

# print(len(daily_data))

In [84]:
def getDataFromVisualCrossing(latitude, longitude, startDate, endDate, fileName):
  """Download daily Visual Crossing timeline data for one location and save the raw JSON.

  The API key is read from the VISUAL_CROSSING_API_KEY environment variable and
  is never printed or embedded in source. The raw response body is written to
  ``visualCrossing_<fileName>_<startDate>_to_<endDate>.json`` in the working
  directory, but only after the HTTP status has been checked, so an error page
  is never stored as if it were data.

  Args:
    latitude: Latitude of the location.
    longitude: Longitude of the location.
    startDate: First requested day, as a string (also used in the file name).
    endDate: Last requested day, as a string (also used in the file name).
    fileName: Label embedded in the output JSON file name.

  Returns:
    The decoded JSON payload of the response.

  Raises:
    RuntimeError: If VISUAL_CROSSING_API_KEY is not set.
    requests.HTTPError: If the API answers with a non-success status code.
  """
  api_key = os.getenv("VISUAL_CROSSING_API_KEY")
  if not api_key:
    raise RuntimeError(
      "VISUAL_CROSSING_API_KEY is not set; add it to the environment or the .env file."
    )

  # The location is part of the path as "<lat>%2C<lon>" (URL-encoded comma).
  url = (
    "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
    f"{latitude}%2C{longitude}/{startDate}/{endDate}"
  )
  query = {
    "unitGroup": "us",
    "include": "days",
    "key": api_key,
    "contentType": "json",
  }
  # Log the request target only; the query string carries the secret key.
  print(f"GET {url}")

  response = requests.get(url, params=query, timeout=120)
  # Fail loudly on quota/auth/server errors instead of saving the error body.
  response.raise_for_status()

  out_path = pathlib.Path("visualCrossing_" + '_'.join([fileName, startDate, 'to', endDate]) + '.json')
  out_path.write_bytes(response.content)
  # Report size rather than dumping the whole multi-year payload into the output.
  print(f"Saved {len(response.content)} bytes to {out_path}")

  return response.json()

In [102]:
# WARNING!!!
# Will incur an API cost, don't re-run
# Historical data is saved to a JSON file

# visual_crossing_data = []
# for i in range(len(latitude)):
  # daily_data.append(
  # visual_crossing_data.append(getDataFromVisualCrossing(latitude[i], longitude[i], start_date, end_date, cities[i]))

## Reading data from the CSV and JSON files created by the API calls

In [110]:
def readStoredJSONData(fileName):
  """Load and return the parsed contents of a JSON file.

  Args:
    fileName: Path of the JSON file to read.

  Returns:
    The decoded JSON value (for the saved API payloads in this notebook, a dict).
  """
  # The files are written from raw response bytes, so decode explicitly as UTF-8
  # instead of relying on the platform's default text encoding.
  with open(fileName, 'r', encoding='utf-8') as file:
    return json.load(file)

def readStoredCSVData(fileName):
  """Read the CSV file at ``fileName`` and return it as a pandas DataFrame."""
  return pd.read_csv(fileName)

In [151]:
# Load the saved Visual Crossing payload of every city, in the order of `cities`.
vc_data = []
for i in range(len(latitude)):
  # Same naming scheme the download function used when writing the file.
  fileName = f"visualCrossing_{cities[i]}_{start_date}_to_{end_date}.json"
  vc_data.append(readStoredJSONData(fileName))

In [143]:
# Inspect the container type of the 'days' entry in the first city's payload.
type(vc_data[0]['days'])

list

In [155]:
# Exploratory check of the selected Visual Crossing columns and their dtypes.
# The frames built here are not kept; the next cell builds the ones used later.
for cityData in vc_data:
  city_df = (
    pd.DataFrame(cityData['days'])
    .loc[:, ['datetime', 'tempmax', 'tempmin', 'humidity', 'windspeed']]
    # assign() returns a new frame, so nothing is written into a column subset
    # of another frame (which can raise SettingWithCopyWarning).
    .assign(datetime=lambda frame: frame['datetime'].astype(str))
  )
  # info() prints its report itself and returns None, so it is not wrapped in print().
  city_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   datetime   2994 non-null   object 
 1   tempmax    2994 non-null   float64
 2   tempmin    2994 non-null   float64
 3   humidity   2994 non-null   float64
 4   windspeed  2994 non-null   float64
dtypes: float64(4), object(1)
memory usage: 117.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   datetime   2994 non-null   object 
 1   tempmax    2994 non-null   float64
 2   tempmin    2994 non-null   float64
 3   humidity   2994 non-null   float64
 4   windspeed  2994 non-null   float64
dtypes: float64(4), object(1)
memory usage: 117.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 5 columns):
 #  

In [156]:
# Build one DataFrame per city holding only the Visual Crossing columns of interest.
vc_dfs = []
for cityData in vc_data:
  city_df = pd.DataFrame(cityData['days'])
  # .copy() makes each stored frame independent of the full per-day frame, so
  # later in-place changes (e.g. renaming columns) act on a real, owned object.
  city_df = city_df[['datetime', 'tempmax', 'tempmin', 'humidity', 'windspeed']].copy()
  # info() prints its report itself and returns None, so it is not wrapped in print().
  city_df.info()
  vc_dfs.append(city_df)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   datetime   2994 non-null   object 
 1   tempmax    2994 non-null   float64
 2   tempmin    2994 non-null   float64
 3   humidity   2994 non-null   float64
 4   windspeed  2994 non-null   float64
dtypes: float64(4), object(1)
memory usage: 117.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   datetime   2994 non-null   object 
 1   tempmax    2994 non-null   float64
 2   tempmin    2994 non-null   float64
 3   humidity   2994 non-null   float64
 4   windspeed  2994 non-null   float64
dtypes: float64(4), object(1)
memory usage: 117.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 5 columns):
 #  

In [158]:
# Read all open meteo files
om_dfs = []
for i in range(len(latitude)):
  fileName = "openMeteo_" + '_'.join([cities[i], start_date, 'to', end_date]) + '.csv'
  om_df = readStoredCSVData(fileName)
  # Keep only the leading "YYYY-MM-DD" part of the stored timestamp string so the
  # value matches the date strings of the Visual Crossing frames.
  om_df['date'] = om_df['date'].str[:10]
  # 'date' deliberately stays a regular column: the later cells rename all columns
  # positionally (including 'date') and merge on it. The previous
  # `om_df.set_index('date')` call discarded its result and had no effect.
  om_dfs.append(om_df)

In [159]:
# Summarise the first Open-Meteo frame. The list is named `om_dfs`; `om_data` is
# never defined in this notebook and would raise NameError on a fresh kernel.
om_dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 2994 non-null   object 
 1   temperature_2m_max   2992 non-null   float64
 2   temperature_2m_min   2992 non-null   float64
 3   sunshine_duration    2992 non-null   float64
 4   precipitation_hours  2994 non-null   float64
 5   wind_speed_10m_max   2992 non-null   float64
dtypes: float64(5), object(1)
memory usage: 140.5+ KB


In [172]:
# Give every frame short, source-suffixed column names (_vc = Visual Crossing,
# _om = Open-Meteo) so the two sources stay distinguishable after merging.
# The rename is positional, so it relies on the column order produced above.
# 'date' stays a regular column; the previous `set_index('date')` calls discarded
# their result and had no effect.
for vc_df in vc_dfs:
  vc_df.columns = ['date', 'tmax_vc', 'tmin_vc', 'humi_vc', 'wind_vc']

for om_df in om_dfs:
  om_df.columns = ['date', 'tmax_om', 'tmin_om', 'sund_om', 'prec_om', 'wind_om']

print(vc_dfs[0].head())
print(type(vc_dfs[0]['date']))
print(om_dfs[0].head())

         date  tmax_vc  tmin_vc  humi_vc  wind_vc
0  2016-01-01     42.2     35.1     52.5     13.9
1  2016-01-02     39.8     33.3     47.4     13.7
2  2016-01-03     44.3     34.9     48.6     16.3
3  2016-01-04     36.1     15.2     46.4     19.2
4  2016-01-05     28.6     11.3     38.8     14.8
<class 'pandas.core.series.Series'>
         date  tmax_om  tmin_om    sund_om  prec_om    wind_om
0  2016-01-01     6.73     1.88  27001.312      0.0  21.077686
1  2016-01-02     4.38    -1.42  29190.664      0.0  15.856356
2  2016-01-03     6.58    -0.97  29205.188      0.0  19.083395
3  2016-01-04     2.08    -5.47  29141.408      0.0  24.280659
4  2016-01-05    -1.22   -10.02  29472.777      0.0  26.282465


In [183]:
# Positional rename to short, source-suffixed column names. Re-assigning the same
# names is harmless if the previous cell already did it.
for vc_df in vc_dfs:
  vc_df.columns = ['date', 'tmax_vc', 'tmin_vc', 'humi_vc', 'wind_vc']

for om_df in om_dfs:
  om_df.columns = ['date', 'tmax_om', 'tmin_om', 'sund_om', 'prec_om', 'wind_om']

# Join the two sources city by city on the shared 'date' column.
merged_dfs = []

for i in range(len(vc_dfs)):
  merged_df = pd.merge(
    vc_dfs[i],
    om_dfs[i],
    on='date',                # explicit key instead of "whatever columns happen to match"
    how='inner',              # same as the default: keep dates present in both sources
    validate='one_to_one',    # fail instead of silently multiplying rows on duplicate dates
  )
  merged_dfs.append(merged_df)

# info() prints its report itself and returns None, so it is not wrapped in print().
merged_dfs[0].info()
print(merged_dfs[0].head())

# merged_dfs is a list with one DataFrame per city; each frame holds that city's
# data points from both sources.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   date     2994 non-null   object 
 1   tmax_vc  2994 non-null   float64
 2   tmin_vc  2994 non-null   float64
 3   humi_vc  2994 non-null   float64
 4   wind_vc  2994 non-null   float64
 5   tmax_om  2992 non-null   float64
 6   tmin_om  2992 non-null   float64
 7   sund_om  2992 non-null   float64
 8   prec_om  2994 non-null   float64
 9   wind_om  2992 non-null   float64
dtypes: float64(9), object(1)
memory usage: 234.0+ KB
None
         date  tmax_vc  tmin_vc  humi_vc  wind_vc  tmax_om  tmin_om  \
0  2016-01-01     42.2     35.1     52.5     13.9     6.73     1.88   
1  2016-01-02     39.8     33.3     47.4     13.7     4.38    -1.42   
2  2016-01-03     44.3     34.9     48.6     16.3     6.58    -0.97   
3  2016-01-04     36.1     15.2     46.4     19.2     2.08    -5.47   
4  2016-01-05     28.6 

## Serialising and De-serialising the final DFs for the historical data

In [186]:
# Persist each city's merged DataFrame as ./merged_df_<city>.pkl for later computations.
for i in range(len(cities)):
  merged_dfs[i].to_pickle(f"./merged_df_{cities[i]}.pkl")

In [190]:
# Unpickle the DataFrames, one per city, in the order of `cities`.
# NOTE: unpickling executes code embedded in the file; only load pickles that this
# notebook wrote itself.
city_history_dfs = [pd.read_pickle("./merged_df_" + city + ".pkl") for city in cities]

print("Loaded DFs for " + str(len(city_history_dfs)) + " cities.\n")
# info() prints its report itself and returns None, so it is not wrapped in print().
city_history_dfs[0].info()

Loaded DFs for 4 cities.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   date     2994 non-null   object 
 1   tmax_vc  2994 non-null   float64
 2   tmin_vc  2994 non-null   float64
 3   humi_vc  2994 non-null   float64
 4   wind_vc  2994 non-null   float64
 5   tmax_om  2992 non-null   float64
 6   tmin_om  2992 non-null   float64
 7   sund_om  2992 non-null   float64
 8   prec_om  2994 non-null   float64
 9   wind_om  2992 non-null   float64
dtypes: float64(9), object(1)
memory usage: 234.0+ KB
None
