In [1]:
import pandas as pd
from yyyy_mm_dd import *
import numpy as np

In [2]:
# Load the forecast records from forecasts.csv, parsing the 'timestamp' column as datetimes.
df = pd.read_csv("forecasts.csv", parse_dates=['timestamp'])
# Display the loaded frame as the cell output.
df

Unnamed: 0,timestamp,type,forecast_delta,city,summary,precipitation,temperature,max,min
0,2021-04-26 07:40:55.852295,daily,0,Amsterdam,Partly Cloudy,1,,11.0,2.0
1,2021-04-26 07:40:55.852295,daily,1,Amsterdam,Partly Cloudy,8,,15.0,4.0
2,2021-04-26 07:40:55.852295,daily,2,Amsterdam,Mostly Sunny,5,,16.0,6.0
3,2021-04-26 07:40:55.852295,daily,3,Amsterdam,Light Rain,82,,11.0,5.0
4,2021-04-26 07:40:55.852295,daily,4,Amsterdam,Showers,47,,12.0,4.0
...,...,...,...,...,...,...,...,...,...
123727,2021-05-09 20:00:41.644181,hourly,43,Cairo,Sunny,0,33.0,,
123728,2021-05-09 20:00:41.644181,hourly,44,Cairo,Sunny,0,32.0,,
123729,2021-05-09 20:00:41.644181,hourly,45,Cairo,Clear,0,29.0,,
123730,2021-05-09 20:00:41.644181,hourly,46,Cairo,Clear,0,28.0,,


In [53]:
# Keep only the hourly forecasts. reset_index() returns a new frame, stores the
# original row labels in an 'index' column and gives the result a fresh 0..n-1 index.
hourly = df[df.type == 'hourly'].reset_index()

# 'min' / 'max' appear to be populated only for daily rows (they are empty for the
# hourly rows displayed above), so they are dropped here.
hourly = hourly.drop(columns=['min', 'max'])

# Normalise the scrape time with the yyyy_mm_dd helper; judging by the displayed
# output this truncates to the start of the hour — TODO confirm against the helper.
hourly['timestamp'] = [ start_of_yyyy_mm_dd_hh(timestamp) for timestamp in hourly['timestamp'] ]

# Moment each forecast refers to: the scrape time moved by forecast_delta
# (the displayed output shows one hour per delta step for hourly rows).
hourly['forecast_for'] = [ move_yyyy_mm_dd_hh(timestamp, delta)
                           for timestamp, delta in zip(hourly['timestamp'], hourly['forecast_delta']) ]

# Collapse the " / Wind" variants into their base weather type. The vectorised
# .str accessor keeps the result aligned on hourly's own index (no reliance on a
# freshly built RangeIndex) and leaves missing summaries as NaN instead of raising.
hourly['weather_type'] = hourly['summary'].str.replace(" / Wind", "", regex=False).astype("category")
hourly

Unnamed: 0,index,timestamp,type,forecast_delta,city,summary,precipitation,temperature,forecast_for,weather_type
0,15,2021-04-26 07:00:00,hourly,1,Amsterdam,Partly Cloudy,1,8.0,2021-04-26 08:00:00,Partly Cloudy
1,16,2021-04-26 07:00:00,hourly,2,Amsterdam,Partly Cloudy,0,9.0,2021-04-26 09:00:00,Partly Cloudy
2,17,2021-04-26 07:00:00,hourly,3,Amsterdam,Mostly Sunny,0,9.0,2021-04-26 10:00:00,Mostly Sunny
3,18,2021-04-26 07:00:00,hourly,4,Amsterdam,Mostly Sunny,0,9.0,2021-04-26 11:00:00,Mostly Sunny
4,19,2021-04-26 07:00:00,hourly,5,Amsterdam,Sunny,0,10.0,2021-04-26 12:00:00,Sunny
...,...,...,...,...,...,...,...,...,...,...
94267,123727,2021-05-09 20:00:00,hourly,43,Cairo,Sunny,0,33.0,2021-05-11 15:00:00,Sunny
94268,123728,2021-05-09 20:00:00,hourly,44,Cairo,Sunny,0,32.0,2021-05-11 16:00:00,Sunny
94269,123729,2021-05-09 20:00:00,hourly,45,Cairo,Clear,0,29.0,2021-05-11 17:00:00,Clear
94270,123730,2021-05-09 20:00:00,hourly,46,Cairo,Clear,0,28.0,2021-05-11 18:00:00,Clear


In [54]:
to_join = hourly[hourly.forecast_delta == 0].drop(columns=['forecast_for']).rename(columns={'timestamp': 'forecast_for'}).set_index(['city', 'forecast_for'])[['weather_type']].copy()
hourly = hourly.join(
    to_join,
    on=['city', 'forecast_for'],
    how='inner',
    rsuffix="_actual"
)
hourly

Unnamed: 0,index,timestamp,type,forecast_delta,city,summary,precipitation,temperature,forecast_for,weather_type,weather_type_actual
1,16,2021-04-26 07:00:00,hourly,2,Amsterdam,Partly Cloudy,0,9.0,2021-04-26 09:00:00,Partly Cloudy,Mostly Sunny
288,393,2021-04-26 08:00:00,hourly,1,Amsterdam,Partly Cloudy,0,9.0,2021-04-26 09:00:00,Partly Cloudy,Mostly Sunny
576,771,2021-04-26 09:00:00,hourly,0,Amsterdam,Mostly Sunny,0,10.0,2021-04-26 09:00:00,Mostly Sunny,Mostly Sunny
2,17,2021-04-26 07:00:00,hourly,3,Amsterdam,Mostly Sunny,0,9.0,2021-04-26 10:00:00,Mostly Sunny,Mostly Sunny
289,394,2021-04-26 08:00:00,hourly,2,Amsterdam,Partly Cloudy,0,9.0,2021-04-26 10:00:00,Partly Cloudy,Mostly Sunny
...,...,...,...,...,...,...,...,...,...,...,...
93076,122176,2021-05-09 16:00:00,hourly,4,Cairo,Partly Cloudy,1,28.0,2021-05-09 20:00:00,Partly Cloudy,Partly Cloudy
93363,122553,2021-05-09 17:00:00,hourly,3,Cairo,Partly Cloudy,1,29.0,2021-05-09 20:00:00,Partly Cloudy,Partly Cloudy
93650,122930,2021-05-09 18:00:00,hourly,2,Cairo,Partly Cloudy,1,29.0,2021-05-09 20:00:00,Partly Cloudy,Partly Cloudy
93937,123307,2021-05-09 19:00:00,hourly,1,Cairo,Clear,1,29.0,2021-05-09 20:00:00,Clear,Partly Cloudy


In [68]:
def adjust_forecast(df, row):
    """Estimate the actual weather distribution behind a single forecast.

    The history in ``df`` is restricted to rows with the same ``city`` and
    ``forecast_delta`` as ``row``. For every weather type that was actually
    observed at least once in that subset, the result holds the empirical
    conditional probability

        P(actual == weather | forecast == row.weather_type)
            = count(forecast == prediction and actual == weather)
              / count(forecast == prediction)

    Parameters
    ----------
    df : pandas.DataFrame
        Forecast history with columns ``city``, ``forecast_delta``,
        ``weather_type`` (categorical dtype) and ``weather_type_actual``.
    row : object
        A single forecast exposing ``city``, ``forecast_delta`` and
        ``weather_type`` attributes (e.g. one row of ``df``).

    Returns
    -------
    dict
        Maps weather type -> probability (float), in category order. Weather
        types never observed in the subset are omitted; observed types that
        never followed this prediction are present with ``0.0``. The dict is
        empty when the subset contains no row with the same prediction (which
        includes an empty subset), since there is no evidence to estimate from.
    """
    # Only forecasts for the same city at the same lead time are comparable.
    history = df[(df.city == row.city) & (df.forecast_delta == row.forecast_delta)]
    prediction = row.weather_type

    # Rows where the same weather type was predicted; this is the denominator
    # of the conditional probability and does not depend on the loop below.
    same_prediction = history[history.weather_type == prediction]
    n_prediction = len(same_prediction)
    if n_prediction == 0:
        # Prediction never seen for this city / lead time: dividing would
        # raise ZeroDivisionError, so report "no estimate" instead.
        return {}

    probabilities = {}
    for weather in history.weather_type.cat.categories:
        # Skip weather types that never actually occurred in this subset.
        if not (history.weather_type_actual == weather).any():
            continue

        n_joint = int((same_prediction.weather_type_actual == weather).sum())
        probabilities[weather] = n_joint / n_prediction

    return probabilities
    
    
# Worked example: pick the first Amsterdam row with forecast_delta == 1 and estimate
# the distribution of the weather that actually followed that kind of forecast.
forecast = hourly.loc[(hourly.city == 'Amsterdam') & (hourly.forecast_delta == 1)].iloc[0]
adjust_forecast(hourly, forecast)

{'Clear': 0.010309278350515464,
 'Cloudy': 0.010309278350515464,
 'Few Showers': 0.010309278350515464,
 'Light Rain': 0.020618556701030927,
 'Mostly Clear': 0.030927835051546393,
 'Mostly Cloudy': 0.09278350515463918,
 'Mostly Sunny': 0.030927835051546393,
 'Partly Cloudy': 0.7731958762886598,
 'Rain': 0.0,
 'Scattered T-Storms': 0.0,
 'Showers': 0.020618556701030927,
 'Sunny': 0.0}