# Create weather data set for UI

Inputs:
* destinations
* university of delaware
* visualcrossing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [None]:
# Base folder that every input and output path below is built from.
data_dir = '../../../data/'

# Inputs: the destinations, the Delaware monthly temperature and rain tables,
# and the Visual Crossing monthly weather used as the comparison data set.
df_places = pd.read_csv(data_dir + 'wikivoyage/enriched/wikivoyage_destinations.csv')
df_temp = pd.read_csv(data_dir + 'delaware/clean/delaware_monthly_temp.csv')
df_rain = pd.read_csv(data_dir + 'delaware/clean/delaware_monthly_rain.csv')
df_vc = pd.read_csv(data_dir + 'visualcrossing/visualcrossing_monthly_weather.csv')

# Output: target CSV for the processed monthly weather data.
path_out = data_dir + 'delaware/processed/delaware_monthly_weather.csv'

## Query a location and compare

Let's start with comparing temperature and rainfall for one location.

In [None]:
# Pick the destination to inspect and pull out its coordinates.
place_name = 'Amsterdam'

# The unpacking below relies on exactly one destination carrying this name.
lat, lng = (
    df_places
    .loc[df_places['name'] == place_name, ['lat', 'lng']]
    .to_numpy()
    .ravel()
    .tolist()
)
lat, lng

For the uDelaware data we have 3 options:
1. Find the closest lat/lng point and get those weather readings; or
2. Simply take the mean of the closest points that are within a 0.5 lat/lng degree distance; or
3. Take a weighted mean based on the distance to the points that are within a 0.5 lat/lng degree distance. As described [here](https://math.stackexchange.com/questions/1968091/weight-of-a-point-based-on-its-distance-from-other-weighted-points).

Whereas option 1 is the easiest to set up, its downside is that many places will end up with exactly the same weather data. I expect option 2 to be more accurate, and option 3 even more so, for example when there is a steep change in elevation between two of those 0.5 lat/lng degree points. Let's therefore compare options 2 and 3 based on their mean absolute error (MAE) against the Visual Crossing data.

In [None]:
from stairway.utils.utils import vectorized_haversine

In [None]:
def weighted_average(x, feature):
    """Return the inverse-distance weighted average of ``x[feature]``.

    ``x`` must carry a ``distance`` column. Each row is weighted by
    ``1 / distance``. A row at distance 0 cannot be weighted that way, so
    when one is present the value of the first such row is returned as-is.
    """
    distances = x['distance']
    at_place = distances == 0

    # Guard clause: an exact hit on a grid point wins outright.
    if at_place.any():
        return x[feature][at_place].values[0]

    weighted_total = sum(x[feature] / distances)
    weight_total = sum(1 / distances)
    return weighted_total / weight_total
    
def weighted_average_df(x, feature):
    """Return the weighted average of ``x[feature]`` as a one-element Series.

    The Series is labelled with ``feature``; this is the form passed to
    ``groupby(...).apply`` elsewhere in this notebook.
    """
    value = weighted_average(x, feature)
    return pd.Series({feature: value}, index=[feature])

def lookup_monthly_weather_data(df_place, df_weather, method):
    """Look up monthly Delaware temperature or rain data for a single place record.

    Weather points whose latitude and longitude are both within 0.5 degrees of
    the place are selected and aggregated per month.

    Parameters
    ----------
    df_place : pandas.DataFrame
        Exactly one row, with at least the columns ``id``, ``lat`` and ``lng``.
    df_weather : pandas.DataFrame
        Needs the columns ``lat``, ``lon`` and ``month`` plus the feature
        column: ``temp`` when present, otherwise ``precip``.
    method : str
        ``'mean'`` for the plain mean of the selected points, or
        ``'weighted_mean'`` for the distance-weighted mean computed by
        ``weighted_average_df``.

    Returns
    -------
    pandas.DataFrame
        One row per month found among the selected points, with the columns
        ``month``, the feature, ``stairway_id``, ``lat`` and ``lng``.

    Raises
    ------
    ValueError
        If ``df_place`` does not hold exactly one record, or if ``method`` is
        not one of the two supported values.
    """
    # Validate up front so bad input fails with a clear message before any work is done.
    if len(df_place) != 1:
        raise ValueError('df_place should just be a single record.')
    if method not in ('mean', 'weighted_mean'):
        raise ValueError('method should be "mean" or "weighted_mean".')

    lat, lng = df_place[['lat', 'lng']].values.flatten().tolist()
    stairway_id = df_place['id'].values[0]
    feature = 'temp' if 'temp' in df_weather.columns else 'precip'

    # select points within 0.5 degree distance
    is_nearby = ((df_weather['lat'] - lat).abs() <= 0.5) & ((df_weather['lon'] - lng).abs() <= 0.5)
    df_nearby = df_weather.loc[is_nearby]

    # aggregate on month
    if method == 'mean':
        df_monthly = df_nearby.groupby('month').agg({feature: 'mean'})
    else:
        df_monthly = (
            df_nearby
            # the place coordinates are added as columns so the distance helper gets column inputs
            .assign(lat_place=lat, lng_place=lng)
            .assign(distance=lambda x: vectorized_haversine(x['lat'], x['lon'], x['lat_place'], x['lng_place']))
            .groupby('month')
            .apply(weighted_average_df, feature)
        )

    return df_monthly.assign(stairway_id=stairway_id, lat=lat, lng=lng).reset_index()


### Option 2: mean

In [None]:
# Option 2 for the selected place: plain mean of the nearby Delaware points.
lookup_monthly_weather_data(df_places.loc[df_places['name'] == place_name], df_temp, method='mean')

### Option 3: weighted mean

In [None]:
# Option 3 for the selected place: distance-weighted mean of the nearby Delaware points.
lookup_monthly_weather_data(df_places.loc[df_places['name'] == place_name], df_temp, method='weighted_mean')

## Compare with Visual Crossing

Compare destinations and determine an error margin

First create the entire dataframe with weather data for all our destinations, for both calculation options.

In [None]:
# Monthly Delaware weather for every destination, plain-mean variant (option 2).
# Temperature and rainfall are looked up separately per place id and then
# joined on month and place.
# (Insert `.head(10)` right after `df_places` for a quick trial run.)
df_weather_mean = pd.merge(
    (
        df_places
        .groupby('id')
        .apply(lookup_monthly_weather_data, df_weather=df_temp, method='mean')
        .reset_index(drop=True)
    ),
    (
        df_places
        .groupby('id')
        .apply(lookup_monthly_weather_data, df_weather=df_rain, method='mean')
        .reset_index(drop=True)
    ),
    on=['month', 'stairway_id', 'lat', 'lng'],
)

In [None]:
# Monthly Delaware weather for every destination, weighted-mean variant (option 3).
# Temperature and rainfall are looked up separately per place id and then
# joined on month and place.
# (Insert `.head(10)` right after `df_places` for a quick trial run.)
df_weather_weightedmean = pd.merge(
    (
        df_places
        .groupby('id')
        .apply(lookup_monthly_weather_data, df_weather=df_temp, method='weighted_mean')
        .reset_index(drop=True)
    ),
    (
        df_places
        .groupby('id')
        .apply(lookup_monthly_weather_data, df_weather=df_rain, method='weighted_mean')
        .reset_index(drop=True)
    ),
    on=['month', 'stairway_id', 'lat', 'lng'],
)

Prep visual crossing data

In [None]:
# Lookup from three-letter month labels to zero-based month numbers; it is
# applied to the `period` column below.
# NOTE(review): the result is later joined with the Delaware data on `month`,
# so this presumes the Delaware `month` column is zero-based as well
# (0 = Jan) — confirm against the Delaware files.
months_map = {'Jan': 0, "Feb": 1, "Mar": 2, "Apr": 3, "May": 4, 'Jun': 5, 'Jul' : 6,
              'Aug': 7, 'Sep': 8, 'Oct': 9, 'Nov': 10, 'Dec': 11}

# only compare where we have all 12 months
# (counts the rows per stairway_id and keeps the ids with exactly 12 of them)
complete_ids = (
    df_vc
    .groupby('stairway_id', as_index=False)
    .agg(count = pd.NamedAgg(column='stairway_id', aggfunc='count'))
    .loc[lambda df: df['count'] == 12]
    ['stairway_id'].values
)

# subset and map the string months to an integer column
# Labels in `period` that are missing from months_map come out as NaN in `month`.
df_weather_vc = (
    df_vc
    .loc[lambda df: df['stairway_id'].isin(complete_ids)]
    [['stairway_id', 'period', 'temp', 'precip']]
    .assign(month = lambda df: df['period'].map(months_map))
    .assign(precip = lambda df: df['precip'] / 10)  # mm to cm
    .reset_index(drop=True)
)

Join and calculate differences

In [None]:
def _diff_against_vc(df_delaware):
    """Join Delaware weather with the Visual Crossing data and add difference columns.

    Rows are matched on ``stairway_id`` and ``month``. Overlapping column names
    get the suffix ``_dw`` (Delaware) or ``_vc`` (Visual Crossing), and
    ``diff_temp`` / ``diff_rain`` hold Delaware minus Visual Crossing.
    """
    joined = df_delaware.merge(df_weather_vc, on=['stairway_id', 'month'], suffixes=["_dw", "_vc"])
    joined['diff_temp'] = joined['temp_dw'] - joined['temp_vc']
    joined['diff_rain'] = joined['precip_dw'] - joined['precip_vc']
    return joined


df_diff_mean = _diff_against_vc(df_weather_mean)
df_diff_weightedmean = _diff_against_vc(df_weather_weightedmean)

# Both frames should report the same shape.
print(df_diff_mean.shape, df_diff_weightedmean.shape)
# df_diff_weightedmean.head()

In [None]:
# Overall MAE across all monthly records, reported for each option in turn.
for _title, _diffs in (
    ('Option 2: mean', df_diff_mean),
    ('Option 3: weighted mean', df_diff_weightedmean),
):
    if _diffs is df_diff_weightedmean:
        # blank line between the two reports, as in the original layout
        print()
    print(_title)
    print('MAE temp:', round(mean_absolute_error(_diffs['temp_dw'], _diffs['temp_vc']), 2))
    print('MAE rain:', round(mean_absolute_error(_diffs['precip_dw'], _diffs['precip_vc']), 2))

The overall MAE across all monthly records turns out to be not very different between the two options, for both metrics:

```
Option 2: mean
MAE temp: 1.49
MAE rain: 5.4

Option 3: weighted mean
MAE temp: 1.44
MAE rain: 5.38
```

We can also calculate a MAE per place. And then look into how those are distributed.

The distributions are very much alike, so not very interesting.

In [None]:
def mae_weather(x):
    """Return the temperature and precipitation MAE for the rows in ``x``.

    ``x`` needs the columns ``temp_dw``, ``temp_vc``, ``precip_dw`` and
    ``precip_vc``. The result is a Series with the entries ``mae_temp`` and
    ``mae_precip``.
    """
    return pd.Series(
        {
            'mae_temp': mean_absolute_error(x['temp_dw'], x['temp_vc']),
            'mae_precip': mean_absolute_error(x['precip_dw'], x['precip_vc']),
        },
        index=['mae_temp', 'mae_precip'],
    )

# MAE per place (one row per stairway_id) for both calculation options.
df_diff_mean_per_place, df_diff_weightedmean_per_place = (
    frame.groupby('stairway_id').apply(mae_weather)
    for frame in (df_diff_mean, df_diff_weightedmean)
)

Examine some of the top failures:

In [None]:
# The five places with the largest temperature MAE under the weighted mean.
df_diff_weightedmean_per_place.sort_values(by='mae_temp', ascending=False).head(5)

Visualize a specific place

In [None]:
# Place to inspect and the comparison frame to take its rows from.
ID = 287629
df_diff = df_diff_weightedmean

# Monthly rows of that place, plus a "name, country" label for the figure title.
df_plot = df_diff[df_diff['stairway_id'] == ID].set_index('month')
place_description = ", ".join(
    df_places.loc[df_places['id'] == ID, ['name', 'country']].to_numpy().ravel().tolist()
)

fig, axes = plt.subplots(1, 2, figsize=(16,6))

# One panel per weather type: Delaware (_dw) against Visual Crossing (_vc).
for weather_type, ax in zip(['temp', 'precip'], axes):
    col_dw, col_vc = f"{weather_type}_dw", f"{weather_type}_vc"

    df_plot[[col_dw, col_vc]].plot(ax=ax)
    MAE = mean_absolute_error(df_plot[col_dw], df_plot[col_vc])
    ax.set_title(f"{weather_type}, MAE: {round(MAE, 2)}", size=13)

fig.suptitle(f"{place_description}", size=15);
# fig.tight_layout()

## Write to file

Ultimately, we have to choose one dataset ('mean' or 'weighted_mean') and write it to file. We go with the weighted mean.

In [None]:
# Persist the weighted-mean data set without the coordinate columns.
df_weather_weightedmean.drop(columns=['lat', 'lng']).to_csv(path_out, index=False)

Done.