In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preparation

In [None]:
def df_formating(df, station_to_drop=None):
    """Aggregate raw station records into hourly values per station.

    Rows are grouped by station (``number_sta``), day of year and hour of
    day (both derived from the ``date`` column). Within each group ``precip``
    is summed and ``dd``, ``ff``, ``hu``, ``t`` and ``td`` are averaged. The
    wind direction/speed pair (``dd``, ``ff``) is then replaced by the two
    components ``w_x`` and ``w_y`` returned by ``formating_wind``.

    The input frame is left untouched (no helper columns are added to it).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records with at least the columns ``number_sta``, ``date``
        (datetime dtype), ``dd``, ``ff``, ``precip``, ``hu``, ``t``, ``td``.
    station_to_drop : list-like, optional
        Stations to remove from the result. When empty or ``None``, every
        station that still has at least one missing value after aggregation
        is removed instead.

    Returns
    -------
    parameter : pandas.DataFrame
        Indexed by (``number_sta``, ``day``, ``hour``) with the columns
        ``precip``, ``hu``, ``t``, ``td``, ``w_x``, ``w_y``.
    to_drop : list
        The stations that were removed, so the same selection can be applied
        to another year.
    """
    # Group on derived keys instead of writing 'day'/'hour' columns into the
    # caller's frame.
    day = df['date'].dt.dayofyear.rename('day')
    hour = df['date'].dt.hour.rename('hour')
    # String aggregation names avoid the deprecation warning recent pandas
    # emits for np.mean / np.sum passed to ``agg``.
    agg_func = {'dd': 'mean', 'ff': 'mean', 'precip': 'sum',
                'hu': 'mean', 't': 'mean', 'td': 'mean'}
    parameter = df.groupby(['number_sta', day, hour]).agg(agg_func)

    # A missing direction is treated as 0 degrees; a missing speed still
    # yields NaN components and therefore marks the station as incomplete.
    parameter['w_x'], parameter['w_y'] = formating_wind(
        parameter['dd'].fillna(0), parameter['ff'])
    parameter = parameter.drop(columns=['dd', 'ff'])

    if station_to_drop:
        to_drop = list(station_to_drop)
    else:
        has_gap = parameter.isna().groupby(level=0).any().any(axis=1)
        to_drop = has_gap.index[has_gap].to_list()

    # Mask-based filtering tolerates stations in ``to_drop`` that do not exist
    # in this frame (e.g. a 2016 drop list applied to 2017 data).
    keep = ~parameter.index.get_level_values(0).isin(to_drop)
    parameter = parameter[keep]
    return parameter, to_drop

In [None]:
def formating_wind(w_direction, w_intensity):
    """Convert wind direction and speed into two Cartesian components.

    Missing directions are treated as 0 degrees. The direction is turned into
    an angle of ``90 - direction`` degrees, and the speed is projected onto
    its cosine and sine.
    NOTE(review): the ``90 - direction`` conversion presumably maps a compass
    bearing (clockwise from north) to a mathematical angle - confirm against
    the dataset documentation.

    Parameters
    ----------
    w_direction : pandas.Series
        Wind direction in degrees. It is not modified.
    w_intensity : pandas.Series or array-like
        Wind speed, aligned with ``w_direction``.

    Returns
    -------
    (w_x, w_y)
        ``cos(angle) * w_intensity`` and ``sin(angle) * w_intensity``.
    """
    # Fill into a new Series rather than in place, so the caller's data (often
    # a column taken from a DataFrame) is never altered as a side effect.
    direction = w_direction.fillna(0)
    angle = np.deg2rad(90 - direction)
    w_x = np.cos(angle) * w_intensity
    w_y = np.sin(angle) * w_intensity
    return w_x, w_y

In [None]:
# station_position = df.groupby('number_sta').mean().loc[:,['lon','lat','height_sta']]
# station_position

In [None]:
# import matplotlib.pyplot as plt
# import cartopy.crs as ccrs
# import cartopy.feature as cfeature
# # Coordinates of studied area boundaries (in °N and °E)
# lllat = 46.25  #lower left latitude
# urlat = 51.896  #upper right latitude
# lllon = -5.842  #lower left longitude
# urlon = 2  #upper right longitude
# extent = [lllon, urlon, lllat, urlat]

# fig = plt.figure(figsize=(9,5))

# # Select projection
# ax = plt.axes(projection=ccrs.PlateCarree())

# # Plot the data
# cond = inspect.sum(axis= 1) != 0
# plt.scatter(station_position['lon'].loc[cond], station_position['lat'].loc[cond])

# # Add coastlines and borders
# ax.coastlines(resolution='50m', linewidth=1)
# ax.add_feature(cfeature.BORDERS.with_scale('50m'))

# # Adjust the plot to the area we defined 
# #/!\# this line causes a bug of the kaggle notebook and clears all the memory. That is why this line is commented and so
# # the plot is not completely adjusted to the data
# # Show only the area we defined
# ax.set_extent(extent)

# plt.show()

In [None]:
def df_to_images(parameter):
    """Lay hourly station values out as one small "image" per hour.

    Parameters
    ----------
    parameter : pandas.DataFrame
        Frame indexed by (station, day of year, hour of day), one column per
        feature, as produced by ``df_formating``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(365 * 24, 4, 13, n_features)``. Axis 0 is the hour
        of the year, ``(day - 1) * 24 + hour``. Station number ``i`` (in order
        of first appearance in the index) is stored at grid cell
        ``(i // 13, i % 13)``. Entries without data are NaN.

    Notes
    -----
    * Rows with a day outside 1..365 (e.g. day 366 of a leap year) or an hour
      outside 0..23 do not fit in the array and are skipped.
    * Only the first 4 * 13 = 52 stations fit in the grid; a warning is issued
      if there are more.
    """
    n_days, n_hours = 365, 24
    grid_rows, grid_cols = 4, 13
    capacity = grid_rows * grid_cols

    stations = parameter.index.unique(level=0)
    n_sta = len(stations)
    print(n_sta)
    if n_sta > capacity:
        import warnings
        warnings.warn(
            "%d stations found but the grid only holds %d; "
            "the extra stations are ignored" % (n_sta, capacity))

    images = np.full((n_days * n_hours, grid_rows, grid_cols, parameter.shape[1]),
                     np.nan)

    # Position of every row's station in ``stations`` and its time coordinates.
    cell = stations.get_indexer(parameter.index.get_level_values(0))
    day = parameter.index.get_level_values(1).to_numpy().astype(np.int64)
    hour = parameter.index.get_level_values(2).to_numpy().astype(np.int64)

    inside = ((day >= 1) & (day <= n_days)
              & (hour >= 0) & (hour < n_hours)
              & (cell < capacity))
    # Hour-of-year index: whole days elapsed times 24, plus the hour of day.
    time_pos = (day[inside] - 1) * n_hours + hour[inside]
    cell = cell[inside]

    # One vectorised scatter instead of a Python loop over every
    # (station, day, hour) combination.
    images[time_pos, cell // grid_cols, cell % grid_cols, :] = \
        parameter.to_numpy(dtype=float)[inside]
    return images

In [None]:
def deal_with_na(images):
    """Return a copy of ``images`` with missing values filled in.

    For every time step and feature (a "slice" covering all grid cells):

    * if only some cells are NaN, they are replaced by the mean of the
      available cells of that slice;
    * if the whole slice is NaN, it is copied from the previous time step
      (already filled). Empty slices at the very start of the series, which
      have no predecessor, are copied from the first later step that has data;
    * a feature with no data at any time step is left as NaN.

    Two diagnostics are printed: the percentage of individual values replaced
    by a slice mean, and the percentage of (time step, feature) slices that
    were completely empty.

    Parameters
    ----------
    images : numpy.ndarray
        Array whose first axis is time and last axis is the feature, e.g.
        ``(365 * 24, 4, 13, 6)``. It is not modified.

    Returns
    -------
    numpy.ndarray
        Filled array with the same shape as ``images``. An empty input is
        returned as an empty copy without printing anything.
    """
    source = np.asarray(images, dtype=float)
    if source.size == 0:
        return source.copy()

    n_steps, n_features = source.shape[0], source.shape[-1]
    # Work on a (time, cell, feature) copy so the spatial axes can be reduced
    # in one go.
    flat = source.reshape(n_steps, -1, n_features).copy()

    missing = np.isnan(flat)
    valid_count = (~missing).sum(axis=1)          # (time, feature)
    empty = valid_count == 0                      # slices with no data at all

    # Mean of the available cells of each slice (0/1 guard for empty slices,
    # whose mean is never used).
    slice_sum = np.where(missing, 0.0, flat).sum(axis=1)
    slice_mean = slice_sum / np.maximum(valid_count, 1)

    partial = missing & ~empty[:, np.newaxis, :]
    flat = np.where(partial, slice_mean[:, np.newaxis, :], flat)
    value_modified = int(partial.sum())

    for k in range(n_features):
        gaps = np.flatnonzero(empty[:, k])
        if gaps.size == 0 or gaps.size == n_steps:
            continue
        first_valid = int(np.argmin(empty[:, k]))  # first step that has data
        # Ascending order, so a run of empty steps propagates the last filled
        # slice forward; leading gaps borrow from ``first_valid`` instead of
        # wrapping around to the end of the array.
        for n in gaps:
            donor = n - 1 if n > first_valid else first_valid
            flat[n, :, k] = flat[donor, :, k]

    print((value_modified / flat.size) * 100)
    print((int(empty.sum()) / empty.size) * 100)
    return flat.reshape(source.shape)

In [None]:
# rep_nan = {}
# for index in na:
#     if str(index[1])+' '+str(index[2]) not in rep_nan:
#         rep_nan[str(index[1])+' '+str(index[2])] = 1
#     else:
#         rep_nan[str(index[1])+' '+str(index[2])] += 1
# for key, value in rep_nan.items():
#     if value == 52:
#         print(key)

In [None]:
# Region code, first year to load, and a parameter name (the latter is not
# referenced by the cells visible here).
zone, year, param = 'NW', '2016', 'hu'

# Both yearly CSV files live in the same (doubly nested) dataset folder.
station_dir = '/kaggle/input/meteonet/'+zone+'_Ground_Stations/'+zone+'_Ground_Stations/'

# Column 4 of each file is parsed as a datetime.
fname = station_dir+zone+'_Ground_Stations_'+year+".csv"
data2016 = pd.read_csv(fname, parse_dates=[4], infer_datetime_format=True)

year = '2017'
fname = station_dir+zone+'_Ground_Stations_'+year+".csv"
data2017 = pd.read_csv(fname, parse_dates=[4], infer_datetime_format=True)

In [None]:
# 2016: aggregate the raw records, remember which stations were discarded,
# build the hourly images, fill the gaps and persist the result.
data2016, station_to_drop = df_formating(data2016)
images2016 = deal_with_na(df_to_images(data2016))
print('2016: done')
np.save('/kaggle/working/images2016.npy', images2016)

# 2017: same pipeline, but reuse the 2016 station selection so both years
# are built from the same drop list.
data2017, _ = df_formating(data2017, station_to_drop)
images2017 = deal_with_na(df_to_images(data2017))
print('2017: done')
np.save('/kaggle/working/images2017.npy', images2017)

In [None]:
# images2016 = np.load('/kaggle/input/images2016.npy')
# images2017 = np.load('/kaggle/input/images2017.npy')

# Model Building

In [None]:
# Length of the input sequence fed to the model (number of consecutive frames).
input_horizon = 24

# ConvLSTM2D layer settings.
n_filters = 64
kernel_size = (3, 3)

# Dropout rates for the layer inputs and for its recurrent state.
dropout = 0.2
recurrent_dropout = 0.2

In [None]:
# Sequence-to-one regressor: takes `input_horizon` consecutive 4 x 13 x 6
# frames and predicts the next frame as a flat vector of 4 * 13 * 6 values.
model = tf.keras.models.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=(input_horizon, 4, 13, 6)),
    tf.keras.layers.ConvLSTM2D(n_filters, kernel_size, padding='same',
                               dropout=dropout, recurrent_dropout=recurrent_dropout),
    tf.keras.layers.Flatten(),
    # Linear output: the target contains the signed wind components
    # (w_x, w_y), which a ReLU output could never reproduce when negative.
    tf.keras.layers.Dense(4 * 13 * 6, activation='linear')
], name='Nostradamus')

loss_fn = tf.keras.losses.MSE
# This is a regression problem, so track mean absolute error; classification
# accuracy carries no meaning for continuous targets.
model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['mae'])

model.summary()

# Model Training

In [None]:
batch_size = 16
epochs = 10


def _make_windows(images, start, stop, horizon):
    """Build (input, target) pairs from a time-ordered image array.

    For every ``i`` in ``range(start, stop)`` the input is
    ``images[i:i + horizon]`` and the target is the flattened frame
    ``images[i + horizon]``. ``stop`` is clamped so that the target index
    never runs past the end of ``images``.
    """
    stop = min(stop, len(images) - horizon)
    inputs = np.stack([images[i:i + horizon] for i in range(start, stop)])
    targets = images[start + horizon:stop + horizon].reshape(stop - start, -1)
    return inputs, targets


# Window start indices (hours of 2016): the first 330 days are used for
# training, then validation runs up to day 363. Validation starts one input
# horizon after the training range.
train_stop = 330 * 24
val_start = train_stop + input_horizon
val_stop = 363 * 24

train_x, train_y = _make_windows(images2016, 0, train_stop, input_horizon)
val_x, val_y = _make_windows(images2016, val_start, val_stop, input_horizon)

history = model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs,
                    validation_data=(val_x, val_y))

In [None]:
def anticipate(model, input_data, output_horizon):
    """Forecast several steps ahead by feeding predictions back as input.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called with a batch of one window, shape ``(1,) + input_data.shape``.
        It must return one prediction per batch element whose size equals
        that of a single frame (``input_data.shape[1:]``).
    input_data : array-like
        One input window of shape ``(n_frames, ...)``, oldest frame first.
        It is not modified.
    output_horizon : int
        Number of successive steps to predict.

    Returns
    -------
    list of numpy.ndarray
        ``output_horizon`` predictions in chronological order, each as
        returned by the model for the single batch element.
    """
    # Private copy: the rolling update below must not corrupt the caller's data.
    window = np.array(input_data, dtype=float, copy=True)
    frame_shape = window.shape[1:]
    output = []
    for _ in range(output_horizon):
        # Add the batch axis the model expects, then strip it from the result.
        next_hour = np.asarray(model.predict(window[np.newaxis]))[0]
        output.append(next_hour)
        # Slide the window one step: drop the oldest frame and append the
        # prediction, restored to the shape of a frame.
        window[:-1] = window[1:]
        window[-1] = next_hour.reshape(frame_shape)
    return output

In [None]:
# output_horizon = 24
# test_x, test_y = [], []
# for i in range(300*24):
#     test_x.append(images2017[i:i+input_horizon,:,:,:])
#     y = []
#     for j in range(output_horizon):
#         y += images2017[i+input_horizon+j,:,:,:].flatten()
#     test_y.append(y)

In [None]:
# next_hours_error = np.zeros((100,24))
# for i in range(100):
#     out = anticipate(model, test_x[i], output_horizon)