In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data description
To understand the data, you have to realize that there are multiple radar observations over the course of an hour, and only one gauge observation (the 'Expected'). That is why there are multiple rows with the same 'Id'.  

The columns in the datasets are:  

Id:  A unique number for the set of observations over an hour at a gauge.  
minutes_past:  For each set of radar observations, the minutes past the top of the hour that the radar observations were carried out.  Radar observations are snapshots at that point in time.  
radardist_km:  Distance of gauge from the radar whose observations are being reported.  
Ref:  Radar reflectivity in dBZ  
Ref_5x5_10th:   10th percentile of reflectivity values in 5x5 neighborhood around the gauge.  
Ref_5x5_50th:   50th percentile  
Ref_5x5_90th:   90th percentile  
RefComposite:  Maximum reflectivity in the vertical column above gauge.  In dBZ.  
RefComposite_5x5_10th  
RefComposite_5x5_50th  
RefComposite_5x5_90th  
RhoHV:  Correlation coefficient (unitless)  
RhoHV_5x5_10th  
RhoHV_5x5_50th  
RhoHV_5x5_90th  
Zdr:    Differential reflectivity in dB  
Zdr_5x5_10th  
Zdr_5x5_50th  
Zdr_5x5_90th  
Kdp:  Specific differential phase (deg/km)  
Kdp_5x5_10th  
Kdp_5x5_50th  
Kdp_5x5_90th  
Expected:  Actual gauge observation in mm at the end of the hour.

In [None]:
# Type declaration to minimize RAM consumption: every numeric column is read
# as float32 instead of pandas' default float64.
col_list = [
    'minutes_past', 'radardist_km',
    'Ref', 'Ref_5x5_10th', 'Ref_5x5_50th', 'Ref_5x5_90th',
    'RefComposite', 'RefComposite_5x5_10th', 'RefComposite_5x5_50th', 'RefComposite_5x5_90th',
    'RhoHV', 'RhoHV_5x5_10th', 'RhoHV_5x5_50th', 'RhoHV_5x5_90th',
    'Zdr', 'Zdr_5x5_10th', 'Zdr_5x5_50th', 'Zdr_5x5_90th',
    'Kdp', 'Kdp_5x5_10th', 'Kdp_5x5_50th', 'Kdp_5x5_90th',
    'Expected',
]
# Column name -> dtype map, passed to pd.read_csv(dtype=...).
d = dict.fromkeys(col_list, np.float32)

# EDA and Preprocessing

In [None]:
# Read the zipped training CSV, applying the float32 dtype map `d`.
train = pd.read_csv(
    filepath_or_buffer="../input/how-much-did-it-rain-ii/train.zip",
    dtype=d,
)
train

In [None]:
# Column labels of the training frame (DataFrame.keys() is an alias for this).
train.columns

Let's look at some samples

In [None]:
# All radar observations recorded for gauge-hour Id 862571.
train[train["Id"].eq(862571)]

In [None]:
# All radar observations recorded for gauge-hour Id 5.
train[train["Id"].eq(5)]

We can see that most of the values are `NaN`, which indicates that the radar reported no data. As a simplification, we will assume that a missing value can be treated as zero. Strictly speaking this is not true in most cases: on a decibel scale, `0 dB` does not mean "no signal" (for sound, it does not mean "no sound"); it only means that the measured quantity is equal to the reference level. Source: https://www.animations.physics.unsw.edu.au/jw/dB.htm#definition

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Render floats with three decimals instead of scientific notation.
pd.options.display.float_format = lambda value: '%.3f' % value

In [None]:
# Replace every missing value with 0 in place (avoids copying the large frame),
# then summarise the three columns that were never radar-derived.
train.fillna(value=0, inplace=True)
train.loc[:, ["minutes_past", "radardist_km", "Expected"]].describe()

Let's calculate the correlation matrix

In [None]:
# Pairwise Pearson correlation of all columns, shown as a colour-graded table.
corr_mat = train.corr(method='pearson')
corr_mat.style.background_gradient(cmap='coolwarm')

In [None]:
import matplotlib.pyplot as plt

# Draw the correlation matrix as an image on a dedicated 10x10 figure
# and attach a colour bar to that image.
f = plt.figure(figsize=(10, 10))
heatmap = plt.matshow(corr_mat, fignum=f.number)
plt.colorbar(heatmap)

I think the correlation with `Expected` is close to zero because I replaced the `NaN` values with `0`. 

In the `Expected` column, the mean is about `106`, but 75% of the data is lower than `3.8`, and the max is `33017`. Our data has a lot of outliers. Let's plot them~!

In [None]:
import matplotlib.pyplot as plt

# Scatter the distinct gauge readings against their order of first appearance.
unique_expected = train["Expected"].unique()
plt.figure(figsize=(15, 10))
plt.scatter(np.arange(unique_expected.size), unique_expected)

In [None]:
# Histogram of the distinct gauge readings.
plt.figure(figsize=(15, 10))
plt.hist(pd.unique(train["Expected"]))

Now let's exclude the outliers. We will use `scipy.stats.percentileofscore` to calculate the percentile of a given number in the column. Let's try it with the mean.

In [None]:
from scipy import stats

# Percentile rank of the value 106 (roughly the column mean) within `Expected`.
mean_percentile = stats.percentileofscore(train["Expected"], score=106)
print(mean_percentile)

92% of the data is lower than `106`, which is about 1.279M rows, while our data has `13765201` rows. Let's exclude the outliers.

In [None]:
# Remove, in place, every row whose gauge reading is at or above 106.
outlier_index = train.index[(train["Expected"] >= 106).to_numpy()]
train.drop(index=outlier_index, inplace=True)
train

In [None]:
# Same scatter as before, now on the outlier-filtered frame.
unique_expected = train["Expected"].unique()
plt.figure(figsize=(15, 10))
plt.scatter(np.arange(unique_expected.size), unique_expected)

In [None]:
# Same histogram as before, now on the outlier-filtered frame.
plt.figure(figsize=(15, 10))
plt.hist(pd.unique(train["Expected"]))

Our data looks good now. I think someone measured the outliers on stormy days.  
  
Now let's use Seaborn's `pairplot` to see the relations between variables, but let's select only specific values.

Now we begin processing the data into time series data. The time series is the `minutes_past` column.  
Thanks to the notebook: https://www.kaggle.com/andkul/deep-lstm-to-predict-rainfall

In [None]:
# Group the radar rows by gauge-hour. 'Expected' is constant within a group,
# so its mean (or any other pick) yields that group's single target value.
train_grouped = train.groupby(by='Id')
target = train_grouped['Expected'].mean().to_frame()

In [None]:
# Move 'Id' out of the index so the targets get a plain 0..n-1 index,
# then keep only the 'Expected' column as a Series.
target = target.reset_index()["Expected"]
target

In [None]:
def pad_series(X, target_len=19):
    """Zero-pad a 2-D sequence along its first axis up to ``target_len`` rows.

    Parameters
    ----------
    X : 2-D array
        One row per time step, one column per feature.
    target_len : int, default 19
        Number of rows the result should have.

    Returns
    -------
    (array, int)
        The padded array and the original number of rows. When ``X`` already
        has ``target_len`` rows or more it is returned unchanged (no truncation).
    """
    n_steps = X.shape[0]
    missing = target_len - n_steps
    if missing <= 0:
        return X, n_steps
    # Append `missing` rows of zeros after the data; never pad the feature axis.
    padded = np.pad(X, ((0, missing), (0, 0)), mode='constant', constant_values=0.)
    return padded, n_steps

In [None]:
INPUT_WIDTH = 19   # time steps per gauge-hour kept in the model input
N_FEATURES = 22    # columns 1..22 of a row: everything between 'Id' and 'Expected'
data_size = len(train_grouped)

# Zero-initialise the tensor: time steps beyond a group's last radar scan are
# padding and must read as 0. np.empty would leave arbitrary memory contents
# there, feeding garbage to the model. float32 matches the source columns and
# halves the footprint compared with the float64 default.
X_train = np.zeros((data_size, INPUT_WIDTH, N_FEATURES), dtype=np.float32)
seq_lengths = np.zeros(data_size)
y_train = np.zeros(data_size)

for i, (_, group) in enumerate(train_grouped):
    # Keep at most INPUT_WIDTH scans so an unusually long hour cannot raise a
    # shape error on assignment.
    X = group.values[:INPUT_WIDTH]
    seq_len = X.shape[0]
    X_train[i, :seq_len, :] = X[:, 1:1 + N_FEATURES]
    seq_lengths[i] = seq_len              # true (unpadded) length of this sequence
    y_train[i] = X[0, 1 + N_FEATURES]     # 'Expected' is the column after the features
    del X

del train_grouped

In [None]:
# (number of gauge-hours, time steps, features)
np.shape(X_train)

Split the data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 75/25 train/validation split (0.25 is scikit-learn's default test size);
# the fixed seed makes the split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    X_train,
    target,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

Convert into `tf.data.Dataset` to avoid Out of memory while training, and delete unused variables.

In [None]:
import tensorflow as tf

# The un-split arrays are no longer needed once the split exists.
del X_train, target

# Wrap each split as a tf.data pipeline that yields (features, target) batches of 32.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(32)
valid_data = tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).batch(32)

# Drop the notebook-level references to the split arrays.
del x_train, x_valid, y_train, y_valid

# Create and train the Model

In [None]:
import tensorflow as tf

def create_model(shape=(19, 22)):
    """Build and compile the bidirectional-LSTM rainfall regressor.

    Parameters
    ----------
    shape : tuple, default (19, 22)
        Input shape of one sample as (time steps, features).

    Returns
    -------
    tf.keras.Sequential
        Two stacked bidirectional LSTMs (128 units returning the full sequence,
        then 64 units) followed by Dense(64) and Dense(1), compiled with the
        Adam optimizer and mean-absolute-error loss.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True), input_shape=shape))
    model.add(layers.Bidirectional(layers.LSTM(64)))
    # NOTE(review): both Dense layers use a linear activation, so together they
    # amount to a single affine map - confirm a non-linearity was not intended.
    model.add(layers.Dense(64, activation="linear"))
    model.add(layers.Dense(1, activation="linear"))

    model.compile(optimizer="adam", loss='mean_absolute_error')
    return model

# Instantiate the network for (INPUT_WIDTH, 22)-shaped samples, i.e. the
# function's default (19, 22), and print its layer summary.
model = create_model(shape=(INPUT_WIDTH, 22))
model.summary()

In [None]:
# Train for up to 100 epochs. All three callbacks watch their default metric, `val_loss`:
#  - ReduceLROnPlateau lowers the learning rate after 3 stagnant epochs. Its
#    default patience is 10, the same as EarlyStopping below, so with the
#    default the run would normally stop at the very epoch the rate was first
#    reduced and the scheduler would have no effect.
#  - EarlyStopping ends training after 10 epochs without improvement.
#  - ModelCheckpoint keeps only the best-scoring model in "model.h5".
model.fit(
    train_data,
    epochs=100,
    validation_data=valid_data,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=3),
        tf.keras.callbacks.EarlyStopping(patience=10),
        tf.keras.callbacks.ModelCheckpoint("model.h5", save_best_only=True),
    ],
)

In [None]:
# Build the dtype map for the test file WITHOUT mutating `col_list`. The test
# set has no 'Expected' column, but popping it from the shared list would drop
# one more (real) feature column each time this cell is re-run.
test_cols = [c for c in col_list if c != 'Expected']
d = {c: np.float32 for c in test_cols}

# The dtype map already yields float32 feature columns, so no further cast
# (and no extra copy of the frame) is needed after loading.
test = pd.read_csv("../input/how-much-did-it-rain-ii/test.zip", dtype=d)
test_ids = test['Id'].unique()

# Convert all NaNs to zero
test = test.reset_index(drop=True)
test.fillna(0.0, inplace=True)
test_groups = test.groupby("Id")
test_size = len(test_groups)

# One zero-padded (INPUT_WIDTH, 22) slab per gauge-hour, in ascending Id order
# (groupby sorts its keys by default).
X_test = np.zeros((test_size, INPUT_WIDTH, 22), dtype=np.float32)

for i, (_, group) in enumerate(test_groups):
    # Keep at most INPUT_WIDTH scans so an over-long hour cannot raise a shape error.
    X = group.values[:INPUT_WIDTH]
    seq_len = X.shape[0]
    X_test[i, :seq_len, :] = X[:, 1:23]   # columns 1..22 are the features; 0 is 'Id'
    del X

del test_groups
X_test.shape

In [None]:
# Load the sample submission; its 'Expected' column is overwritten with predictions later.
submission = pd.read_csv(
    filepath_or_buffer="../input/how-much-did-it-rain-ii/sample_solution.csv.zip",
)
submission

# Make predictions

In [None]:
# Restore the best checkpoint written during training, then score the test tensor.
model.load_weights("model.h5")
predictions = model.predict(X_test, batch_size=32)

# The final Dense(1) layer yields shape (n_samples, 1); flatten it so it is
# assigned as an ordinary 1-D column. Its activation is linear, so the output
# can go negative; rainfall cannot, so clip at zero.
predictions = np.clip(predictions.ravel(), 0, None)

submission["Expected"] = predictions
submission.to_csv("submission.csv", index=False)

Based on the score, I could have gotten a silver medal! Sadly, I came too late!