<a href="https://colab.research.google.com/github/rmminusrslash/notebooks/blob/master/HowMuchDidItRain2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How much did it rain II: A timeseries prediction problem
https://www.kaggle.com/c/how-much-did-it-rain-ii/

# Data preprocessing

Preprocessing logic is the same as from the winning solution https://github.com/simaaron/kaggle-Rain/blob/master/data_preprocessing.py

In [0]:
use_preprocessed=True # Set to false to read and preprocess data from scratch
n_rows=None
dataset_name="preprocessed"

In [5]:
# upload data to google drive, folder data, then sign in via executing this cell
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [6]:
# confirm that the data can be read
! ls "/content/drive/My Drive/data/"
! mkdir "/content/drive/My Drive/data/preprocessed"

 kaggle.json	       sample_dask.py		 test.csv
 preprocessed	       sample_solution.csv	 train.csv
'preprocessed (1)'     sample_solution.csv.zip	 train_head.csv
'sample_dask (1).py'   submission_0.csv
mkdir: cannot create directory ‘/content/drive/My Drive/data/preprocessed’: File exists


In [0]:
import pandas as pd
import numpy as np

feature_cols = ['radardist_km', 'Ref', 'Ref_5x5_10th',
                'Ref_5x5_50th', 'Ref_5x5_90th', 'RefComposite', 'RefComposite_5x5_10th',
                'RefComposite_5x5_50th', 'RefComposite_5x5_90th', 'RhoHV',
                'RhoHV_5x5_10th', 'RhoHV_5x5_50th', 'RhoHV_5x5_90th', 'Zdr',
                'Zdr_5x5_10th', 'Zdr_5x5_50th', 'Zdr_5x5_90th', 'Kdp', 'Kdp_5x5_10th',
                'Kdp_5x5_50th', 'Kdp_5x5_90th']

def impute_missing(data, imputer=None):
    '''
    fills missing values with median, returns median imputer so that it can be used for inference on test and validation
    :param data:
    :return:
    '''
    if imputer is None:
        imputer = data[feature_cols].median()
        print("Median",imputer)

    data = data.fillna(imputer)
    return data, imputer

def prepare(alldata):
    # enumerate samples per gauge minute
    g = alldata.groupby('Id').cumcount()
    num_measurements_per_gauge = np.array(alldata.groupby('Id')["Id"].count().values)

    X = alldata.set_index([alldata.Id, g]) \
        .unstack(fill_value=0) \
        .stack()

    if "Expected" in alldata.columns:
      targets = alldata.groupby("Id")["Expected"] \
          .last().to_numpy()
      X=X[X.columns[1:-1]]
    else:
      targets=[]
      X=X[X.columns[1:]]

    print("final format")  
    # Series, one line per id
    X = np.array(X.groupby(level=0) \
                 .apply(lambda x: np.array(x.values))
                 .values
                 .tolist())

    assert len(X)==len(num_measurements_per_gauge)
    assert (len(targets)==len(X) or len(targets)==0)
    return X, targets, num_measurements_per_gauge


def dropNa(train_df):
  print(len(train_df))
  train_ids = train_df[~np.isnan(train_df.Ref)].Id.unique()
  #print(len(train_ids))
  train_new = train_df[np.in1d(train_df.Id, train_ids)]
  print(len(train_new))
  del train_df, train_ids
  train_new.head()
  return train_new


In [8]:
%%time
if use_preprocessed:
  X_train=np.load(open("/content/drive/My Drive/data/%s/X_train" % dataset_name,"rb"))
  y_train=np.load(open("/content/drive/My Drive/data/%s/y_train"% dataset_name,"rb"))
  s=np.load(open("/content/drive/My Drive/data/%s/s"% dataset_name,"rb"))
  assert len(X_train)==len(y_train)==len(s)
else:
  print("read")
  alldata = pd.read_csv("/content/drive/My Drive/data/train.csv",nrows=n_rows)
  print(len(alldata))
  print("drop")
  alldata=dropNa(alldata)
  print("Thresholding")
  alldata = alldata[alldata['Expected'] < 73]
  print(len(alldata))
  print("Filling")
  alldata = alldata.fillna(0.0)
  print(alldata.isna().sum())
  X_train, y_train, s = prepare(alldata)
  print(len(alldata))
  del alldata
  print("Saving")
  np.save(open("/content/drive/My Drive/data/%s/X_train" % dataset_name,"wb"), X_train)
  np.save(open("/content/drive/My Drive/data/%s/y_train" % dataset_name,"wb"), y_train)
  np.save(open("/content/drive/My Drive/data/%s/s" % dataset_name,"wb"), s)


CPU times: user 26.7 ms, sys: 1.16 s, total: 1.19 s
Wall time: 2.51 s


In [9]:
%%time
if use_preprocessed:
  X_test=np.load(open("/content/drive/My Drive/data/%s/X_test" % dataset_name,"rb"))
  y_test=np.load(open("/content/drive/My Drive/data/%s/y_test"% dataset_name,"rb"))
  s_test=np.load(open("/content/drive/My Drive/data/%s/s_test"% dataset_name,"rb"))
  test_ids=np.load(open("/content/drive/My Drive/data/%s/test_ids"% dataset_name,"rb"))
  assert len(X_test)==len(s_test)==len(test_ids)
else:
  testdata=pd.read_csv("/content/drive/My Drive/data/test.csv",nrows=n_rows)
  test_ids=testdata.Id.unique()
  testdata = testdata.fillna(0.0)
  X_test, y_test, s_test = prepare(testdata)
  del testdata
  print("Saving")
  np.save(open("/content/drive/My Drive/data/%s/test_ids" % dataset_name,"wb"), test_ids)
  np.save(open("/content/drive/My Drive/data/%s/X_test" % dataset_name,"wb"), X_test)
  np.save(open("/content/drive/My Drive/data/%s/y_test" % dataset_name,"wb"), y_test)
  np.save(open("/content/drive/My Drive/data/%s/s_test" % dataset_name,"wb"), s_test)

CPU times: user 2.53 ms, sys: 1.11 s, total: 1.11 s
Wall time: 2.44 s


In [0]:
temp_x,temp_y,temp_s=  X_train, y_train, s  

In [11]:
import random

# for fun coded myself, in practice one should use existing utilities like sklearn
def val_index(train_size, seed=17,validation_percentage=10):
    val_idx = random.choices(list(range(0, 101)),k=validation_percentage)
    print(val_idx)
    return [i for i in range(train_size) if i%100 in val_idx]    

val_idx=val_index(len(temp_x))
X_val,y_val, s_val=temp_x[val_idx],temp_y[val_idx],temp_s[val_idx]
X_train,y_train, s_train=np.delete(temp_x,val_idx,axis=0),np.delete(temp_y,val_idx,axis=0),np.delete(temp_s,val_idx,axis=0)
X_train.shape,X_val.shape

[77, 84, 70, 2, 50, 69, 81, 27, 88, 96]


((643356, 19, 22), (71482, 19, 22))

# Model learning


In [0]:
%tensorflow_version 1.x

In [0]:
import random
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell

def reset_graph():
    np.random.seed(21)
    random.seed(21)
    tf.reset_default_graph()
    tf.random.set_random_seed(22)


class Model():

    def __init__(self,ff):
        self.num_layers = 1
        self.num_steps = 19
        self.num_features = 22
        self.learn_rate = 1e-3
        self.ff=ff

    def build(self):
        reset_graph()
        self.inputs = tf.placeholder(shape=[None, self.num_steps, self.num_features], dtype=tf.float32)  # batch_size x num_steps x num_features
        self.seq_lengths = tf.placeholder(shape=[None], dtype=tf.int32)
        self.targets = tf.placeholder(shape=[None], dtype=tf.float32)
        self.is_training = tf.placeholder_with_default(True, shape=(), name="is_training")

        num_outputs = 1

        if self.ff:
          self.flattened_inputs=tf.concat(values=self.inputs, axis=0, name="flatten")
          print(self.flattened_inputs.shape)
          l1=tf.layers.dense(self.flattened_inputs, 512, activation=tf.nn.relu )
          l2=tf.layers.dense(l1, 200 ,activation=tf.nn.relu)
          self.logits = tf.layers.dense(l2, num_outputs,activation=None)  #final_state[-1].h if lstm cell is used
        else:
          cells = tf.contrib.rnn.GRUCell(num_units=64, activation=tf.nn.relu, dtype= tf.float32) 
          self.outputs, self.final_state = tf.nn.dynamic_rnn(cells, self.inputs,
                                                      sequence_length=self.seq_lengths,
                                                       dtype=tf.float32)  # 'final_state' is a tensor of shape [batch_size, cell_state_size]
          self.logits = tf.layers.dense(self.final_state, num_outputs,activation=None)  #final_state[-1].h if lstm cell is used

        self.diff=tf.abs(tf.subtract(self.targets,tf.reshape(self.logits,[-1])))
        self.loss =  tf.reduce_mean(self.diff) 

        self.train_op = tf.train.AdamOptimizer(self.learn_rate).minimize(self.loss)



In [19]:
m = Model(ff=False)
m.build()

num_epochs=500
batch_size=1024
batch_size_val=np.minimum(2048,len(y_val)//2)
with tf.Session() as sess:
        tf.random.set_random_seed(18)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        best_loss=1000
        no_improvement=0
        for e in range(num_epochs):
            print("Epoch ",e)
            train_loss=[]
            ids=list(range(len(X_train)))
            np.random.shuffle(ids)
            X_train=X_train[ids]
            y_train=y_train[ids]
            s_train=s_train[ids]
            
            for i in range(0,int(len(y_train)/batch_size)):
            
              _, l = sess.run([m.train_op,m.loss], feed_dict={
                                                     m.inputs: X_train[i*batch_size:(i+1)*batch_size], 
                                                     #m.inputs: X_train[:100], 
                                                     m.targets: y_train[i*batch_size:(i+1)*batch_size], 
                                                     m.seq_lengths: s_train[i*batch_size:(i+1)*batch_size],
                                                   m.is_training: True})
              train_loss.append(l)
              if i % 100 == 0 and i >0:
                  val_losses=[]
                  for b in range(0,int(len(y_val)/batch_size_val)):
                    loss=sess.run([m.loss],feed_dict={m.inputs: X_val[b*batch_size_val:(b+1)*batch_size_val], 
                                                      m.targets: y_val[b*batch_size_val:(b+1)*batch_size_val], 
                                                      m.seq_lengths: s_val[b*batch_size_val:(b+1)*batch_size_val],
                                                      m.is_training: False})
                    val_losses.append(loss)
                  val_loss=np.average(val_losses)
                  print("Instance %s, train loss %.3f, val loss %.3f, best val loss %.3f"% ((i+1)*batch_size,np.average(train_loss[-100:]),val_loss,best_loss))

                  if val_loss<best_loss:
                    save_path = saver.save(sess, "/tmp/model.ckpt")
                    best_loss=val_loss
                    no_improvement=0
                  else:
                    no_improvement+=1
                  
                  if no_improvement>5:
                      break
            if no_improvement>5:
                      break


Epoch  0
Instance 103424, train loss 2.880, val loss 2.607, best val loss 1000.000
Instance 205824, train loss 2.585, val loss 2.517, best val loss 2.607
Instance 308224, train loss 2.490, val loss 2.465, best val loss 2.517
Instance 410624, train loss 2.449, val loss 2.469, best val loss 2.465
Instance 513024, train loss 2.458, val loss 2.431, best val loss 2.465
Instance 615424, train loss 2.455, val loss 2.455, best val loss 2.431
Epoch  1
Instance 103424, train loss 2.432, val loss 2.417, best val loss 2.431
Instance 205824, train loss 2.428, val loss 2.412, best val loss 2.417
Instance 308224, train loss 2.415, val loss 2.407, best val loss 2.412
Instance 410624, train loss 2.426, val loss 2.396, best val loss 2.407
Instance 513024, train loss 2.403, val loss 2.404, best val loss 2.396
Instance 615424, train loss 2.393, val loss 2.394, best val loss 2.396
Epoch  2
Instance 103424, train loss 2.371, val loss 2.389, best val loss 2.394
Instance 205824, train loss 2.398, val loss 2.4

In [20]:
#save submission
with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess,"/tmp/model.ckpt")
    predictions=[]
    for b in range(0,int(len(X_test)/batch_size_val)+1):
      pred=sess.run(m.logits,feed_dict={m.inputs: X_test[b*batch_size_val:(b+1)*batch_size_val], 
                                                m.seq_lengths: s_test[b*batch_size_val:(b+1)*batch_size_val],
                                                m.is_training: False})
      predictions.extend(pred.reshape(-1))
      print(len(predictions),len(test_ids))
    
    submission_0 = pd.DataFrame({'Id': test_ids, 'Expected': predictions})
    submission_0.to_csv('/content/drive/My Drive/data/submission_0.csv', index=False)

INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
2048 717625
4096 717625
6144 717625
8192 717625
10240 717625
12288 717625
14336 717625
16384 717625
18432 717625
20480 717625
22528 717625
24576 717625
26624 717625
28672 717625
30720 717625
32768 717625
34816 717625
36864 717625
38912 717625
40960 717625
43008 717625
45056 717625
47104 717625
49152 717625
51200 717625
53248 717625
55296 717625
57344 717625
59392 717625
61440 717625
63488 717625
65536 717625
67584 717625
69632 717625
71680 717625
73728 717625
75776 717625
77824 717625
79872 717625
81920 717625
83968 717625
86016 717625
88064 717625
90112 717625
92160 717625
94208 717625
96256 717625
98304 717625
100352 717625
102400 717625
104448 717625
106496 717625
108544 717625
110592 717625
112640 717625
114688 717625
116736 717625
118784 717625
120832 717625
122880 717625
124928 717625
126976 717625
129024 717625
131072 717625
133120 717625
135168 717625
137216 717625
139264 717625
141312 717625
143360 717625
145408 717625


# Simple Keras model

In [15]:
from keras.layers import Input, Dense, GRU,CuDNNLSTM
from keras.models import Model
from keras.callbacks import EarlyStopping

tf.reset_default_graph()

BATCH_SIZE = 1024
N_EPOCHS = 50

def get_model_simple(shape=(19,22)):
    inp = Input(shape)
    x =  GRU(64, return_sequences=False,activation=tf.nn.relu)(inp)
    x = Dense(1)(x)
    model = Model(inp, x)
    return model


model_0 = get_model_simple((None,22))
model_0.compile(optimizer='adam', loss='mae')
model_0.summary()





Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, 22)          0         
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                16704     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 16,769
Trainable params: 16,769
Non-trainable params: 0
_________________________________________________________________


In [21]:
es_callback = EarlyStopping(monitor='val_loss', min_delta=0, patience=5)

model_0.fit(X_train, y_train, 
            batch_size=BATCH_SIZE, epochs=N_EPOCHS, 
            validation_data=(X_val,y_val), callbacks=[es_callback])


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 643356 samples, validate on 71482 samples
Epoch 1/50





Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


<keras.callbacks.History at 0x7f6b28d25f98>

In [0]:
y_pred_0 = model_0.predict(X_test)
submission_0 = pd.DataFrame({'Id': test_ids, 'Expected': y_pred_0.reshape(-1)})
submission_0.to_csv('/content/drive/My Drive/data/submission_keras.csv', index=False)

In [28]:
print(list(zip(y_val[:10],pred[:10].reshape(-1))))

[(0.50800025, 0.6755886), (0.7620004, 0.8104479), (4.0640019999999994, 1.8330603), (1.2700007, 3.3346477), (0.50800025, 0.25874883), (1.5240008, 0.5065918), (2.0320009999999997, 0.33316383), (1.0160004999999999, 0.68877053), (0.50800025, 1.2990577), (0.25400013, 1.0149037)]


# Submission






In [0]:
! pip install kaggle --upgrade

In [0]:
! export KAGGLE_CONFIG_DIR='/content/drive/My Drive/data/'; kaggle competitions submit -c how-much-did-it-rain-ii -f '/content/drive/My Drive/data/submission_keras.csv' -m "keras,simple Gru"

100% 17.6M/17.6M [00:00<00:00, 25.7MB/s]
Successfully submitted to How Much Did It Rain? II

In [26]:
! export KAGGLE_CONFIG_DIR='/content/drive/My Drive/data/'; kaggle competitions submit -c how-much-did-it-rain-ii -f '/content/drive/My Drive/data/submission_0.csv' -m "olf plain tf,simple Gru"

100% 17.7M/17.7M [00:06<00:00, 2.69MB/s]
Successfully submitted to How Much Did It Rain? II