<a href="https://colab.research.google.com/github/rmminusrslash/BanditsBook/blob/master/HowMuchDidItRain2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How much did it rain II: A timeseries prediction problem
https://www.kaggle.com/c/how-much-did-it-rain-ii/

# Data preprocessing

Preprocessing logic is the same as from the winning solution https://github.com/simaaron/kaggle-Rain/blob/master/data_preprocessing.py

In [0]:
use_preprocessed=False # Set to false to read and preprocess data from scratch
n_rows=None
dataset_name="preprocessed"

In [2]:
# upload data to google drive, folder data, then sign in via executing this cell
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [3]:
# confirm that the data can be read
! ls "/content/drive/My Drive/data/"
!mkdir "/content/drive/My Drive/data/preprocessed"

 kaggle.json	'sample_dask (1).py'   sample_solution.csv.zip	 train.csv
 median_impute	 sample_dask.py        submission_0.csv		 train_head.csv
 preprocessed	 sample_solution.csv   test.csv
mkdir: cannot create directory ‘/content/drive/My Drive/data/preprocessed’: File exists


In [0]:
import pandas as pd
import numpy as np

feature_cols = ['radardist_km', 'Ref', 'Ref_5x5_10th',
                'Ref_5x5_50th', 'Ref_5x5_90th', 'RefComposite', 'RefComposite_5x5_10th',
                'RefComposite_5x5_50th', 'RefComposite_5x5_90th', 'RhoHV',
                'RhoHV_5x5_10th', 'RhoHV_5x5_50th', 'RhoHV_5x5_90th', 'Zdr',
                'Zdr_5x5_10th', 'Zdr_5x5_50th', 'Zdr_5x5_90th', 'Kdp', 'Kdp_5x5_10th',
                'Kdp_5x5_50th', 'Kdp_5x5_90th']

def impute_missing(data, imputer=None):
    '''
    fills missing values with median, returns median imputer so that it can be used for inference on test and validation
    :param data:
    :return:
    '''
    if imputer is None:
        imputer = data[feature_cols].median()
        print("Median",imputer)

    data = data.fillna(imputer)
    return data, imputer

def prepare(alldata):
    # enumerate samples per gauge minute
    g = alldata.groupby('Id').cumcount()
    num_measurements_per_gauge = np.array(alldata.groupby('Id')["Id"].count().values)

    X = alldata.set_index([alldata.Id, g]) \
        .unstack(fill_value=0) \
        .stack()

    if "Expected" in alldata.columns:
      targets = alldata.groupby("Id")["Expected"] \
          .last().to_numpy()
      X=X[X.columns[1:-1]]
    else:
      targets=[]
      X=X[X.columns[1:]]
      
    # Series, one line per id
    X = np.array(X.groupby(level=0) \
                 .apply(lambda x: np.array(x.values))
                 .values
                 .tolist())

    return X, targets, num_measurements_per_gauge


def dropNa(train_df):
  print(len(train_df))
  train_ids = train_df[~np.isnan(train_df.Ref)].Id.unique()
  #print(len(train_ids))
  train_new = train_df[np.in1d(train_df.Id, train_ids)]
  print(len(train_new))
  del train_df, train_ids
  train_new.head()
  return train_new


In [5]:
%%time
if use_preprocessed:
  X_train=np.load(open("/content/drive/My Drive/data/%s/X_train" % dataset_name,"rb"))
  y_train=np.load(open("/content/drive/My Drive/data/%s/y_train"% dataset_name,"rb"))
  s=np.load(open("/content/drive/My Drive/data/%s/s"% dataset_name,"rb"))
else:
  print("read")
  alldata = pd.read_csv("/content/drive/My Drive/data/train.csv",nrows=n_rows)
  print(len(alldata))
  print("drop")
  alldata=dropNa(alldata)
  print("Thresholding")
  alldata = alldata[alldata['Expected'] < 73]
  print(len(alldata))
  print("Filling")
  alldata = alldata.fillna(0.0)
  print(alldata.isna().sum())
  X_train, y_train, s = prepare(alldata)
  print(len(alldata))
  del alldata
  print("Saving")
  np.save(open("/content/drive/My Drive/data/%s/X_train" % dataset_name,"wb"), X_train)
  np.save(open("/content/drive/My Drive/data/%s/y_train" % dataset_name,"wb"), y_train)
  np.save(open("/content/drive/My Drive/data/%s/s" % dataset_name,"wb"), s)


read
13765201
drop
13765201
9125329
Thresholding
8926102
Filling
Id                       0
minutes_past             0
radardist_km             0
Ref                      0
Ref_5x5_10th             0
Ref_5x5_50th             0
Ref_5x5_90th             0
RefComposite             0
RefComposite_5x5_10th    0
RefComposite_5x5_50th    0
RefComposite_5x5_90th    0
RhoHV                    0
RhoHV_5x5_10th           0
RhoHV_5x5_50th           0
RhoHV_5x5_90th           0
Zdr                      0
Zdr_5x5_10th             0
Zdr_5x5_50th             0
Zdr_5x5_90th             0
Kdp                      0
Kdp_5x5_10th             0
Kdp_5x5_50th             0
Kdp_5x5_90th             0
Expected                 0
dtype: int64
8926102
Saving
CPU times: user 5min 2s, sys: 16.5 s, total: 5min 19s
Wall time: 5min 40s


In [13]:
%%time
if use_preprocessed:
  X_test=np.load(open("/content/drive/My Drive/data/%s/X_test" % dataset_name,"rb"))
  y_test=np.load(open("/content/drive/My Drive/data/%s/y_test"% dataset_name,"rb"))
  s_test=np.load(open("/content/drive/My Drive/data/%s/s_test"% dataset_name,"rb"))
  test_ids=np.load(open("/content/drive/My Drive/data/%s/test_ids"% dataset_name,"rb"))
else:
  testdata=pd.read_csv("/content/drive/My Drive/data/test.csv",nrows=n_rows)
  test_ids=testdata.Id.unique()
  testdata = testdata.fillna(0.0)
  X_test, y_test, s_test = prepare(testdata)
  del testdata
  print("Saving")
  np.save(open("/content/drive/My Drive/data/%s/test_ids" % dataset_name,"wb"), test_ids)
  np.save(open("/content/drive/My Drive/data/%s/X_test" % dataset_name,"wb"), X_test)
  np.save(open("/content/drive/My Drive/data/%s/y_test" % dataset_name,"wb"), y_test)
  np.save(open("/content/drive/My Drive/data/%s/s_test" % dataset_name,"wb"), s_test)

KeyboardInterrupt: ignored

In [0]:
temp_x,temp_y,temp_s=  X_train, y_train, s  


In [16]:
import random

# for fun coded myself, in practice one should use existing utilities like sklearn
def val_index(train_size, seed=17,validation_percentage=10):
    val_idx = random.choices(list(range(0, 101)),k=validation_percentage)
    print(val_idx)
    return [i for i in range(train_size) if i%100 in val_idx]    

val_idx=val_index(len(temp_x))
X_val,y_val, s_val=temp_x[val_idx],temp_y[val_idx],temp_s[val_idx]
X_train,y_train, s_train=np.delete(temp_x,val_idx,axis=0),np.delete(temp_y,val_idx,axis=0),np.delete(temp_s,val_idx,axis=0)
X_train.shape,X_val.shape

[16, 69, 64, 48, 21, 80, 81, 51, 51, 23]


((650503, 19, 22), (64335, 19, 22))

# Model learning


In [0]:
%tensorflow_version 1.x


In [0]:
import random
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell

def reset_graph():
    np.random.seed(21)
    random.seed(21)
    tf.reset_default_graph()
    tf.random.set_random_seed(22)


class Model():

    def __init__(self):
        self.num_layers = 1
        self.num_steps = 19
        self.num_features = 22
        self.learn_rate = 1e-3

    def build(self):
        reset_graph()
        self.inputs = tf.placeholder(shape=[None, self.num_steps, self.num_features], dtype=tf.float32)  # batch_size x num_steps x num_features
        self.seq_lengths = tf.placeholder(shape=[None], dtype=tf.int32)
        self.targets = tf.placeholder(shape=[None], dtype=tf.float32)
        self.is_training = tf.placeholder_with_default(True, shape=(), name="is_training")

        cells = [tf.contrib.rnn.GRUCell(num_units=64, activation=tf.nn.relu, dtype= tf.float32) for _ in range(self.num_layers)]

        layers = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=tf.cond(self.is_training,
                                                                            lambda: 1.0, # deactivate to compare to keras solution
                                                                            lambda: 1.0)) for cell in cells]
        rnn = tf.contrib.rnn.MultiRNNCell(layers)
        
        self.outputs, self.final_state = tf.nn.dynamic_rnn(rnn, self.inputs,
                                                       sequence_length=self.seq_lengths,
                                                        dtype=tf.float32)  # 'final_state' is a tensor of shape [batch_size, cell_state_size]
        self.states_concat = tf.concat(axis=1, values=self.final_state)

        num_outputs = 1
        self.logits = tf.layers.dense(self.states_concat, num_outputs)  #final_state[-1].h if lstm cell is used
      
        
      
      
        self.loss =  tf.reduce_mean(tf.abs(tf.subtract(self.targets,self.logits))) # tf.subs

        self.train_op = tf.train.AdamOptimizer(self.learn_rate).minimize(self.loss)



In [24]:
m = Model()
m.build()

num_epochs=50
batch_size=1024
batch_size_val=np.minimum(2048,len(y_val))
with tf.Session() as sess:
        tf.random.set_random_seed(18)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        best_loss=1000
        no_improvement=0
        for e in range(num_epochs):
            print("Epoch ",e)
            train_loss=[]
            
            for i in range(0,int(len(y_train)/batch_size)+1):
            
              _, l = sess.run([m.train_op,m.loss], feed_dict={
                                                     m.inputs: X_train[i*batch_size:(i+1)*batch_size], 
                                                     m.targets: y_train[i*batch_size:(i+1)*batch_size], 
                                                     m.seq_lengths: s_train[i*batch_size:(i+1)*batch_size],
                                                   m.is_training: True})
              train_loss.append(l)

              if i % 100 == 0 and i >0:
                  val_losses=[]
                  for b in range(0,int(len(y_val)/batch_size_val)+1):
                    loss=sess.run([m.loss],feed_dict={m.inputs: X_val[b*batch_size_val:(b+1)*batch_size_val], 
                                                      m.targets: y_val[b*batch_size_val:(b+1)*batch_size_val], 
                                                      m.seq_lengths: s_val[b*batch_size_val:(b+1)*batch_size_val],
                                                      m.is_training: False})
                    val_losses.append(loss)
                  val_loss=np.average(val_losses)
                  print("Instance %s, train loss %.3f, val loss %.3f, best val loss %.3f"% ((i+1)*batch_size,np.average(train_loss[-100:]),val_loss,best_loss))
                  if val_loss<best_loss:
                    save_path = saver.save(sess, "/tmp/model.ckpt")
                    best_loss=val_loss
                    no_improvement=0
                  else:
                    no_improvement+=1
                  
                  if no_improvement>5:
                      break
            if no_improvement>5:
                      break


Epoch  0
Instance 103424, train loss 2.924, val loss 2.911, best val loss 1000.000
Instance 205824, train loss 3.061, val loss 2.886, best val loss 2.911
Instance 308224, train loss 2.787, val loss 2.897, best val loss 2.886
Instance 410624, train loss 2.827, val loss 2.885, best val loss 2.886
Instance 513024, train loss 2.803, val loss 2.893, best val loss 2.885
Instance 615424, train loss 2.982, val loss 2.909, best val loss 2.885
Epoch  1
Instance 103424, train loss 2.794, val loss 2.877, best val loss 2.885
Instance 205824, train loss 3.010, val loss 2.894, best val loss 2.877
Instance 308224, train loss 2.765, val loss 2.920, best val loss 2.877
Instance 410624, train loss 2.803, val loss 2.880, best val loss 2.877
Instance 513024, train loss 2.795, val loss 2.891, best val loss 2.877
Instance 615424, train loss 2.970, val loss 2.894, best val loss 2.877
Epoch  2
Instance 103424, train loss 2.792, val loss 2.868, best val loss 2.877
Instance 205824, train loss 3.006, val loss 2.8

In [25]:
#save submission
with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess,"/tmp/model.ckpt")
    predictions=[]
    for b in range(0,int(len(X_test)/batch_size_val)+1):
      pred=sess.run(m.logits,feed_dict={m.inputs: X_test[b*batch_size_val:(b+1)*batch_size_val], 
                                                m.seq_lengths: s_test[b*batch_size_val:(b+1)*batch_size_val],
                                                m.is_training: False})
      predictions.extend(pred.reshape(-1))
      print(len(predictions),len(test_ids))
    
    submission_0 = pd.DataFrame({'Id': test_ids, 'Expected': predictions})
    submission_0.to_csv('/content/drive/My Drive/data/submission_0.csv', index=False)

INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
2048 717625
4096 717625
6144 717625
8192 717625
10240 717625
12288 717625
14336 717625
16384 717625
18432 717625
20480 717625
22528 717625
24576 717625
26624 717625
28672 717625
30720 717625
32768 717625
34816 717625
36864 717625
38912 717625
40960 717625
43008 717625
45056 717625
47104 717625
49152 717625
51200 717625
53248 717625
55296 717625
57344 717625
59392 717625
61440 717625
63488 717625
65536 717625
67584 717625
69632 717625
71680 717625
73728 717625
75776 717625
77824 717625
79872 717625
81920 717625
83968 717625
86016 717625
88064 717625
90112 717625
92160 717625
94208 717625
96256 717625
98304 717625
100352 717625
102400 717625
104448 717625
106496 717625
108544 717625
110592 717625
112640 717625
114688 717625
116736 717625
118784 717625
120832 717625
122880 717625
124928 717625
126976 717625
129024 717625
131072 717625
133120 717625
135168 717625
137216 717625
139264 717625
141312 717625
143360 717625
145408 717625


# Simple Keras model

In [0]:
from keras.layers import Input, Dense, GRU,CuDNNLSTM
from keras.models import Model
from keras.callbacks import EarlyStopping

tf.reset_default_graph()


BATCH_SIZE = 1024
N_EPOCHS = 50

def get_model_simple(shape=(19,22)):
    inp = Input(shape)
    x =  GRU(64, return_sequences=False,activation=tf.nn.relu)(inp)
    x = Dense(1)(x)
    model = Model(inp, x)
    return model


model_0 = get_model_simple((19,22))
model_0.compile(optimizer='adam', loss='mae')
model_0.summary()

In [0]:
es_callback = EarlyStopping(monitor='val_loss', min_delta=0, patience=5)

model_0.fit(X_train, y_train, 
            batch_size=BATCH_SIZE, epochs=N_EPOCHS, 
            validation_data=(X_val,y_val), callbacks=[es_callback])


In [0]:
y_pred_0 = model_0.predict(X_test)
submission_0 = pd.DataFrame({'Id': test_ids, 'Expected': y_pred_0.reshape(-1)})
submission_0.to_csv('/content/drive/My Drive/data/submission_0.csv', index=False)

# Submission






In [0]:
! pip install kaggle --upgrade

In [26]:
! export KAGGLE_CONFIG_DIR='/content/drive/My Drive/data/'; kaggle competitions submit -c how-much-did-it-rain-ii -f '/content/drive/My Drive/data/submission_0.csv' -m "keras,simple Gru"

100% 17.6M/17.6M [00:00<00:00, 25.7MB/s]
Successfully submitted to How Much Did It Rain? II