# Basic modeling

## Sources

- Paper: *Tracking the Indian Summer Monsoon Onset Back to the Preinstrument Period*

## Dependencies

In [1]:
import time, os, math
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Model, Sequential
from keras.layers import LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import paths
from TRMM import TRMM
import BASE_MODEL

# force autoreload of external modules on save
%load_ext autoreload
%autoreload 2
%matplotlib inline

Using TensorFlow backend.


## Preparations

In [2]:
# Load the monsoon onset data through the project helper. It returns two objects:
# `onset_dates` (previewed below) and `onset_ts`, which later cells index by year
# to obtain that year's onset timestamp.
onset_dates, onset_ts = BASE_MODEL.load_onset_dates()

onset_dates.head()

Unnamed: 0,date,timestamp
0,1887-05-30,
1,1888-05-20,
2,1889-05-26,
3,1890-06-03,
4,1902-05-28,


In [3]:
# Show the onset timestamps, one entry per year.
onset_ts

date
1970      12261600
1972      77234400
1973     107215200
1974     138405600
1975     170546400
1976     201823200
1977     233359200
1978     264376800
1979     297727200
1980     328312800
1981     359589600
1982     391903200
1983     423871200
1984     454543200
1985     485560800
1986     518392800
1987     549583200
1988     580860000
1989     611964000
1990     642722400
1991     675640800
1992     707608800
1993     738799200
1994     769903200
1995     802130400
1996     833752800
1997     865807200
1998     896479200
1999     926460000
2000     958255200
2001     990136800
2002    1022104800
2003    1054764000
2004    1084399200
2005    1118181600
2006    1148162400
2007    1181340000
2008    1211666400
2009    1243029600
2010    1275256800
2011    1306360800
2012    1339365600
2013    1369173600
2014    1402005600
2015    1433455200
2016    1465336800
Name: timestamp, dtype: int64

## Combine TRMM with onset dates

In [5]:
# Years passed to the TRMM loader below (the range end is exclusive: 1998 through 2016).
YEARS = range(1998, 2017)
# Months passed to the TRMM loader below.
PRE_MONSOON = [3, 4, 5, 6]  # June is included because some monsoons only start then.

def filter_fun(df, year):
    """Apply ``BASE_MODEL.filter_until`` to ``df`` using the onset timestamp of ``year``.

    The timestamp is looked up in the notebook-level ``onset_ts`` by ``year``, and
    whatever ``filter_until`` returns is handed back unchanged.
    """
    onset_timestamp = onset_ts[year]
    filtered = BASE_MODEL.filter_until(df, onset_timestamp)
    return filtered

# Load the TRMM data for the selected years and months. `filter_fun` is handed to
# the loader so each year can be cut at its onset timestamp (presumably applied
# per year inside the loader — confirm against TRMM.load_dataset).
df = TRMM.load_dataset(YEARS, PRE_MONSOON, filter_fun=filter_fun, aggregated=True)

> Processing: 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 

In [6]:
# Summarise the loaded frame: index range, number of columns, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 169 entries, (7.625, 64.875) to (37.625, 94.875)
Columns: 1686 entries, 888706800 to 1465336800
dtypes: float64(1686)
memory usage: 2.2 MB


In [7]:
# Preview the first rows: rows are indexed by (latitude, longitude), columns are timestamps.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,888706800,888793200,888879600,888966000,889052400,889138800,889225200,889311600,889398000,889484400,...,1464559200,1464645600,1464732000,1464818400,1464904800,1464991200,1465077600,1465164000,1465250400,1465336800
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
7.625,64.875,0.27,0.09,0.0,0.42,0.0,13.02,0.0,27.119999,0.12,0.0,...,5.1,0.69,0.0,7.86,25.74,74.249999,144.929998,22.379999,40.859999,68.339998
7.625,67.375,0.03,10.08,0.0,2.43,0.0,26.46,0.03,3.48,0.0,0.03,...,13.53,15.179999,5.58,16.47,6.3,306.179991,507.419988,47.939998,137.639995,74.099997
7.625,69.875,20.159999,4.56,0.0,64.409999,0.0,13.589999,0.0,4.11,0.0,0.72,...,22.049999,42.659998,6.24,12.72,8.43,174.779995,420.749989,110.939996,86.219996,43.709999
7.625,72.375,4.02,0.09,8.43,109.139997,16.92,115.829996,0.0,0.57,0.0,0.0,...,124.049998,221.759995,114.899997,47.969998,10.86,7.59,191.699994,823.379981,190.739997,152.999996
7.625,74.875,2.16,0.0,22.559999,66.629998,70.619998,41.849998,0.0,0.0,0.0,6.99,...,954.419981,283.559996,316.319993,284.669993,17.729999,8.64,25.349999,378.209993,993.779983,654.089992


## Train / Dev / Test

In [9]:
# Train on 1998-2014 and hold out 2015-2016 for testing (range ends are exclusive).
YEARS_TRAIN = range(1998, 2015)
YEARS_TEST = range(2015, 2017)

# NOTE(review): the saved output of this cell is "ValueError: too many values to
# unpack (expected 3)", which does not match the four targets below, so that output
# appears to come from an earlier version of the cell. Re-run on a fresh kernel and
# confirm how many values BASE_MODEL.train_test_split actually returns.
X_train, X_test, y_train, y_test = BASE_MODEL.train_test_split(df, None, onset_ts, years_train=YEARS_TRAIN, years_test=YEARS_TEST)

ValueError: too many values to unpack (expected 3)

In [None]:
# NOTE(review): `test` is not assigned in any cell visible in this notebook, so this
# lookup fails on a fresh kernel unless it is defined elsewhere — confirm or remove.
test[2015]

In [None]:
# NOTE(review): `train` and `test` are not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel unless they are defined elsewhere.
# It also rebinds X_train, X_test, y_train and y_test, which the train_test_split
# cell above already assigns — confirm which of the two is meant to be kept.
# Per year, element [0] is used as the inputs and element [1] as the targets
# (presumably an (X, y) pair — confirm).
X_train = [train[year][0] for year in YEARS_TRAIN]
X_test = [test[year][0] for year in YEARS_TEST]

# Targets are concatenated across years into one array per split; inputs stay a per-year list.
y_train = np.concatenate([train[year][1] for year in YEARS_TRAIN])
y_test = np.concatenate([test[year][1] for year in YEARS_TEST])

# Shape of the first training year's inputs.
X_train[0].shape

## Sequence preprocessing

In [None]:
# Combine the per-year inputs of each split with the project helper
# (presumably stacking them into one matrix per split — confirm against BASE_MODEL).
X_train_full = BASE_MODEL.concatenate_matrices(X_train)
X_test_full = BASE_MODEL.concatenate_matrices(X_test)

In [None]:
# Inspect the combined training inputs.
X_train_full

In [None]:
# Check the dimensions of the combined training inputs.
X_train_full.shape

## Building a model

In [None]:
# Build the dense network with the project helper, passing the training data and
# an epoch count of 100 (presumably the helper also fits the model — confirm).
dense_model = BASE_MODEL.build_dense_net(X_train_full, y_train, epochs=100)

In [None]:
# Predict on the held-out test inputs with the dense model.
dense_pred = dense_model.predict(X_test_full)

In [None]:
# First ten dense-model predictions.
dense_pred[:10]

## LSTM

In [None]:
# Build the LSTM network with the project helper, passing the training data and an
# epoch count of 5000. The inputs are passed without the 3-D reshape used for
# prediction below (presumably the helper reshapes them itself — confirm).
lstm_model = BASE_MODEL.build_lstm_net(X_train_full, y_train, epochs=5000)

In [None]:
# Reshape the test inputs to a 3-D array of shape (samples, 1, features) before
# predicting with the LSTM model.
# The feature count is inferred with -1 instead of being hard-coded as 130, so the
# cell keeps working if the number of input features changes; for inputs with 130
# features per sample the resulting array is identical.
X_test_reshaped = np.hstack(X_test_full).reshape(len(X_test_full), 1, -1)
lstm_pred = lstm_model.predict(X_test_reshaped)

In [None]:
# First ten LSTM predictions.
lstm_pred[:10]

In [None]:
# First ten test targets, for comparison with the predictions above.
y_test[:10]