In [1]:
import random

import numpy as np
import pandas as pd

import data_utils
import gru
import utils

In [2]:
# Build a small toy panel: two IDs (100 and 200) with five observations each.
# Seed both RNGs so that Restart & Run All regenerates the same frame.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# For each ID, draw five distinct time stamps from 0..9 and sort them ascending,
# so the time stamps within an ID may have gaps.
ts_first = sorted(random.sample(range(10), 5))
ts_second = sorted(random.sample(range(10), 5))
data = pd.DataFrame({'ID': [100]*5 + [200]*5, 'TS': ts_first + ts_second})

# Three integer columns in [0, 100): two features and the target.
# `pd.np` was removed in pandas 2.0, so numpy is used directly.
values = pd.DataFrame(np.random.randint(0, 100, size=(10, 3)), columns=['x1','x2','y'])
data = pd.concat([data, values], axis=1)

data

Unnamed: 0,ID,TS,x1,x2,y
0,100,0,12,78,33
1,100,2,91,45,93
2,100,3,90,81,16
3,100,4,35,69,63
4,100,6,65,61,13
5,200,0,29,76,4
6,200,3,68,7,94
7,200,5,78,55,89
8,200,8,21,88,7
9,200,9,74,66,85


### Pad the data with rows for the missing TS entries

In [3]:
# Pad the toy frame using the project-local helper. Presumably this adds a row
# for every TS value missing within each ID (the imputed output further down
# shows consecutive TS values) — confirm against data_utils.
padded_data = data_utils.pad_missing_months(data)
# Preview only the first ten rows of the padded frame.
padded_data.head(10)

### Get missing mask and missing delta

In [4]:
# The helper returns two frames, unpacked as `m` and `t`; their columns carry
# the requested 'm' / 't' prefixes (m_* and t_* in the output below).
m, t = data_utils.missingness_indicators(
    padded_data,
    prefixes=['m', 't'],
)
# Display both indicator frames side by side.
pd.concat((m, t), axis='columns')

Unnamed: 0,m_ID,m_TS,m_x1,m_x2,m_y,t_ID,t_TS,t_x1,t_x2,t_y
0,1,1,1,1,1,1,1,1,1,1
1,1,1,0,0,0,1,1,2,2,2
2,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1
5,1,1,0,0,0,1,1,2,2,2
6,1,1,1,1,1,1,1,1,1,1
7,1,1,1,1,1,1,1,1,1,1
8,1,1,0,0,0,1,1,2,2,2
9,1,1,0,0,0,1,1,3,3,3


### Impute missing values

In [5]:
# Run the Imputer's `forward` method on each ID's rows separately.
# Presumably this is a forward fill of the padded rows — confirm in data_utils.
imputed_data = (
    padded_data
    .groupby('ID')
    .apply(data_utils.Imputer().forward)
)
imputed_data

Unnamed: 0,ID,TS,x1,x2,y
0,100.0,0.0,12.0,78.0,33.0
1,100.0,1.0,12.0,78.0,33.0
2,100.0,2.0,91.0,45.0,93.0
3,100.0,3.0,90.0,81.0,16.0
4,100.0,4.0,35.0,69.0,63.0
5,100.0,5.0,35.0,69.0,63.0
6,100.0,6.0,65.0,61.0,13.0
7,200.0,0.0,29.0,76.0,4.0
8,200.0,1.0,29.0,76.0,4.0
9,200.0,2.0,29.0,76.0,4.0


### Last observed values

In [6]:
# Build the frame of LO_*-prefixed columns from the imputed data using the
# project-local helper.
# NOTE(review): in the recorded output, row 7 (the first row of ID 200) carries
# ID 100's final values (LO_ID 100.0, LO_TS 6.0) — confirm whether the helper
# should be applied per ID rather than to the whole frame.
x_obs = data_utils.x_last_observed(imputed_data)
x_obs

Unnamed: 0,LO_ID,LO_TS,LO_x1,LO_x2,LO_y
0,100.0,0.0,12.0,78.0,33.0
1,100.0,0.0,12.0,78.0,33.0
2,100.0,1.0,12.0,78.0,33.0
3,100.0,2.0,91.0,45.0,93.0
4,100.0,3.0,90.0,81.0,16.0
5,100.0,4.0,35.0,69.0,63.0
6,100.0,5.0,35.0,69.0,63.0
7,100.0,6.0,65.0,61.0,13.0
8,200.0,0.0,29.0,76.0,4.0
9,200.0,1.0,29.0,76.0,4.0


### Put it all together

In [30]:
# Join the imputed values, both indicator frames and the last-observed frame
# column-wise into one wide frame.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# later cells may depend on it.
input = pd.concat((imputed_data, m, t, x_obs), axis='columns')
# Drop every column whose name ends in '_y' (m_y, t_y, LO_y); the plain 'y' stays.
input = input[[col for col in input.columns if not col.endswith('_y')]]

# Move the target to the last position: pull it out, then re-attach it.
tgt = input.pop('y')
input['y'] = tgt
input

Unnamed: 0,ID,TS,x1,x2,m_ID,m_TS,m_x1,m_x2,t_ID,t_TS,t_x1,t_x2,LO_ID,LO_TS,LO_x1,LO_x2,y
0,100.0,0.0,12.0,78.0,1,1,1,1,1,1,1,1,100.0,0.0,12.0,78.0,33.0
1,100.0,1.0,12.0,78.0,1,1,0,0,1,1,2,2,100.0,0.0,12.0,78.0,33.0
2,100.0,2.0,91.0,45.0,1,1,1,1,1,1,1,1,100.0,1.0,12.0,78.0,93.0
3,100.0,3.0,90.0,81.0,1,1,1,1,1,1,1,1,100.0,2.0,91.0,45.0,16.0
4,100.0,4.0,35.0,69.0,1,1,1,1,1,1,1,1,100.0,3.0,90.0,81.0,63.0
5,100.0,5.0,35.0,69.0,1,1,0,0,1,1,2,2,100.0,4.0,35.0,69.0,63.0
6,100.0,6.0,65.0,61.0,1,1,1,1,1,1,1,1,100.0,5.0,35.0,69.0,13.0
7,200.0,0.0,29.0,76.0,1,1,1,1,1,1,1,1,100.0,6.0,65.0,61.0,4.0
8,200.0,1.0,29.0,76.0,1,1,0,0,1,1,2,2,200.0,0.0,29.0,76.0,4.0
9,200.0,2.0,29.0,76.0,1,1,0,0,1,1,3,3,200.0,1.0,29.0,76.0,4.0
