Let's inspect each of the instrument ids for autocorrelation and stationarity.

In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, acf, pacf

In [None]:
# Open the HDF5 store read-only and read the object saved under the "train" key.
with pd.HDFStore("../input/train.h5", mode="r") as train:
    df = train["train"]

In [None]:
# Distinct values of the id column, in ascending order.
instruments = sorted(pd.unique(df['id']))

In [None]:
# y indexed by timestamp for the first entry of the sorted id list.
# The list is bound to `instruments` in the cell above; `ids` is never defined,
# so the original lookup failed with a NameError on a fresh kernel.
target_df = df[df['id'] == instruments[0]][['timestamp', 'y']].set_index('timestamp')

## Stationarity ##
Do any of the instrument ids display non-stationarity? I'll use the Dickey-Fuller test to check. If the test statistic is greater than the 5% critical value, I'll say the data might be non-stationary.

In [None]:
def is_stationary(data):
    """Report whether ``data`` passes the Dickey-Fuller check at the 5% level.

    Runs ``adfuller`` on ``data`` and compares the test statistic (element 0
    of the result) against the ``'5%'`` entry of the critical-value mapping
    (element 4 of the result).

    Returns ``False`` when the statistic is greater than that critical value
    and ``True`` otherwise.
    """
    adf_result = adfuller(data)
    test_statistic = adf_result[0]
    critical_value_5pct = adf_result[4]['5%']

    # Only a statistic strictly above the critical value counts as a failure.
    return not (test_statistic > critical_value_5pct)


In [None]:
# Run the Dickey-Fuller check on every instrument and collect the ids that fail it.
# The original loop iterated over the undefined name `ids`, shadowed the builtin
# `id`, and never called the test or filled `nonstationary_ids`.
nonstationary_ids = []
skipped_ids = []  # ids whose series adfuller refused to process

# groupby visits the ids in ascending order and splits the frame in one pass,
# instead of re-filtering the whole frame once per id.
for instrument_id, instrument_rows in df.groupby('id'):
    target_df = instrument_rows[['timestamp', 'y']].set_index('timestamp')
    try:
        stationary = is_stationary(target_df['y'].values)
    except ValueError:
        # NOTE(review): presumably raised for series that are too short or
        # constant -- confirm against the installed statsmodels version.
        skipped_ids.append(instrument_id)
        continue
    if not stationary:
        nonstationary_ids.append(instrument_id)

# target_df is left holding the last instrument's series, which the cells below reuse.

In [None]:
# Dickey-Fuller verdict for the series currently held in target_df.
x = is_stationary(target_df.y.values)

## visualize a single id's target variable ##

In [None]:
# Line chart of the target against the timestamp index.
target_df.plot.line()

It's a bit hard to tell, but it looks like there might be some "seasonality" here. Let's add the rolling mean and standard deviation.

In [None]:
# 10-observation rolling mean and standard deviation of the target.
# pd.rolling_mean / pd.rolling_std were removed from pandas; the .rolling()
# accessor is the supported replacement.
rolling_window = target_df.rolling(window=10)
plt.plot(rolling_window.mean(), label='rolling mean (10)')
plt.plot(rolling_window.std(), label='rolling std (10)')
plt.legend()

## Check for auto-correlation ##

In [None]:
# Autocorrelation plot of the target out to lag 50.
acf_figure = plot_acf(target_df, alpha=.05, lags=50)
print(acf_figure)

In [None]:
# Partial autocorrelation plot of the target out to lag 50.
pacf_figure = plot_pacf(target_df, lags=50)
print(pacf_figure)

In [None]:
# Keep the raw acf output in x and echo it as the cell result.
x = acf(target_df, alpha=0.05)
x

Hmmm....nothing too interesting here.

## Check for stationarity using Dickey-Fuller ##

In [None]:
# Tabulate the Dickey-Fuller result: the first four entries of the result tuple,
# followed by one row per critical value.
dftest = adfuller(target_df['y'].values)
critical_values = dftest[4]

row_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
row_labels += ['Critical Value ({})'.format(level) for level in critical_values]
row_values = list(dftest[0:4]) + list(critical_values.values())

dfoutput = pd.Series(row_values, index=row_labels)
print(dfoutput)

Wow...the data is certainly stationary.