In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv(r'..\Time Series Analysis\Index2018.csv')

In [3]:
data_copy = data.copy()

In [4]:
## finding length of the time period
data_copy.date.describe()

count           6269
unique          6269
top       01/05/2006
freq               1
Name: date, dtype: object

### Converting to date

In [5]:

data_copy.date = pd.to_datetime(data_copy.date,dayfirst = True)

In [6]:
data_copy.head()

Unnamed: 0,date,spx,dax,ftse,nikkei
0,1994-01-07,469.9,2224.95,3445.98,18124.01
1,1994-01-10,475.27,2225.0,3440.58,18443.44
2,1994-01-11,474.13,2228.1,3413.77,18485.25
3,1994-01-12,474.17,2182.06,3372.02,18793.88
4,1994-01-13,472.47,2142.37,3360.01,18577.26


### Setting the index


In [7]:
data_copy.set_index('date',inplace = True)

In [8]:
data_copy.head()

Unnamed: 0_level_0,spx,dax,ftse,nikkei
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-07,469.9,2224.95,3445.98,18124.01
1994-01-10,475.27,2225.0,3440.58,18443.44
1994-01-11,474.13,2228.1,3413.77,18485.25
1994-01-12,474.17,2182.06,3372.02,18793.88
1994-01-13,472.47,2142.37,3360.01,18577.26


### Setting the frequency. This will help in setting daily, yearly, weekly frequency. This also generates new data points. New data points might be generated without a value.

In [11]:
data_copy = data_copy.asfreq('b') ## Daily frequency. Skips non business days.

In [12]:
data_copy.head()

Unnamed: 0_level_0,spx,dax,ftse,nikkei
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-07,469.9,2224.95,3445.98,18124.01
1994-01-10,475.27,2225.0,3440.58,18443.44
1994-01-11,474.13,2228.1,3413.77,18485.25
1994-01-12,474.17,2182.06,3372.02,18793.88
1994-01-13,472.47,2142.37,3360.01,18577.26


### Handling missing values in the dataframe

In [20]:
data_copy.isna().sum()

spx       0
dax       0
ftse      0
nikkei    0
dtype: int64

In [15]:
## Front filling / Back filling for NAs
data_copy.spx = data_copy.spx.fillna('ffill')

In [16]:
data_copy.ftse = data_copy.ftse.fillna('bfill')

In [19]:
data_copy.dax = data_copy.dax.fillna(np.mean(data_copy.dax))
data_copy.nikkei = data_copy.nikkei.fillna(np.mean(data_copy.nikkei))

In [21]:
data_copy['market_value'] = data_copy.spx

In [25]:
data_copy.drop(['spx','nikkei','dax'],axis = 1,inplace  = True)

In [26]:
data_copy.describe()

Unnamed: 0,ftse,market_value
count,6277,6277
unique,6016,6030
top,bfill,ffill
freq,8,8


### Train test split for time series data. Shuffeling is not a good choice when working with time series. We split the data to a specific point and keep it as a training set and the rest as the test set.

In [27]:
train_size = int(len(data_copy)*0.8)

In [28]:
train_size

5021

In [32]:
test_set = data_copy.iloc[train_size:]
train_set = data_copy.iloc[:train_size]

In [33]:
len(train_set)

5021