# Time Series Forecasting 
## Part 1: Getting Started with ML models

In [3]:
# Import relevant packages
import pandas as pd
import numpy as np

# Financial data time-series
import quandl

# Plotting parameters
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (16,6)

## (Down)Load Time Series Data 

If you do not yet have the time series saved locally, run the download cell below. Otherwise, skip it and load the data from the `data/` directory.

In [None]:
## ----- First Time Download of Data ----------- ##
# Downloads one price series per ticker from the Quandl WIKI dataset, aligns
# them on date into a single table and caches the result as a pickle so the
# download only has to happen once.
import os  # needed only here, to make sure the cache directory exists

tickers = ["AAPL", "MSFT", "GOOG", "AMZN", "GM", "GE"]
series = "Close"
# Available options: Open, High, Low, Close, Volume, Adj. Open, Adj. High, Adj. Low, Adj. Close, Adj. Volume

# Collect one single-column frame per ticker (column renamed to the ticker
# symbol) and concatenate once at the end, instead of re-copying a growing
# DataFrame on every loop iteration.
price_frames = []
for stck in tickers:
    ticker_prices = quandl.Dataset("WIKI/{}".format(stck))\
                          .data(params={"start_date":"2002-01-01", "end_date":"2018-07-01"})\
                          .to_pandas()[[series]]\
                          .rename(columns={series:stck})
    price_frames.append(ticker_prices)

# Align all tickers side by side on their date index.
df_price = pd.concat(price_frames, axis=1)

# Make sure there are no dates without closing prices:
# keep only the dates on which every ticker has a value.
df_price = df_price.dropna()

# Save data (create the target directory first so a fresh checkout does not fail)
os.makedirs("data", exist_ok=True)
df_price.to_pickle("data/df_price.pickle")




In [4]:
## ----- Alternative: Load data from disk ----------- ##
# Read back the price table cached by the download cell and preview its first rows.
# NOTE: unpickling executes code embedded in the file; only load pickles you created yourself.
price_pickle_path = "data/df_price.pickle"
df_price = pd.read_pickle(price_pickle_path)
df_price.head()

Unnamed: 0_level_0,AAPL,MSFT,GOOG,AMZN,GM,GE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-03-27,537.46,39.36,558.46,338.47,34.51,25.81
2014-03-28,536.86,40.3,559.99,338.29,34.73,25.88
2014-03-31,536.74,40.99,556.97,336.365,34.42,25.89
2014-04-01,541.65,41.42,567.16,342.99,34.34,25.87
2014-04-02,542.55,41.35,567.0,341.96,34.88,26.04


In [None]:
# Visualise Data
# First figure: the raw closing-price level of every ticker on one shared axis.
ax = df_price.plot(title="Daily closing prices")
ax.set_ylabel("Closing price")
plt.show()

# Second figure: day-over-day first differences of the prices.
# `diff()` yields absolute price changes, not (percentage) returns, so the
# title says "price changes"; use `pct_change()` instead if returns are wanted.
ax = df_price.diff().plot(title="Daily price changes")
ax.set_ylabel("Change in closing price")
plt.show()

## Reorganise Data
### Create Label-Feature Pairs

In [None]:
# Forecast set-up: predict the price `look_ahead` rows after the most recent
# observation, using the `window` most recent observations as features.
look_ahead = 3 # days
window = 2 # days 
n = len(df_price)

# Number of complete (features, target) samples available per ticker.
samples_per_ticker = n - look_ahead - window + 1

# Build one record per (ticker, target date):
#   (date, ticker, target price, newest lagged price, ..., oldest lagged price)
label_feature_pair = []
for ticker in df_price:
    prices = df_price[ticker]

    for start in range(samples_per_ticker):
        # The feature window covers rows [start, start + window); the target
        # sits `look_ahead` rows after the last row of that window.
        target_pos = start + window + look_ahead - 1

        # Reverse the window so the most recent price comes first.
        lagged_prices = tuple(prices.iloc[start:start + window])[::-1]

        record = (prices.index[target_pos], ticker, prices.iloc[target_pos]) + lagged_prices
        label_feature_pair.append(record)

In [None]:
## Transform into a DataFrame

In [None]:
# Names of the lagged-price columns, newest lag first: the newest feature is
# `look_ahead` rows before the target, the oldest `look_ahead + window - 1`.
lagged_cols = ['t-{}'.format(look_ahead + offset) for offset in range(window)]

# Full record layout; 'date' is consumed as the index of the resulting frame.
record_columns = ['date', 'stock', 'target'] + lagged_cols

data = pd.DataFrame.from_records(
    label_feature_pair,
    columns=record_columns,
    index='date',
)


In [None]:
# Preview the first rows of the label/feature table
# (index: date; columns: stock, target, then the lagged prices).
data.head()

## Fit Machine Learning  Models

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

### Split data into Training and Testing sets 

In [None]:
# Get indices for training and evaluation sets.
# The split is grouped by stock, so all rows of a ticker land on the same
# side: the models are evaluated on ticker(s) they never saw during training.
# `random_state` is fixed so the held-out ticker(s) are the same on every run.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_set, test_set = next(splitter.split(data, groups=data['stock']))

# 'stock' is deliberately kept in both the feature and the label frames so the
# test rows can be filtered per ticker later. It is the first feature column,
# which is why the model cells select the numeric lags with `.iloc[:, 1:]`.
features =[i for i in data.columns if i not in {'target'}]
label = ['target', 'stock']

y_train, X_train = data.iloc[train_set][label], data.iloc[train_set][features]
y_test, X_test = data.iloc[test_set][label], data.iloc[test_set][features]


### Fit machine learning models

In [None]:
# All three models are trained on the same inputs: the lagged-price columns
# (everything after the leading 'stock' column) and the 1-D target series.
X_train_lags = X_train.iloc[:, 1:]
y_train_target = y_train['target']

# Elastic Net (linear model without an intercept term)
model_enet = ElasticNet(fit_intercept=False)
model_enet.fit(X_train_lags, y_train_target)

# Gradient Boosting (seeded so repeated runs give the same fit)
model_gbr = GradientBoostingRegressor(random_state=42)
model_gbr.fit(X_train_lags, y_train_target)

# Random Forests (seeded so repeated runs give the same fit)
model_rnd_frst = RandomForestRegressor(random_state=42)
model_rnd_frst.fit(X_train_lags, y_train_target)

### Assess Test set performance

In [None]:
# For every held-out ticker, plot each model's predictions against the
# realised target price. Sorting makes the figure order deterministic.
for stck in sorted(set(X_test['stock'])):

    # Rows of this ticker only; drop the leading 'stock' column so the models
    # receive exactly the lagged-price columns they were trained on.
    feats = X_test.loc[X_test['stock'] == stck].iloc[:, 1:]
    labels = y_test.loc[y_test['stock'] == stck, ['target']]

    # One column per fitted model, aligned on date, plus the true target.
    plot_y =  pd.DataFrame({'ElasticNet': model_enet.predict(feats),
                            'GradientBoosting': model_gbr.predict(feats),
                            'RandomForest': model_rnd_frst.predict(feats)},
                           index=feats.index)\
                .join(labels)

    plot_y.plot(title=stck)
    plt.show()


In [None]:
from sklearn.metrics import r2_score

In [None]:
# Print the R^2 of each model's predictions against the realised target.
# `plot_y` is the frame left over from the plotting loop, so the scores cover
# only the last ticker that loop processed.
# The 'target' column itself is skipped: scoring it against itself always gives 1.0.
for model in [col for col in plot_y if col != 'target']:
    print(model, r2_score(y_pred=plot_y[model], y_true=plot_y['target']))

In [6]:
# Temp plotting

In [29]:
# Export a smoothed sample series: average the prices within each month
# ('MS' = month-start bins), then apply a 5-period rolling mean over those
# monthly averages.
# NOTE(review): the output path points outside this project's directory and
# must already exist — confirm it is still wanted before re-running.
monthly_mean = df_price.resample('MS').mean()
smoothed_monthly = monthly_mean.rolling(5).mean()
smoothed_monthly.to_csv('../../6_Confectionary_BD/sample_time_series.csv')