## Imports

In [0]:
import math
import numpy as np
import pandas as pd
from pylab import plt

# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8' and
# 3.8 removed the old name, so pick whichever one this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

## Data Preprocessing

In [0]:
%%time
# Base URL of the workshop repository. It must point at the raw-content host:
# a github.com/.../blob/... URL serves the HTML file viewer, which read_csv
# cannot parse as the CSV itself.
colab_path = "https://raw.githubusercontent.com/poornagurram/TimeSeriesAnalysis_ODSC_2019/master/"
# The first CSV column becomes the index and is parsed as dates.
df = pd.read_csv(colab_path+'data/yesbank_data.csv', index_col=0, parse_dates=True)

In [0]:
# Shorten the price/volume column names to single letters for the cells below.
short_names = dict(close='c', open='o', high='h', low='l', volume='v')
df.rename(columns=short_names, inplace=True)

In [0]:
# Display the first rows of the frame (head() defaults to five) after the column rename.
df.head()

In [0]:
# Line chart of the 'c' column (renamed from 'close' above).
closing_price = df['c']
closing_price.plot()

## Implementing a simple crossover strategy

In [0]:
closing_price = df['c']

# One-period log return: ln(c_t / c_{t-1}); the first row has no predecessor and is NaN.
df['r'] = np.log(closing_price / closing_price.shift(1))

# Simple moving averages of the close over 15, 30 and 60 rows.
for sma_col, window in (('sma1', 15), ('sma2', 30), ('sma3', 60)):
    df[sma_col] = closing_price.rolling(window).mean()

In [0]:
# Remove every row containing a NaN in any column (e.g. the rolling-window warm-up rows).
df.dropna(axis=0, how='any', inplace=True)

In [0]:
# Realised direction per row: +1 when the log return is positive, -1 otherwise (zero counts as -1).
df['market_dir'] = np.where(df['r'].gt(0), 1, -1)

In [0]:
# Crossover signal: long (+1) while the 15-row SMA is above the 60-row SMA, short (-1) otherwise.
df['strat_dir'] = np.where(df['sma1'].gt(df['sma3']), 1, -1)

In [0]:
# Display the first ten rows, now including the return, SMA and direction columns.
df.head(10)

In [0]:
# Row-to-row change in the signal: 0 = unchanged, +2 = flip from -1 to +1, -2 = flip from +1 to -1.
signal_changes = df['strat_dir'].diff()
signal_changes.value_counts()

In [0]:
# Strategy log return per row.
# `strat_dir` on row t is computed from SMAs that already include the close of
# row t, while `r` on row t is the return from close t-1 to close t. Multiplying
# the two on the same row would trade on a price that was not yet known
# (look-ahead bias). shift(1) applies the signal from the previous row instead.
# The first row becomes NaN because it has no prior signal.
df['s'] = df['strat_dir'].shift(1) * df['r']

In [0]:
# Cumulative log returns, exponentiated back to gross performance:
# 'r' is the underlying's return, 's' the strategy's.
cum_log_returns = df[['r', 's']].cumsum()
cum_log_returns.apply(np.exp).plot(figsize=(10, 10));

## Add a machine learning model and perform accuracy testing

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [0]:
# Work on an independent copy so the modelling columns do not touch `df`.
mdf = df.copy(deep=True)

In [0]:
# Features: the market direction of each of the previous five rows (lag_1 ... lag_5).
for lag in range(1, 6):
    mdf[f'lag_{lag}'] = mdf['market_dir'].shift(lag)

In [0]:
# The shifts leave NaNs in the leading rows; drop any row with a missing value, then preview.
mdf.dropna(axis=0, how='any', inplace=True)
mdf.head(5)

In [0]:
# Feature matrix: the five lagged-direction columns, in lag order.
feature_cols = [f'lag_{k}' for k in range(1, 6)]
X = mdf[feature_cols]

In [0]:
# Target: the current row's market direction (+1 for a positive log return, -1 otherwise).
y = mdf['market_dir']

In [0]:
# Positional split without shuffling: the first 500 rows are used for fitting,
# and everything after them is held out.
split_at = 500
X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

In [0]:
# Logistic-regression candidate with default settings.
# NOTE(review): the next cell rebinds `classifier` to an SVC before any fit() call,
# so on a top-to-bottom run this instance is never used.
classifier = LogisticRegression()

In [0]:
# Rebind `classifier` to a linear-kernel support vector classifier; this is the
# object fitted and evaluated in the cells below.
from sklearn.svm import SVC
classifier = SVC(kernel='linear')

In [0]:
# Fit the current `classifier` on the training slice.
classifier.fit(X=X_train, y=y_train)

## In-Sample Testing

In [0]:
# Predict on the rows the model was fitted on (in-sample).
y_pred = classifier.predict(X_train)
# accuracy_score's signature is (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the documented order matches the holdout cell and
# stays correct if this is ever swapped for an asymmetric metric such as
# precision or recall.
accuracy_score(y_train, y_pred)

## Holdout Testing

In [0]:
# Predict on the held-out rows and report the fraction classified correctly.
y_pred = classifier.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)