In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt

# Global plotting defaults: ggplot theme, thicker lines, larger figures.
plt.style.use('ggplot')
plt.rcParams.update({
    'lines.linewidth': 2,
    'figure.figsize': (10, 8),
})

# Silence all warnings for the rest of the session to keep cell output clean.
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Read the stock CSV (path is relative to the notebook's working directory)
# and preview the first rows.
csv_path = "../input/nse-stocks-data/FINAL_FROM_DF.csv"
stock_df = pd.read_csv(csv_path)
stock_df.head()

## Data Preprocessing

In [None]:
# List the column names of the loaded frame.
stock_df.columns

In [None]:
# Parse TIMESTAMP into datetimes (ambiguous dates are read day-first) and use
# it as the index.
# NOTE: this cell cannot be re-run without reloading stock_df, because after
# set_index TIMESTAMP is no longer a column.
stock_df['TIMESTAMP'] = pd.to_datetime(stock_df['TIMESTAMP'], dayfirst=True)
stock_df.set_index('TIMESTAMP', inplace=True)

# Percentage move from OPEN to CLOSE: (CLOSE - OPEN) * 100 / OPEN.
stock_df['PER_DAILY_CHANGE'] = (
    stock_df['CLOSE'].sub(stock_df['OPEN']).mul(100).div(stock_df['OPEN'])
)
stock_df.head()

In [None]:
# Count the distinct values in the SYMBOL column.
unique_symb = stock_df.SYMBOL.unique()
n_symbols = len(unique_symb)
print("total number of unique symbols is ", n_symbols)

# Show the first 10 distinct symbols.
unique_symb[:10]

In [None]:
# total number of datapoints: (rows, columns) of the full frame
stock_df.shape

In [None]:
# Restrict the analysis to banks: keep the rows whose SYMBOL contains the
# literal text "BANK".
#   regex=False -> plain substring match instead of a regular expression
#   na=False    -> a missing SYMBOL counts as "no match"; without it the mask
#                  would contain NaN and the boolean indexing below would fail
is_bank = stock_df['SYMBOL'].str.contains("BANK", regex=False, na=False)
bank_stock = stock_df[is_bank]
bank_stock.head()

In [None]:
# The quantity we want to predict is the closing price of each company.
# Reshape the long table into a wide one: one row per TIMESTAMP, one column
# per SYMBOL, cells holding CLOSE (duplicate TIMESTAMP/SYMBOL pairs are
# averaged by pivot_table's default aggregation).
stock_pivot = pd.pivot_table(
    bank_stock,
    index='TIMESTAMP',
    columns='SYMBOL',
    values='CLOSE',
)
stock_pivot.head()

In [None]:
# (number of timestamps, number of bank symbols) in the pivoted table
stock_pivot.shape

In [None]:
# Keep only the symbols with a complete price history: discard every column
# that has at least one missing value.
stock_pivot = stock_pivot.dropna(axis='columns', how='any')
stock_pivot.head()

In [None]:
# shape of the pivoted table at this point, for comparison with the earlier shape
stock_pivot.shape

In [None]:
#Randomly choose 5 columns and plot its value
# col = stock_pivot.columns[np.random.randint(0, len(stock_pivot.columns)+1, size = 5)]


In [None]:
# Time-series modelling relies on values at nearby timestamps being correlated.
# Pick one bank at random (seeded, so the choice is reproducible) and inspect
# that correlation with a lag plot (default lag of 1).
np.random.seed(1)
# randint's upper bound is exclusive, so len(columns) is the correct bound;
# len(columns) + 1 could return an index one past the last column.
bank = stock_pivot.columns.values[np.random.randint(0, len(stock_pivot.columns))]
print("Bank is = ", bank)
pd.plotting.lag_plot(stock_pivot[bank])
plt.show()

As the plot above shows, there is a strong correlation between consecutive data points, so time series analysis is appropriate for this data.

In [None]:
# Line chart of the selected bank's closing price over time.
stock_pivot.plot.line(y=bank)
plt.show()

In [None]:
# #check for correlation using heatmap
# plt.figure(figsize = (14,14))
# sns.heatmap(stock_pivot.corr(method='pearson'),annot=True, linewidth = 1)

The most important thing in autocorrelation analysis is to know the appropriate lag value. We can determine it manually, so let's do that.

In [None]:
# Find the lag in 1..MAX_LAG (inclusive) whose autocorrelation has the
# largest magnitude for the selected bank.
MAX_LAG = 30 #maximum lag value
corr = 0
lag = 1
# range's stop is exclusive, so MAX_LAG + 1 is needed for MAX_LAG itself to be tested
for i in range(1, MAX_LAG + 1):
    # autocorr(lag=i) is the correlation of the series with itself shifted by i
    temp = np.abs(stock_pivot[bank].autocorr(lag=i))
    # a NaN correlation compares False here and is therefore skipped
    if temp > corr:
        corr = temp
        lag = i

print("apprpriate lag value is ", lag)
    


In [None]:
# Cross-check the manually found lag against pandas' autocorrelation plot.
bank_close = stock_pivot[bank]
pd.plotting.autocorrelation_plot(bank_close)
plt.show()

The plot above shows that the autocorrelation is highest at a lag of 1.

## Check for Trend and seasonality

__Note__ that we have assumed our model to be additive in general

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Additive seasonal decomposition of the selected bank's closing prices.
# NOTE(review): a period of 1 presumably leaves no seasonal pattern to
# estimate (every observation is its own cycle) - confirm whether a longer
# period, e.g. one trading week, was intended.
DECOMPOSE_PERIOD = 1
decomposed = seasonal_decompose(
    stock_pivot[bank],
    model='additive',
    period=DECOMPOSE_PERIOD,
)

x = decomposed.plot()

Although our dataset does not exhibit trend or seasonality, it is still good practice to make the series stationary by differencing.

In [None]:
# Work on a copy of the pivot table with an extra STATIONARY column holding
# the first difference of the selected bank's prices (the first value is NaN
# because it has no predecessor).
df = stock_pivot.assign(STATIONARY=stock_pivot[bank].diff())
df['STATIONARY']

In [None]:
# Drop the rows containing NaN (differencing leaves one in the first row),
# then repeat the additive decomposition on the differenced series.
df = df.dropna(axis='index')
decomposed = seasonal_decompose(df.STATIONARY, model='additive', period=1)
x = decomposed.plot()

## AR(1) model

In [None]:
# Build (predictor, target) pairs for the AR(1) fit on the differenced series
# and split them chronologically 80/20.
freq = 7  # not used by the split below; kept so the name stays defined
series = df['STATIONARY']

# Y is the value at time t and X the value at time t-1. shift(1) keeps the
# original index, so after dropping the first row (which has no predecessor)
# X and Y line up row for row on the same timestamps.
# The previous version sliced `series` and `series.shift(1).dropna()` by
# position, which made the predictor and target arrays hold identical values
# and therefore always produced phi = 1 and R^2 = 1.
Y = series.iloc[1:]
X = series.shift(1).iloc[1:]

train_size = int(len(X)*0.8)
x_train, x_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = Y.iloc[:train_size], Y.iloc[train_size:]
x_train.shape, y_train.shape

In [None]:
# AR(1) model: x_t = ϕ * x_{t-1} + ϵ
#
# Estimate ϕ by ordinary least squares without an intercept:
#   ϕ = (x · y) / (x · x), with x = x_train (regressor) and y = y_train (response)
numerator = np.dot(x_train.values, y_train.values)
denominator = np.dot(x_train.values, x_train.values)
phi = numerator / denominator
print(phi)

In [None]:
# Predictions for the test portion: each test regressor scaled by ϕ.
pred = x_test.mul(phi)
pred

In [None]:
# Display the held-out target values for comparison with pred.
y_test

In [None]:
# Score the predictions with the coefficient of determination (R²).
import sklearn.metrics as m
r2 = m.r2_score(y_true=y_test, y_pred=pred)
print("R square is ", r2)

In [None]:
# Overlay the held-out values and the predictions. The lines are labelled so
# they can be told apart; previously both were drawn without a legend.
fig, ax = plt.subplots()
ax.plot(y_test, label='test (y_test)')
ax.plot(pred, label='predicted (pred)')
ax.set_title('AR(1): predicted vs test values')
ax.set_ylabel('differenced CLOSE')
ax.legend()
plt.show()

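As the plot above shows, the predicted values track the test values almost exactly. This is consistent with the lag plot, in which the data points cluster tightly along the diagonal. Note, however, that the lag plot was drawn for the raw closing prices while this model is fitted on the differenced series, so a near-perfect fit should be double-checked by confirming that predictors and targets are correctly aligned.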