In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
from statsmodels.tsa.vector_ar.var_model import VAR
import seaborn as sns
import sklearn
from sklearn.metrics import r2_score

<h3 style="text-align: center;"> Time series analysis and prediction </h3>
<img src="https://www.seebiz.eu/img/394f9846c06e08fbb7e116315e63f909.jpg" width="400" height= "400"> 

<h3>Data investigation</h3>  

In [None]:
# Load the GOOG.csv dataset from the Kaggle input directory into a DataFrame.
data= pd.read_csv('../input/google-stock-prediction/GOOG.csv')

In [None]:
# (rows, columns) of the raw table.
data.shape

In [None]:
# Number of missing entries in each column of the raw table.
data.isna().sum()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Preview the first 10 raw rows.
data.head(10)

In [None]:
# NOTE(review): this stores the parsed timestamps in a NEW column named 'data'
# (probably a typo for 'date'). The original string column 'date' is left
# untouched, and the extra 'data' column is dropped again further down together
# with 'date'. Later cells slice data['date'] as strings, so do not "fix" the
# name here without updating them as well.
data['data']= pd.to_datetime(data['date'])


In [None]:
# Keep only the first 10 characters of every raw date string (presumably the
# "YYYY-MM-DD" part -- TODO confirm against the CSV) so that the time-of-day
# component is discarded. Vectorised with the .str accessor instead of a
# row-by-row Python loop; the result is still a plain list of strings.
dates = data['date'].str[:10].tolist()

In [None]:
# Preview a few rows only instead of rendering the whole frame.
data.head()

In [None]:
# The ticker column is not needed for modelling; remove it from the frame.
data.drop(columns=['symbol'], inplace=True)

In [None]:
# Replace the raw 'date' strings with datetimes parsed from the truncated
# strings collected in `dates`.
data['date']= pd.to_datetime(dates)


## Checking for duplicated timestamps:

In [None]:
# Count rows whose 'date' repeats an earlier row. 'date' becomes the index for
# the reindexing step below, so this should be 0.
data['date'].duplicated().sum()


## Finding missing timestamps and filling them using interpolation:

In [None]:
# Build a gap-free daily calendar between the first and last observed date and
# reindex the table onto it: calendar days that are absent from the data become
# new rows filled with NaN (interpolated in a later cell).
# The former `.fillna(np.nan)` was a no-op and has been removed.
r = pd.date_range(start=data.date.min(), end=data.date.max())
data = data.set_index('date').reindex(r).rename_axis('date').reset_index()

In [None]:
# Inspect the first rows after reindexing onto the daily calendar.
data.head(10)

In [None]:
# Both date columns are no longer needed once the rows are in calendar order;
# remove them and remember the names of the columns that are left.
data.drop(columns=['date', 'data'], inplace=True)
cl_names = data.columns.tolist()

In [None]:
# Names of the columns that remain after dropping the date columns.
cl_names

## Interpolating the NaN values:

In [None]:
# Fill the NaN rows introduced by the daily reindexing. DataFrame.interpolate
# applies the default (linear) interpolation to every column in one call, which
# replaces the explicit per-column Python loop.
data[cl_names] = data[cl_names].interpolate()

In [None]:
# Confirm on the first rows that the gaps have been filled.
data.head(10)

### Correlation:

In [None]:
# Pairwise correlation between all columns.
corr = data.corr()
# Hide the upper triangle (diagonal included): the matrix is symmetric, so it
# only repeats the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Use the full [-1, 1] correlation range for the colour scale. The previous
# `vmax=.3` clipped every correlation above 0.3 to the same colour, which makes
# strongly correlated columns indistinguishable.
heatmap_ax = sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0)
heatmap_ax.set_title('Correlation between columns')
plt.show()

## Stationarity check:
The stationarity check is done using the augmented Dickey-Fuller test.

In [None]:
from statsmodels.tsa.stattools import adfuller


def _print_adf_report(series):
    """Run the augmented Dickey-Fuller test on ``series`` and print a summary.

    Prints the test statistic, the p-value and the critical values returned by
    ``adfuller`` (elements 0, 1 and 4 of its result tuple).
    """
    result = adfuller(series)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))


# Same report for both target series, separated by a divider line
# (previously two copy-pasted blocks).
_print_adf_report(data['open'])
print('**********************')
_print_adf_report(data['close'])

    

The results of the test show that the data is non-stationary.

<h3> Data Visualization: </h3>

In [None]:
# Visualise the two target series side by side.
# The x-axis is the row position of the frame (one row per calendar day after
# reindexing), the y-axis is the value of the series. Previously the series
# name was put on the x-axis and the y-axis was left unlabelled.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, column in zip(axes, ['close', 'open']):
    ax.plot(data[column])
    ax.set_title(f'{column} values')
    ax.set_xlabel('day index')
    ax.set_ylabel(column)
plt.show()

### Train-test split:

In [None]:
# Chronological 80/20 split: the first 80% of the rows are used for training,
# the remaining rows for testing (no shuffling, this is a time series).
# The split position is computed once, and both parts are copied so that later
# in-place edits of `train`/`test` do not operate on views of `data`.
split_at = int(0.8 * len(data))
train = data.iloc[:split_at].copy()
test = data.iloc[split_at:].copy()


In [None]:
# (rows, columns) of the hold-out set.
test.shape

In [None]:
# Preview a few rows only instead of rendering the whole hold-out frame.
test.head()

In [None]:
# Drop the constant columns, as they are troublesome for VAR modelling.
# Re-assigning the result (instead of `inplace=True`) avoids pandas'
# SettingWithCopyWarning, because `train` and `test` were created as slices of
# `data`.
constant_columns = ['divCash', 'splitFactor']
train = train.drop(columns=constant_columns)
test = test.drop(columns=constant_columns)



## Model definition and Training:

In [None]:
def VarForecasting(Actual, steps=10):
    """Fit a VAR model on ``Actual`` and forecast the next ``steps`` observations.

    Parameters
    ----------
    Actual : pandas.DataFrame or 2-D array-like
        Multivariate series, one row per time step and one column per variable.
    steps : int, default 10
        Number of future time steps to forecast. The default keeps the
        previously hard-coded horizon.

    Returns
    -------
    numpy.ndarray
        Forecasts with one row per forecast step and one column per variable,
        in the column order of ``Actual``.
    """
    model = VAR(Actual)
    # No maxlags/ic are passed, so statsmodels' default lag order is used.
    model_fit = model.fit()
    # Start the forecast from the last `k_ar` fitted observations. `endog`
    # replaces the `y` attribute, which is a deprecated alias in recent
    # statsmodels releases.
    history = model_fit.endog[-model_fit.k_ar:]
    prediction = model_fit.forecast(history, steps=steps)
    return np.array(prediction)

In [None]:
# Remove the same two columns from the full frame, which feeds the rolling
# training windows below.
data.drop(columns=['divCash', 'splitFactor'], inplace=True)

In [None]:
# Rolling-origin evaluation: for every test row, fit the model on the
# len(train) rows that immediately precede it and store the resulting
# multi-step forecast for the two target columns.
window = len(train)
# Look the target columns up by name instead of relying on the magic positions
# 0 and 3, which silently break if the column order ever changes.
close_col = data.columns.get_loc('close')
open_col = data.columns.get_loc('open')

close_predictions = []
open_predictions = []
for timepoint in range(len(test)):
    Actual_train = data.iloc[timepoint:timepoint + window]
    Prediction = VarForecasting(Actual_train)
    close_predictions.append(Prediction[:, close_col])
    open_predictions.append(Prediction[:, open_col])




In [None]:
# Forecast produced from the first training window: the predicted values of
# column 0 (intended to be 'close') for each forecast step.
close_predictions[0]

In [None]:
# Expected shape: (number of test rows, forecast steps).
np.array(close_predictions).shape

In [None]:
# Compare with the hold-out set: one forecast was made per test row.
test.shape

## Prediction visualization and evaluation 

One chunk of the actual data and the corresponding model predictions are plotted to visualize the model performance.

In [None]:
# Every stored forecast covers several future steps; keep only its first
# (one-step-ahead) value so that there is exactly one prediction per test row.
p_close = [forecast[0] for forecast in close_predictions]
p_open = [forecast[0] for forecast in open_predictions]

In [None]:
# Compare the first 100 actual test values (default colour) with the
# one-step-ahead predictions (red) for both target series.
# A legend is added, and the axes are labelled by what they show: row position
# within the test set on x, value of the series on y.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = [('close', p_close), ('open', p_open)]
for ax, (column, predicted) in zip(axes, panels):
    ax.plot(np.array(test[column])[0:100], label='actual')
    ax.plot(predicted[0:100], 'r', label='predicted')
    ax.set_title(f'{column} values')
    ax.set_xlabel('test day index')
    ax.set_ylabel(column)
    ax.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
import math

# RMSE of the one-step-ahead predictions: first line is 'open', second 'close'.
print(math.sqrt(mean_squared_error(np.array(test['open']), p_open)))
print(math.sqrt(mean_squared_error(np.array(test['close']), p_close)))

## R2 score:
print('R2 score of the Close series', r2_score(np.array(test['close']), p_close))
# Fixed: the 'open' score was previously computed against the 'close'
# predictions (p_close) instead of p_open.
print('R2 score of the open series', r2_score(np.array(test['open']), p_open))