In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Dataset

In [None]:
# Read the Bar-S CSV from the (relative) Kaggle input directory into the DataFrame `Bar`
Bar = pd.read_csv('../input/stock-market-small-wide-dataset/bar-S.csv')


In [None]:
# Drop the two epoch timestamp columns, then preview what is left.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
Bar = Bar.drop(
    columns=['epoch_time_at_the_beginning', 'epoch_time_at_the_ending'],
    errors='ignore',
)
# The preview has to be the last expression of the cell, otherwise the
# notebook displays nothing.
Bar.head()

**Open** is the price of the stock at the beginning of the trading day (it need not be the closing price of the previous trading day), 
**high** is the highest price of the stock on that trading day, 
**low** the lowest price of the stock on that trading day, and 
**close** the price of the stock at closing time. 
**Volume** indicates how many shares were traded. 
**Adjusted prices (such as the adjusted close)** are stock prices adjusted for corporate actions. While stock prices are considered to be set mostly by traders, corporate actions such as stock splits and dividends also affect the price and should be accounted for.

In [None]:
# Show the last rows of Bar (pandas' tail() defaults to 5 rows)
Bar.tail()

In [None]:
# Show the dtype of every column of Bar
Bar.dtypes

In [None]:
# Summary statistics of Bar's columns
Bar.describe()

In [None]:
# (number of rows, number of columns) of Bar
Bar.shape

# **Considering only the first 1000 rows of the data**

In [None]:
# Work on an explicit copy of the first 1000 rows so that adding columns below
# does not write into a slice of the full frame (SettingWithCopyWarning).
Bar = Bar.head(1000).copy()

# High-low range of each row, as a percentage of the low price.
Bar['changeduringday'] = (Bar['high_price'] - Bar['low_price']) / Bar['low_price'] * 100

# Absolute difference between the previous row's close and this row's close,
# as a percentage of this row's close. The first row is NaN (no predecessor).
# NOTE(review): shift() works on consecutive rows regardless of `symbol`; if the
# 1000 rows mix several symbols this compares different stocks -- confirm.
Bar['changefrompreviousday'] = (
    (Bar['close_price'].shift() - Bar['close_price']).abs() / Bar['close_price'] * 100
)

Bar.head()

In [None]:
import matplotlib.pyplot as plt  #IMporting Data Visualiztion Library
#This line is necessary for the plot to appear in notebook
%matplotlib inline
#Controlling Default size of figures in the notebook
%pylab inline
pylab.rcParams['figure.figsize'] = (10,6)
Bar[['time','average_price']].plot(grid=True)
plt.title("Average Price VS Time")
plt.xlabel('time')
plt.ylabel('average price')
plt.show()

In [None]:
# One histogram per column of Bar that DataFrame.hist can plot, 50 bins each, on a 20x15 figure
Bar.hist(bins=50,figsize=(20,15))
plt.show()

In [None]:
# Plot close_price against the time column (x/y given explicitly so that time
# really is the x axis rather than the row index).
Bar.plot(x='time', y='close_price', grid=True)
plt.xlabel('time')
plt.ylabel('price')
# legend() takes a sequence of labels; a bare string would be consumed one
# character per line and the legend would read just "c".
plt.legend(['close_price'])
plt.title('Close Price Vs Time')
plt.show()

Building the correlation matrix to see how strongly the close price (the target) correlates with the other features.

In [None]:
# Correlate numeric columns only: Bar still contains string columns (symbol,
# time), and DataFrame.corr() raises on those in pandas >= 2.0.
corr_matrix = Bar.select_dtypes(include='number').corr()
# Correlation of every numeric feature with close_price, strongest first
corr_matrix['close_price'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Columns compared pairwise in the scatter-matrix below
attributes = [
    "high_price",
    "low_price",
    "open_price",
    "changefrompreviousday",
    "changeduringday",
    "volume",
]

scatter_matrix(Bar.loc[:, attributes], figsize=(20, 15))

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
corr = Bar[["high_price", "low_price", "open_price", "changefrompreviousday", "changeduringday", "volume"]].corr()

# generate a mask for the lower triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# set up the matplotlib figure
f, ax = plt.subplots(figsize=(18, 12))

# generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, 
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax);

# Stock Clustering using K-Means

In [None]:
from sklearn.cluster import KMeans   # importing the KMeans module

# Keep the symbol and the three price columns, retain only the first row of each
# symbol, and index the result by symbol.
# drop_duplicates() returns a new frame; its result must be kept, otherwise the
# duplicates are never actually removed.
Bar_modified = (
    Bar[['symbol', 'open_price', 'close_price', 'average_price']]
    .drop_duplicates(subset=['symbol'], keep='first')
    .set_index('symbol')
)
print(Bar_modified.head())





In order to determine the optimal number of clusters k for the Bar dataset, we fit several K-means models while varying k from 2 to 9 (`range(2, 10)`). For each model we read the Sum of Squared Errors (SSE) from the fitted model's `inertia_` attribute and append it to the `sse` list. Inertia measures how far the points within a cluster are from their centroid, so smaller is better; however, it always decreases as k grows, so instead of simply taking the model with the smallest SSE we pick k at the "elbow" of the curve, where adding more clusters stops giving a large improvement.

In [None]:
#Storing values of price in different columns
X = Bar_modified.values
sse = []
for k in range(2,10):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    
    #SSE for each cluster
    sse.append(kmeans.inertia_)
    
plt.plot(range(2,10),sse,'bx-')
plt.title('Elbow Curve')
plt.xlabel("Values of k")
plt.ylabel('Distortion')
plt.show()

**According to the Elbow Curve, we choose k = 4**

In [None]:
# Fit the final model (seeded so cluster labels are reproducible)
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(X)
centroids = kmeans.cluster_centers_
# Plot the first two feature columns coloured by cluster. The third column must
# not be passed positionally: plt.scatter's third positional argument is the
# marker size `s`, not a third axis.
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.xlabel('open_price')
plt.ylabel('close_price')
plt.title("Cluster of Dataset with n=4")
plt.show()

In [None]:
# Treat the symbol with the highest close price as an outlier and remove it.
avgorder = Bar_modified.sort_values('close_price', ascending=False)
first_symbol = avgorder.index[0]
# Re-bind instead of inplace=True so the drop is an explicit new frame.
# NOTE: re-running this cell removes the next-highest symbol as well.
Bar_modified = Bar_modified.drop(index=first_symbol)

X = Bar_modified.values
# Seeded so cluster labels are reproducible
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(X)
centroids = kmeans.cluster_centers_
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.xlabel('open_price')
plt.ylabel('close_price')
plt.title('Clusters of Dataset without outliers k =4')
plt.show()

Finally, we assign each stock its corresponding cluster number (K-means returns the labels 0, 1, 2 and 3) and build a dataframe with this information. Knowing the cluster of each stock, we can create a diversified long-term portfolio by picking stocks from different clusters.

In [None]:
# Pair every symbol in Bar_modified with the cluster label K-means gave its row.
stockClusters = pd.DataFrame({
    'symbol': Bar_modified.index,
    'Cluster': kmeans.labels_,
})
print(stockClusters)
print(stockClusters.Cluster.unique()) #All the four clusters in the array


# Loading Quote-S Dataset

In [None]:
# Read the Quote-S CSV into the DataFrame `Quote` and preview its first rows
Quote = pd.read_csv('../input/stock-market-small-wide-dataset/quote-S.csv')
Quote.head()

In [None]:
# (number of rows, number of columns) of Quote
Quote.shape

In [None]:
# Show the dtype of every column of Quote
Quote.dtypes


In [None]:
#Considering only first 1000 rows
Quote = Quote.head(1000)
Quote.shape

In [None]:
import matplotlib.pyplot as plt  #IMporting Data Visualiztion Library
#This line is necessary for the plot to appear in notebook
%matplotlib inline
#Controlling Default size of figures in the notebook
%pylab inline
pylab.rcParams['figure.figsize'] = (10,6)
Quote[['time','bid_price']].plot(grid=True)
plt.title("Bid Price VS Time")
plt.xlabel('time')
plt.ylabel('Bid price')
plt.show()

In [None]:
import matplotlib.pyplot as plt  #IMporting Data Visualiztion Library
#This line is necessary for the plot to appear in notebook
%matplotlib inline
#Controlling Default size of figures in the notebook
%pylab inline
pylab.rcParams['figure.figsize'] = (10,6)
Quote[['time','ask_price']].plot(grid=True)
plt.title("Ask Price VS Time")
plt.xlabel('time')
plt.ylabel('Ask price')
plt.show()

In [None]:
import matplotlib.pyplot as plt  #IMporting Data Visualiztion Library
#This line is necessary for the plot to appear in notebook
%matplotlib inline
#Controlling Default size of figures in the notebook
%pylab inline
pylab.rcParams['figure.figsize'] = (10,6)
Quote[['bid_size','bid_price']].plot(grid=True)
plt.title("Bid Price VS bid size")
plt.xlabel('bid size')
plt.ylabel('Bid price')
plt.show()

In [None]:
import matplotlib.pyplot as plt  #IMporting Data Visualiztion Library
#This line is necessary for the plot to appear in notebook
%matplotlib inline
#Controlling Default size of figures in the notebook
%pylab inline
pylab.rcParams['figure.figsize'] = (10,6)
Quote[['ask_size','ask_price']].plot(grid=True)
plt.title("Ask Price VS ask size")
plt.xlabel('ask size')
plt.ylabel('Ask price')
plt.show()

# Stock Clustering using K-Means

In [None]:
from sklearn.cluster import KMeans   # importing the KMeans module

# Taking only the ticker and the two price columns
Quote_modified = Quote[['ticker','bid_price','ask_price']]
print(Quote_modified.head())
# Keep the first quote of each ticker. Re-binding the result (rather than
# inplace=True) avoids mutating a slice of Quote and the SettingWithCopyWarning
# that comes with it.
Quote_modified = Quote_modified.drop_duplicates(subset=['ticker'])

# Setting the index of the dataframe to the ticker
Quote_modified = Quote_modified.set_index('ticker')
print(Quote_modified.head())




In [None]:
#Storing values of price in different columns
X = Quote_modified.values
sse = []
for k in range(2,10):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    
    #SSE for each cluster
    sse.append(kmeans.inertia_)
    
plt.plot(range(2,10),sse,'bx-')
plt.title('Elbow Curve')
plt.xlabel("Values of k")
plt.ylabel('Distortion')
plt.show()

**According to the Elbow Curve, we choose k = 4**

In [None]:
# Fit the final model (seeded so cluster labels are reproducible)
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(X)
centroids = kmeans.cluster_centers_
# First feature column on x, second on y, coloured by cluster label
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.xlabel('bid_price')
plt.ylabel('ask_price')
plt.title("Cluster of Dataset with n=4")
plt.show()

In [None]:
# Pair every ticker in Quote_modified with the cluster label K-means gave its row.
stockClusters = pd.DataFrame({
    'ticker': Quote_modified.index,
    'Cluster': kmeans.labels_,
})
print(stockClusters)
print(stockClusters.Cluster.unique()) #All the four clusters in the array


# Returns

Again importing the Bar-S file and changing index to time column 

In [None]:
#Reading the Bar-S datafile
Bar = pd.read_csv('../input/stock-market-small-wide-dataset/bar-S.csv')
Bar['time'] = Bar.time.apply(lambda x: x[:10])
#Bar=Bar.drop_duplicates(subset = ['time'],keep = 'first')

Bar_AAPL = Bar[Bar.symbol=='AAPL']
Bar_AAPL = Bar_AAPL.sort_values(by=['time'])
Bar_AAPL = Bar_AAPL.set_index(['time'])
Bar_AAPL.shape
Bar_AAPL

**Simple plot**

In [None]:
# Line plot of AAPL's average_price over the frame's index; labels are set on
# the Axes object returned by pandas.
ax = Bar_AAPL['average_price'].plot()
ax.set_xlabel("time")
ax.set_ylabel("Adjusted")
ax.set_title("Apple Price data")
plt.show()

# Calculating the daily returns for individual stock

In [None]:
# Fractional change of average_price between consecutive rows of Bar_AAPL
# (pct_change leaves the first entry as NaN because it has no predecessor)
AAPL_daily_returns = Bar_AAPL['average_price'].pct_change()
#AAPL_monthly_returns = Bar_AAPL['average_price'].resample('M').ffill().pct_change()
AAPL_daily_returns

In [None]:
# Plot the row-to-row returns on a manually placed Axes
fig = plt.figure()
ax1 = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax1.plot(AAPL_daily_returns)
ax1.set_xlabel("Time")
# pct_change() yields fractions (0.01 == 1 %), not percentages, so label accordingly
ax1.set_ylabel("Return (fraction)")
ax1.set_title("Apple Daily Returns")
plt.show()

# Calculating the cumulative returns for the Apple stock

In [None]:
# Running product of (1 + return): value over time of one unit invested at the start
AAPL_cum_returns = AAPL_daily_returns.add(1).cumprod()

fig = plt.figure()
ax1 = fig.add_axes([0.1, 0.1, 0.8, 0.8])
# Draw on the axes created above (it is also the current axes)
AAPL_cum_returns.plot(ax=ax1)
ax1.set_xlabel("Date")
ax1.set_ylabel("Growth of $1 investment")
ax1.set_title("Apple daily cumulative returns data")
plt.show()

In [None]:
# Load the required modules and packages
import numpy as np
import pandas as pd


# Pull NIFTY data from Yahoo finance 

# Compute the logarithmic returns using the Closing price 
Bar_AAPL['Log_Ret'] = np.log(Bar_AAPL['close_price'] / Bar_AAPL['close_price'].shift(1))

# Compute Volatility using the pandas rolling standard deviation function
Bar_AAPL['Volatility'] = Bar_AAPL['Log_Ret'].rolling(window=252).std() * np.sqrt(252)


# Plot the NIFTY Price series and the Volatility
Bar_AAPL[['close_price', 'Volatility','Log_Ret']].plot(subplots=True, color='blue',figsize=(8, 6))

# Applying ARIMA model on the Dataset

**Importing the Bar Dataset again**

In [None]:
# Reload the Bar-S CSV from disk (replacing the earlier, modified `Bar`) and preview it
Bar = pd.read_csv('../input/stock-market-small-wide-dataset/bar-S.csv')
Bar.head()

In [None]:
#Dropping unnecessary column
Bar = Bar.drop(['epoch_time_at_the_beginning','epoch_time_at_the_ending'],axis=1)
Bar.head()

In [None]:
# AAPL rows only, ordered by time and re-indexed 0..n-1. The train/test split
# and the walk-forward forecast below take rows by position and assume they are
# in chronological order.
# NOTE(review): assumes the `time` strings sort chronologically (e.g. ISO
# format) -- confirm.
Bar_AAPL = (
    Bar[Bar.symbol == 'AAPL']
    .sort_values('time', kind='mergesort')
    .reset_index(drop=True)
)
Bar_AAPL.shape

In [None]:
from pandas.plotting import lag_plot
# NOTE(review): statsmodels.tsa.arima_model is the legacy ARIMA API; it is
# imported here only because later cells refer to the name `ARIMA`.
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

# Scatter of open_price against itself 5 rows earlier; a tight diagonal
# indicates strong autocorrelation at that lag.
plt.figure(figsize=(10,10))
lag_plot(Bar_AAPL['open_price'],lag=5)
plt.title("Apple Autocorrelation plot")


Next, I divided the data into a training set and a test set, and plotted both on the same figure to get a feel for what the time series looks like.

In [None]:
# First 80 % of the rows are used for training, the remaining 20 % for testing
split_at = int(len(Bar_AAPL) * 0.8)
train_data, test_data = Bar_AAPL.iloc[:split_at], Bar_AAPL.iloc[split_at:]

plt.figure(figsize=(12,7))
plt.title('Apple Prices')
plt.xlabel('Dates')
plt.ylabel('Prices')
# Plot only the training rows under the "Training Data" label (not the whole series)
plt.plot(train_data['open_price'], 'blue', label='Training Data')
plt.plot(test_data['open_price'], 'green', label='Testing Data')
plt.legend()
plt.show()

**To evaluate the ARIMA model, I use two error metrics: Mean Squared Error (MSE) and Symmetric Mean Absolute Percentage Error (SMAPE). SMAPE is a commonly used accuracy measure based on relative errors.**

**SMAPE is not currently available in scikit-learn, so I implemented it myself.**

In [None]:
def smape_kun(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error (SMAPE), in percent.

    Computes ``mean(200 * |y_pred - y_true| / (|y_pred| + |y_true|))``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values. May be a flat sequence or a sequence of 1-element
        arrays (e.g. one forecast array per step); it is flattened first.

    Returns
    -------
    float
        SMAPE between 0 and 200. Pairs where both values are zero count as
        zero error instead of producing NaN.
    """
    # Flatten both inputs: a list of 1-element arrays has shape (n, 1) and would
    # otherwise broadcast against an (n,) array into an (n, n) matrix, averaging
    # the error over every (prediction, observation) pair.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    numerator = np.abs(y_pred - y_true) * 200
    denominator = np.abs(y_pred) + np.abs(y_true)
    # 0/0 (both values exactly zero) is defined as zero error
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio)

In [None]:
# Use the maintained ARIMA implementation. The legacy
# statsmodels.tsa.arima_model.ARIMA (fit(disp=...), tuple-returning forecast())
# has been removed from statsmodels.
from statsmodels.tsa.arima.model import ARIMA

train_ar = train_data['open_price'].values
test_ar = test_data['open_price'].values

# Walk-forward validation: fit on everything seen so far, forecast one step,
# then append the true observation before forecasting the next step.
history = list(train_ar)
predictions = []
for obs in test_ar:
    model_fit = ARIMA(history, order=(5, 1, 0)).fit()
    # forecast() returns the one-step-ahead forecast as a 1-element array;
    # store it as a plain float so `predictions` is a flat list.
    yhat = float(model_fit.forecast()[0])
    predictions.append(yhat)
    history.append(obs)

error = mean_squared_error(test_ar, predictions)
print("Testing Mean Squared Error: %.3f"% error)
# Pass a 1-D array so the element-wise SMAPE lines up with test_ar
error2 = smape_kun(test_ar, np.asarray(predictions))
print("Symmetric mean absolute perecentage error: %.3f"%error2)


In [None]:
plt.figure(figsize=(12,7))
# Training rows only, with a single colour specification (a positional 'green'
# format string together with color='blue' is contradictory).
plt.plot(train_data['open_price'], color='blue', label='Training Data')
plt.plot(test_data.index, predictions, color='green', marker='o', linestyle='dashed',
         label='Predicted Price')
plt.plot(test_data.index, test_data['open_price'], color='red', label='Actual Price')
plt.title('Apple Prices Prediction')
plt.xlabel('Dates')
plt.ylabel('Prices')
plt.legend()
plt.show()

In [None]:
# Same comparison restricted to the test period: predicted vs. actual open price
plt.figure(figsize=(12,7))
plt.plot(test_data.index, predictions, color='green', marker='o', linestyle='dashed',label='Predicted Price')
plt.plot(test_data.index, test_data['open_price'], color='red', label='Actual Price')
plt.title('Apple Prices Prediction')
plt.xlabel('Dates')
plt.ylabel('Prices')
# A single legend call is enough once both lines are drawn
plt.legend()
plt.show()

**Overall, this ARIMA analysis led to appreciable results. The model offered good prediction accuracy and was relatively fast compared to alternatives such as RNNs (Recurrent Neural Networks).**