In [None]:
!pip install pmdarima

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import TimeSeriesSplit
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from fbprophet import Prophet
import pmdarima as pmd

# Stock Prediction using Machine Learning techniques
## Dataset - Nifty50 SBIN

In [None]:
# Read the SBIN CSV from the Nifty50 stock-market dataset directory.
df = pd.read_csv(
    filepath_or_buffer='../input/nifty50-stock-market-data/SBIN.csv',
)
df

In [None]:
# Summary statistics for the numeric columns of the raw frame.
df.describe()

In [None]:
# Column dtypes and non-null counts, used to spot incomplete columns.
df.info()

`Date` is used as the index for the dataframe. As part of cleaning, the incomplete or unnecessary columns — `Series`, `Symbol`, `Trades`, `Deliverable Volume` and `%Deliverble` — are dropped.

In [None]:
# Parse the date strings, keep `Date` both as a column and as the index,
# then discard the columns that are not used further on.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df = (
    df
    .set_index("Date", drop=False)
    .drop(columns=["Series", "Symbol", "Trades", "Deliverable Volume", "%Deliverble"])
)
df

In [None]:
# Percentage of missing values in each column.
df.isnull().sum().mul(100).div(len(df))

In [None]:
# Line chart of VWAP against the date index.
df['VWAP'].plot(figsize=(25, 5))
plt.show()

A kernel density estimate (KDE) plot is a method for visualizing the distribution of observations in a dataset, analogous to a histogram. It represents the data using a continuous probability density curve in one or more dimensions.

In [None]:
# Density estimate of VWAP. `fill=True` replaces the deprecated `shade=True`
# keyword, which newer seaborn releases warn about and later reject.
sns.kdeplot(df.VWAP, fill=True)
plt.show()

In [None]:
# Calendar features derived from the trading date.
df["month"] = df.Date.dt.month
df["week"] = df.Date.dt.isocalendar().week
# Day of the month. The ISO-calendar `day` field is the weekday (1-7) and would only
# duplicate `day_of_week` below with a different offset.
df["day"] = df.Date.dt.day
df["day_of_week"] = df.Date.dt.dayofweek
# The date stays available as the index; only the duplicate column is removed.
df = df.drop(columns=['Date'])
df

In [None]:
# Target is the closing price; remove it and VWAP from the feature frame.
y = df['Close']
df = df.drop(columns=['VWAP', 'Close'])
df

In [None]:
# Remember the row index and column labels so they can be re-attached after scaling.
indx, cols = df.index, df.columns
(indx, cols)

In [None]:
# Rescale every feature to [0, 1] and rebuild a labelled frame from the result.
# NOTE(review): the scaler is fitted on all rows, including those later used for
# testing — confirm this is acceptable for the evaluation.
scaler = MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(df)
X = pd.DataFrame(data=data, columns=cols, index=indx)
X

RMSE (root mean squared error) is used as the evaluation metric.

In [None]:
def get_rmse(y_test, y_pred):
    """Return the root mean squared error between two equal-length sequences.

    Both arguments are converted to NumPy arrays first, so any pandas index
    on the inputs is ignored and values are compared position by position.
    """
    residuals = np.array(y_test) - np.array(y_pred)
    mean_squared = np.mean(np.power(residuals, 2))
    return np.sqrt(mean_squared)

# k-Nearest Neighbours algorithm

In [None]:
def get_val(x_train, test_r, y_test, n_neighbors):
    """Predict one value as the mean target of the k nearest training rows.

    Parameters
    ----------
    x_train : array-like, shape (n_samples, n_features)
        Feature rows the query is compared against.
    test_r : array-like, shape (n_features,)
        The query row.
    y_test : array-like, shape (n_samples,)
        Targets aligned *by position* with ``x_train``. Despite the name, the
        callers in this notebook pass the training targets here.
    n_neighbors : int
        Number of nearest rows to average over.

    Returns
    -------
    float
        Mean of the targets belonging to the ``n_neighbors`` closest rows
        (Euclidean distance; ties resolved in favour of the earlier row).

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_test`` differ in length, or ``n_neighbors`` is
        not between 1 and the number of training rows.
    """
    features = np.asarray(x_train, dtype=float)
    # Plain array => purely positional lookup, even when a date-indexed Series is passed.
    targets = np.asarray(y_test)
    if len(features) != len(targets):
        raise ValueError("x_train and y_test must have the same number of rows")
    if not 1 <= n_neighbors <= len(features):
        raise ValueError("n_neighbors must be between 1 and len(x_train)")

    # One vectorised pass instead of a pairwise-distance call per training row.
    offsets = features - np.asarray(test_r, dtype=float)
    dists = np.sqrt((offsets ** 2).sum(axis=1))
    # Stable sort keeps the earliest row first on equal distances.
    nearest = np.argsort(dists, kind="stable")[:n_neighbors]
    return targets[nearest].sum() / n_neighbors

### Train and test without k-Fold

In [None]:
# Chronological hold-out: the first 80% of rows train, the remaining rows test.
cut = int(len(X) * 0.8)

X_train, X_test = X.iloc[:cut], X.iloc[cut:]
y_train, y_test = y.iloc[:cut], y.iloc[cut:]

In [None]:
# Hold-out predictions from the hand-written kNN with k = 2.
# The targets are handed over as a plain array: get_val looks them up by integer
# position, and integer-key lookups on a date-indexed Series are deprecated in
# recent pandas. Converting once up front also avoids re-materialising
# `X_train.values` for every test row.
train_features = X_train.values
train_targets = y_train.values
y_pred = [get_val(train_features, row, train_targets, 2) for row in X_test.values]
y_pred

In [None]:
# Overlay the hold-out predictions on the full Close series, then report the error.
plt.figure(figsize=(20, 5))
plt.plot(y)
plt.plot(y_test.index, y_pred)
plt.show()
rmse_knn_fl = get_rmse(y_test, y_pred)
print("RMS value -", rmse_knn_fl)

### Train and test with 10 folds (Folds created using TimeSeriesSplit)

In [None]:
# Time-ordered splitter producing 10 train/test folds.
ts = TimeSeriesSplit(n_splits=10)

In [None]:
# Evaluate the hand-written kNN (k = 2) on every time-ordered fold.
y_preds = []
rmses = []
for train_index, test_index in ts.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    y_pred = [get_val(X_train, row, y_train, 2) for row in X_test]
    # Collect each fold's predictions, mirroring `fn_y_preds` in the scikit-learn loop;
    # without this `y_preds` would stay empty.
    y_preds.append(y_pred)
    plt.figure(figsize=(20, 5))
    plt.plot(indx, y.values)
    plt.plot(indx[test_index], y_pred)
    plt.show()
    # Use the shared metric helper instead of repeating the formula inline.
    rmse = get_rmse(y_test, y_pred)
    rmses.append(rmse)
    print("RMSE value -", rmse)

In [None]:
# Per-fold RMSE for the hand-written kNN, followed by the average.
# Bar positions follow the number of folds actually evaluated, so the chart
# keeps working if the splitter's n_splits is changed.
plt.bar(range(len(rmses)), rmses)
plt.show()
rmse_knn_avg = np.mean(rmses)
print(rmse_knn_avg)

In [None]:
# Separate splitter instance for the scikit-learn kNN run; same configuration as `ts`.
fn_ts = TimeSeriesSplit(n_splits=10)

In [None]:
# Same fold-by-fold evaluation, this time with scikit-learn's KNeighborsRegressor (k = 2).
fn_y_preds = []
fn_rmses = []
knn = KNeighborsRegressor(n_neighbors=2)
for train_index, test_index in fn_ts.split(X):
    X_train, y_train = X.values[train_index], y.values[train_index]
    X_test, y_test = X.values[test_index], y.values[test_index]
    # fit() returns the estimator itself, so the prediction can be chained.
    y_pred = knn.fit(X_train, y_train).predict(X_test)
    fn_y_preds.append(y_pred)
    plt.figure(figsize=(20, 5))
    plt.plot(indx, y.values)
    plt.plot(indx[test_index], y_pred)
    plt.show()
    rmse = get_rmse(y_test, y_pred)
    fn_rmses.append(rmse)
    print("RMSE - ", rmse)

In [None]:
# Per-fold RMSE for the scikit-learn kNN, followed by the average.
# Bar positions follow the number of folds actually evaluated rather than a fixed 10.
plt.bar(range(len(fn_rmses)), fn_rmses)
plt.show()
print(np.mean(fn_rmses))

# Linear Regression with Principal Component Analysis

In [None]:
# Project the feature frame onto its first two principal components.
# NOTE(review): this is fitted on `df` (the unscaled features), and no model cell
# visible in this notebook consumes `X_p` — confirm whether the regression below
# was meant to train on it.
pca = PCA(n_components=2)
X_p = pca.fit_transform(df)
X_p

In [None]:
# Min-max scale the two principal components to [0, 1].
pc_scaler = MinMaxScaler(feature_range=(0, 1))
X_p = pc_scaler.fit_transform(X_p)
X_p

In [None]:
# Re-create the chronological 80/20 hold-out split before fitting.
# At this point X_train / X_test / y_train / y_test still hold the NumPy arrays from
# the last cross-validation fold above, which (a) makes the next cell's `y_test.index`
# fail and (b) would score the model on a different test window than the other
# hold-out RMSEs it is compared with at the end of the notebook.
X_train, X_test = X.iloc[:cut], X.iloc[cut:]
y_train, y_test = y.iloc[:cut], y.iloc[cut:]

# NOTE(review): the model is trained on the scaled features `X`, as before; the PCA
# projection `X_p` computed above is not used here — confirm the intent.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred_lr = model.predict(X_test)
y_pred_lr

In [None]:
# Overlay the linear-regression predictions on the full Close series and report the error.
plt.figure(figsize=(20, 5))
plt.plot(y)
plt.plot(y_test.index, y_pred_lr)
plt.show()
rmse_lr_fl = get_rmse(y_test, y_pred_lr)
print("RMSE value - ", rmse_lr_fl)

In [None]:
# Fold-by-fold evaluation of linear regression on the scaled features.
rmses_lr = []
for train_index, test_index in ts.split(X):
    X_train, y_train = X.values[train_index], y.values[train_index]
    X_test, y_test = X.values[test_index], y.values[test_index]
    # A fresh estimator per fold; fit() returns the estimator itself.
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # RMSE is symmetric in its arguments; pass them in the helper's declared order.
    rmse = get_rmse(y_test, y_pred)
    rmses_lr.append(rmse)
    plt.figure(figsize=(20, 5))
    plt.plot(y)
    plt.plot(indx[test_index], y_pred)
    plt.show()
    print("RMSE value -", rmse)

In [None]:
# Per-fold RMSE for linear regression, followed by the average.
rmse_lr_avg = np.mean(rmses_lr)
# Bar positions follow the number of folds actually evaluated rather than a fixed 10.
plt.bar(range(len(rmses_lr)), rmses_lr)
plt.show()
print("RMSE value - ", rmse_lr_avg)

# FbProphet for timeseries prediction

In [None]:
# Flat frame (default integer index) holding the closing price and its date,
# split at the same 80% cut point as the other hold-out experiments.
X_pr = pd.DataFrame({'Close': y.values, 'Date': y.index})
X_train, X_test = X_pr.iloc[:cut], X_pr.iloc[cut:]

In [None]:
# Fit Prophet on the training window, with the columns renamed to `ds` (date) and `y` (value).
prophet_train = X_train[["Date", "Close"]].rename(columns={"Date": "ds", "Close": "y"})
model = Prophet()
model.fit(prophet_train)

In [None]:
# Forecast for the dates of the test window (the date column is renamed to `ds`).
prophet_test = X_test[["Date", "Close"]].rename(columns={"Date": "ds"})
forecast = model.predict(prophet_test)
forecast

In [None]:
# The `yhat` column of the forecast frame, i.e. the values compared with Close below.
forecast.yhat

In [None]:
# Overlay the Prophet forecast on the Close series (integer positions on the x-axis)
# and report the hold-out error.
prophet_pred = forecast.yhat
plt.figure(figsize=(20, 5))
plt.plot(X_pr['Close'])
plt.plot(X_test.index, prophet_pred)
plt.show()
rmse_pr_fl = get_rmse(X_test["Close"], prophet_pred)
print("RMSE value - ", rmse_pr_fl)

# ARIMA

In [None]:
# ARIMA is univariate: split the Close series itself at the 80% cut point.
X_train_ar, X_test_ar = y.iloc[:cut], y.iloc[cut:]

In [None]:
# Let pmdarima choose the ARIMA order on the training window, then show the fitted summary.
arima_model = pmd.auto_arima(y=X_train_ar)
arima_model.summary()

In [None]:
# Forecast as many steps ahead as there are rows in the test window.
horizon = X_test_ar.shape[0]
y_pred_ar = arima_model.predict(n_periods=horizon)
y_pred_ar

In [None]:
# Overlay the ARIMA forecast on the full Close series and report the hold-out error.
test_dates = X_test_ar.index
plt.figure(figsize=(20, 5))
plt.plot(y)
plt.plot(test_dates, y_pred_ar)
plt.show()
rmse_arima_fl = get_rmse(X_test_ar, y_pred_ar)
print("RMSE value - ", rmse_arima_fl)

In [None]:
# Side-by-side hold-out RMSE of the four approaches.
model_names = ["kNN", "Linear Regression", "Prophet", "ARIMA"]
holdout_rmse = [rmse_knn_fl, rmse_lr_fl, rmse_pr_fl, rmse_arima_fl]
plt.figure(figsize=(15, 10))
plt.bar(model_names, holdout_rmse)
plt.title("RMSE comparison among different algorithms")
plt.show()