# Stock Price Prediction using Machine Learning techniques

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

Nifty50 dataset from NSE - TCS, SBIN, Reliance

In [None]:
# Load the SBIN price history from the Kaggle input directory and display it.
df = pd.read_csv('/kaggle/input/SBIN.csv')
df

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

*Date* is used as the index of the dataframe. As part of cleaning, the incomplete and unnecessary columns (*Series*, *Symbol*, *Trades*, *Deliverable Volume* and *%Deliverble*) are dropped.

In [None]:
# Parse the Date strings (year-month-day) into datetimes and index the frame by them.
# drop=False keeps Date available as an ordinary column as well.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df.set_index('Date', drop=False, inplace=True)

# Remove the columns this notebook does not use.
unused_columns = ["Series", "Symbol", "Trades", "Deliverable Volume", "%Deliverble"]
df.drop(columns=unused_columns, inplace=True)
df

In [None]:
# Percentage of missing values in each column.
df.isna().sum() * 100 / len(df)

In [None]:
# Line plot of VWAP against the date index on a wide canvas.
fig, ax = plt.subplots(figsize=(25, 5))
df['VWAP'].plot(ax=ax)
plt.show()

A kernel density estimate (KDE) plot is a method for visualizing the distribution of observations in a dataset, analogous to a histogram. It represents the data using a continuous probability density curve in one or more dimensions.

In [None]:
# Kernel density estimate of VWAP.
# `fill=True` replaces the `shade=True` keyword, which seaborn deprecated in 0.11.
sns.kdeplot(df.VWAP, fill=True)
plt.show()

In [None]:
# Calendar features derived from the Date column.
df["month"] = df.Date.dt.month
df["week"] = df.Date.dt.isocalendar().week
# Day of the month (1-31). `isocalendar().day` is the ISO weekday (1-7), which
# carries the same information as `day_of_week` below.
df["day"] = df.Date.dt.day
df["day_of_week"] = df.Date.dt.dayofweek
# Date remains available as the index; drop the duplicate column.
df.drop(columns=['Date'], inplace=True)
df

In [None]:
# Capture the frame's row index and column labels for later reuse.
indx, cols = df.index, df.columns
(indx, cols)

The data is normalized, i.e. scaled to [0, 1] range

In [None]:
# Fit the min-max scaler on the leading 80% of rows only (the same portion that
# is later used for training) so that minima/maxima from the test rows do not
# leak into the scaled features. Values in the remaining rows are transformed
# with the training statistics and may therefore fall outside [0, 1].
n_fit = int(len(df) * 0.8)  # keep in sync with the train/test cut
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df.iloc[:n_fit])

# transform() returns a bare ndarray, so re-attach the saved index and column labels.
data = pd.DataFrame(scaler.transform(df), index=indx, columns=cols)
df = data
df

*Close* is selected for prediction

In [None]:
# Target series: the scaled Close column, carrying the saved date index.
y = df['Close']
y.index = indx

# Remove VWAP and the target itself from the feature frame.
df.drop(columns=['VWAP', 'Close'], inplace=True)
df

In [None]:
# X is the feature matrix; it refers to the same object as df (no copy is made).
X = df
X

In [None]:
# Number of rows (samples) in the feature matrix.
len(X)

In [None]:
# Ordered 80/20 split: the first 80% of rows train the model, the rest test it.
cut = int(len(X) * 0.8)

X_train, X_test = X.iloc[:cut], X.iloc[cut:]
y_train, y_test = y.iloc[:cut], y.iloc[cut:]
(X_test, y_test)

In [None]:
# Shapes of the training and test feature matrices, in that order.
for split in (X_train, X_test):
    print(split.shape)

## kNN regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# 2-nearest-neighbour regressor fitted on the training split.
# fit() returns the estimator itself, so the chained form binds the fitted model.
knn = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)
knn

In [None]:
# Predict the target for every test row with the fitted kNN model.
y_pred = knn.predict(X_test)
y_pred

In [None]:
# Compare the distributions of the actual and predicted test targets.
# `fill=True` replaces `shade=True`, which seaborn deprecated in 0.11; the
# labels and legend make the two curves distinguishable.
sns.kdeplot(y_test, fill=True, label='actual')
sns.kdeplot(y_pred, fill=True, label='predicted')
plt.legend()
plt.show()

In [None]:
# Actual target values for the test split.
y_test

In [None]:
# Root-mean-square error between the actual and predicted test targets.
residuals = np.array(y_test) - np.array(y_pred)
np.sqrt(np.mean(np.power(residuals, 2)))

In [None]:
# Full target series with the kNN predictions drawn over the test dates.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(y)
ax.plot(y_test.index, y_pred)
plt.show()

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Pairwise Euclidean distances: one row per X_test row, one column per X_train row.
euc_l = euclidean_distances(X_test, X_train)
euc_l

In [None]:
# Distance between the test row at position 1 and the training row at position 2.
euclidean_distances(X_test.iloc[1:2], X_train.iloc[2:3])[0, 0]

In [None]:
# Shapes of the test features, the training features and the distance matrix, in that order.
for arr in (X_test, X_train, euc_l):
    print(arr.shape)

# kNN implementation

In [None]:
def get_val(x_train, test_r, y_test, n_neighbors):
    """Predict one target value as the mean of its k nearest training targets.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Training feature rows.
    test_r : array-like of shape (n_features,)
        The single query row to predict for.
    y_test : array-like of length n_samples
        Target value for each row of ``x_train``. Despite the parameter name,
        these are the targets of the *training* rows; the name is kept so
        existing callers continue to work.
    n_neighbors : int
        Number of nearest neighbours to average (1 <= n_neighbors <= n_samples).

    Returns
    -------
    float
        Mean target of the ``n_neighbors`` training rows closest to ``test_r``
        in Euclidean distance. Ties are resolved in favour of the earlier row.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is outside ``[1, n_samples]`` or the number of
        targets does not match the number of training rows.
    """
    train = np.asarray(x_train, dtype=float)
    query = np.asarray(test_r, dtype=float)
    # Work on a plain positional array: indexing a pandas Series that has a
    # non-integer index (e.g. dates) by integer position is deprecated.
    targets = np.asarray(y_test, dtype=float)

    if len(targets) != len(train):
        raise ValueError(
            f"got {len(targets)} targets for {len(train)} training rows"
        )
    if not 1 <= n_neighbors <= len(train):
        raise ValueError(
            f"n_neighbors must be between 1 and {len(train)}, got {n_neighbors}"
        )

    # One vectorised pass instead of one pairwise-distance call per training row.
    dists = np.linalg.norm(train - query, axis=1)
    # A stable sort keeps earlier rows first among equal distances.
    nearest = np.argsort(dists, kind='stable')[:n_neighbors]
    return float(targets[nearest].sum() / n_neighbors)

In [None]:
# Run the hand-written kNN (k=2) over every test row, collecting one prediction per row.
train_rows = X_train.values
y_pred = [get_val(train_rows, row, y_train, 2) for row in X_test.values]
y_pred

In [None]:
# Full target series with the hand-written kNN predictions drawn over the test dates.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(y)
ax.plot(y_test.index, y_pred)
plt.show()

In [None]:
# Root-mean-square error of the hand-written kNN predictions on the test split.
residuals = np.array(y_test) - np.array(y_pred)
np.sqrt(np.mean(np.power(residuals, 2)))