# Playing with Data

In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [43]:
#For Quandl installation 
!pip install Quandl



In [44]:
# importing quandl to access the dataset
import quandl

In [None]:
# Download the dataset from Quandl, identified by its dataset code
DATASET_CODE = "NSE/TATAGLOBAL"
data = quandl.get(DATASET_CODE)

In [None]:
# Preview the first rows to see which columns the dataset provides
data.head()

In [None]:
# Size of the dataset as (number of rows, number of columns)
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

This notebook works on two problems with this data:

- **Regression problem**: predicting the 'Close' value.
- **Classification problem**: working with the buy and sell options (figuring out the next step to be taken).

In [None]:
# Visualize the closing price over the whole dataset to see its overall trend.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(data['Close'], label="Closing price")
ax.set_title("NSE/TATAGLOBAL closing price")
# The x-axis is the DataFrame index; fall back to a generic label if the index is unnamed.
ax.set_xlabel(data.index.name or "Index")
ax.set_ylabel("Close")
# The line is given a label, so draw the legend that actually displays it.
ax.legend()
plt.show()

In [None]:
# Create the two feature columns used by the models further down:
#   "Open - Close": opening price minus closing price
#   "High - Low":   highest price minus lowest price
data["Open - Close"] = data['Open'] - data['Close']
data["High - Low"] = data['High'] - data['Low']

# Drop rows containing null values. dropna() returns a new DataFrame and leaves
# the original untouched, so the result must be assigned back to take effect.
data = data.dropna()
data

In [None]:
# Count the distinct values in each column to spot candidate categorical columns;
# none of them stand out as obvious categories
data.nunique()

## CLASSIFICATION PROBLEM HERE
Labels: Buy (+1) when the next close is higher than the current close, Sell (-1) otherwise.

In [None]:
# Feature matrix: the two price-difference columns
X = data.loc[:, ['Open - Close', 'High - Low']]

In [None]:
# shift(-1) moves every 'Close' value up by one row, so each row is paired with the
# close of the row that follows it (the next day, assuming rows are in chronological order).
# The label is 1 when that following close is higher than the current close, and -1 otherwise.
# NOTE: the last row has no following close (NaN after the shift); the comparison is
# False there, so that row is labelled -1.
next_close = data['Close'].shift(-1)
Y = np.where(next_close > data['Close'], 1, -1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the samples for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=143,
)

### Implementing the KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Grid search over the number of neighbours (k = 2..15) using 5-fold cross-validation
parameters = {'n_neighbors': list(range(2, 16))}
knn = KNeighborsClassifier()
model = GridSearchCV(estimator=knn, param_grid=parameters, cv=5)

In [None]:
# Fit the grid search on the training split; this evaluates every n_neighbors
# value listed in `parameters` with cross-validation
model.fit(X_train,Y_train)

In [None]:
# Accuracy on the training split.
# accuracy_score expects (y_true, y_pred), so the true labels are passed first.
pred1 = model.predict(X_train)
accuracy_score(Y_train, pred1)

In [None]:
# Accuracy on the held-out test split.
# accuracy_score expects (y_true, y_pred), so the true labels are passed first.
pred2 = model.predict(X_test)
accuracy_score(Y_test, pred2)

In [None]:
# Low accuracies :(

## Implementing KNeighbors Regressor Model

In [None]:
# back to importing packages
from sklearn.neighbors import KNeighborsRegressor
from sklearn import neighbors

In [None]:
# Regression target: the 'Close' value itself.
# Y holds the +1/-1 buy/sell labels and is only meaningful for the classifier,
# so the regressor gets its own continuous target instead.
Y_reg = data['Close']

# Hold out 25% of the samples for testing; the fixed random_state makes the split repeatable
X_train_reg, X_test_reg, Y_train_reg, Y_test_reg = train_test_split(
    X,
    Y_reg,
    test_size=0.25,
    random_state=56,
)

In [None]:
# Grid search over the number of neighbours (k = 2..15) using 5-fold cross-validation,
# this time wrapped around the KNN regressor
parameters = {'n_neighbors': list(range(2, 16))}
knn_reg = KNeighborsRegressor()
model_reg = GridSearchCV(estimator=knn_reg, param_grid=parameters, cv=5)

In [None]:
# fit() returns the fitted search object, so predicting on the held-out split can be chained
pred_reg = model_reg.fit(X_train_reg, Y_train_reg).predict(X_test_reg)

In [None]:
# Root-mean-square error of the regressor on its own held-out split.
# pred_reg was produced from X_test_reg, so it has to be compared with Y_test_reg;
# Y_test belongs to the classifier's split (different random_state) and covers other rows.
errors = np.asarray(Y_test_reg) - np.asarray(pred_reg)
rms = np.sqrt(np.mean(errors ** 2))
rms