<a href="https://colab.research.google.com/github/stacykeago/KPMG/blob/main/KPMG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Define the question
The stock price of Tata Motors recently rose by more than 10 per cent, which drew attention to Tata Group stocks from all over India. Today, however, we are witnessing a fall in the price of Tata Motors' shares, which can be a negative signal for investors. We want to learn how to analyze and predict the Tata Motors stock price.

2. Metric for success
Making correct predictions of the Tata Motors stock price, using the model with the lowest mean squared error or the model with the highest accuracy.

3. Experimental design taken
Import libraries
Load the dataset
Clean the data
Exploratory data analysis techniques

4. Appropriateness of available data to answer the given question. To get the data:
        *   Visit Yahoo Finance
        *   Search for Tata Motors or TTM (it’s the stock symbol of Tata Motors)
        *   Then click on Historical data and click on download.         


# DATA CLEANING

## Import libraries

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt       # for data visualization
import seaborn as sns                       # for statistical data visualization
%matplotlib inline

## Load Datasets

In [None]:
# Read the Tata Motors (TTM) price history from a CSV located at the filesystem root.
# NOTE(review): presumably this is the Yahoo Finance download uploaded to the Colab runtime — confirm the path.
ttm = pd.read_csv('/TTM.csv')
# Display the loaded frame as the cell output
ttm

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
ttm.shape

## Dataset summary info

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the dataset
ttm.info()

## Numeric Features Description

In [None]:
# Summary statistics (count, mean, standard deviation, minimum, quartiles, maximum),
# transposed so that each described column becomes a row
ttm.describe().T

# Preprocessing data

### Column list

In [None]:
# List the column labels of the dataset
ttm.columns

###  Check for duplicates

In [None]:
# True if at least one row is an exact duplicate of an earlier row
ttm.duplicated().any()

### Check for unique

In [None]:
# Number of distinct values in each column
ttm.nunique()

### Data types in each column

In [None]:
# Data type pandas inferred for each column
ttm.dtypes

#### Convert the Date column to datetime format

In [None]:
# Parse the 'Date' column into pandas datetime values so it can be sorted and grouped by month
ttm['Date'] = pd.to_datetime(ttm['Date'])
ttm.head()

### Check for null values

In [None]:
# Count of missing values in each column
ttm.isnull().sum()

### Check for NA Values

In [None]:
# Report whether any cell in the whole frame is missing (isna and isnull are aliases in pandas)
print("NA values:", ttm.isna().values.any())

## Sorting the dataset by date

In [None]:
# Put the rows in chronological order (oldest first); the original row labels are kept
ttm = ttm.sort_values(by='Date')
ttm.head()

### Outliers

In [None]:
# Map every numeric column to its 1-based position in the subplot grid
columns_dict = dict(zip(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], range(1, 7)))
plt.figure(figsize=(20, 30))

# Draw one boxplot per numeric column to expose outliers
for column_name, slot in columns_dict.items():
    axis = plt.subplot(5, 4, slot)
    axis.boxplot(ttm[column_name])
    axis.set_title(column_name)

plt.show()

# EXPLORATORY DATA ANALYSIS

## Monthwise comparison between Stock open and close price

In [None]:
# Calendar order used to arrange the month-name index
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# Average open and close price per month name (all years pooled), arranged January..December
month_names = ttm['Date'].dt.strftime('%B')
monthwise = ttm[['Open', 'Close']].groupby(month_names).mean().reindex(new_order)
monthwise

## Plot Monthwise comparison between Stock open and close price

In [None]:
# Grouped bar chart of the monthly average open and close prices
bar_specs = (
    ('Open', 'Stock Open Price', 'crimson'),
    ('Close', 'Stock Close Price', 'lightsalmon'),
)

fig = go.Figure()
for column_name, trace_label, bar_colour in bar_specs:
    fig.add_trace(go.Bar(
        x=monthwise.index,
        y=monthwise[column_name],
        name=trace_label,
        marker_color=bar_colour))

fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    title='Monthwise comparision between Stock open and close price',
)
fig.show()

## Monthwise High and Low stock price

In [None]:
# Monthwise High and Low stock price:
# highest 'High' and lowest 'Low' recorded in each month name (all years pooled).
# The original cell only returned the 'Low' minimum although it is meant to show both.
ttm.groupby(ttm['Date'].dt.strftime('%B')).agg({'High': 'max', 'Low': 'min'})

## Plot Monthwise High and Low stock price

In [None]:
# Highest 'High' and lowest 'Low' per month name, arranged January..December.
# (A leading groupby whose result was discarded has been removed.)
month_names = ttm['Date'].dt.strftime('%B')
monthwise_high = ttm.groupby(month_names)['High'].max()
monthwise_high = monthwise_high.reindex(new_order, axis=0)

monthwise_low = ttm.groupby(month_names)['Low'].min()
monthwise_low = monthwise_low.reindex(new_order, axis=0)

fig = go.Figure()
fig.add_trace(go.Bar(
    x=monthwise_high.index,
    y=monthwise_high,
    name='Stock high Price',
    marker_color='rgb(0, 153, 204)'))

fig.add_trace(go.Bar(
    x=monthwise_low.index,
    y=monthwise_low,
    name='Stock low Price',
    marker_color='rgb(255, 128, 0)'))

# Lay out and show the figure explicitly, like the open/close chart, instead of
# relying on the notebook echoing the last expression.
fig.update_layout(barmode='group',
                  title=' Monthwise High and Low stock price')
fig.show()

## Trend comparison between stock open price, close price, high price, low price

In [None]:
from itertools import cycle
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# Legend labels, consumed in the same order as the traces are created
names = cycle(['Stock Open Price','Stock Close Price','Stock High Price','Stock Low Price'])

# One line per price column, all plotted against the date
price_series = [ttm[column_name] for column_name in ('Open', 'Close', 'High', 'Low')]
fig = px.line(
    ttm,
    x=ttm['Date'],
    y=price_series,
    labels={'Date': 'Date','value':'Stock value'},
)
fig.update_layout(
    title_text='Stock analysis chart',
    font_size=15,
    font_color='black',
    legend_title_text='Stock Parameters',
)
# Replace the default trace names with the readable labels
fig.for_each_trace(lambda trace: trace.update(name=next(names)))
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.show()



###  Close price prediction preparation and preprocessing

In [None]:
# Keep only the date and closing-price columns for the prediction work

closedf = ttm[['Date','Close']]
print("Shape of close dataframe:", closedf.shape)

#### Plotting Stock Close price chart

In [None]:
# Line chart of the closing price over the full date range
fig = px.line(
    closedf,
    x=closedf['Date'],
    y=closedf['Close'],
    labels={'Date':'Date','Close':'Close Stock'},
)
fig.update_traces(marker_line_width=2, opacity=0.8)
fig.update_layout(
    title_text='Stock close price chart',
    plot_bgcolor='white',
    font_size=15,
    font_color='black',
)
# Hide the grid lines on both axes
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

## Print the duration of the dataset

In [None]:
# First and last dates of the frame (ttm was sorted by 'Date' earlier, so these are the
# earliest and latest dates). The 'Date' column is read by name rather than through
# `ttm.iloc[i][0]`, which depends on the column position and on positional Series[int]
# lookup, deprecated in recent pandas.
start_date = ttm['Date'].iloc[0]
end_date = ttm['Date'].iloc[-1]
print("Starting date: ", start_date)
print("Ending date: ", end_date)
print("Duration: ", end_date - start_date)

## Consider the last 6 months

In [None]:
# Keep only the rows dated after the hard-coded cutoff 2021-12-01.
# NOTE(review): presumably about six months before the download date — confirm when the data is refreshed.
closedf = closedf[closedf['Date'] > '2021-12-01']
# Snapshot of the filtered frame (dates included), taken before closedf is later replaced by scaled values
close_stock = closedf.copy()
print("Total data for prediction: ",closedf.shape[0])

In [None]:
# Line chart of the closing price restricted to the period kept for prediction
fig = px.line(
    closedf,
    x=closedf['Date'],
    y=closedf['Close'],
    labels={'Date':'Date','Close':'Close Stock'},
)
fig.update_traces(marker_line_width=2, opacity=0.8, marker_line_color='orange')
fig.update_layout(
    title_text='Considered last 6months period to predict Stock close price',
    plot_bgcolor='white',
    font_size=15,
    font_color='black',
)
# Hide the grid lines on both axes
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

# UNIVARIATE ANALYSIS

In [None]:
# Line plot of the closing price against the frame's row index
ttm[['Close']].plot()
plt.title("Tata Motors Stock prediction")
plt.show()

In [None]:
# Raise the level of matplotlib's axes logger so only ERROR messages and above are emitted.
# NOTE(review): `_log` is a private matplotlib name and may move between versions — confirm on upgrade.
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

In [None]:
# lag_plot was never imported in this notebook, so the call raised NameError;
# it is imported here from pandas.plotting.
from pandas.plotting import lag_plot

# Scatter of the opening price against itself 5 rows later
plt.figure(figsize=(10,10))
lag_plot(ttm.Open, lag=5)
plt.title("Tata Autocorrelation plot")


In [None]:
# Univariate analysis: one 39-bin histogram (no KDE curve) per column, missing values dropped
for column_name in ('Open', 'Close', 'Volume', 'High', 'Low'):
    sns.displot(ttm[column_name].dropna(), kde=False, bins=39)


## Pair plot

In [None]:
# Columns included in the pairwise scatter matrix
columns1 = [ 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

# Apply seaborn's default theme, then draw scatter plots off the diagonal and KDE curves on it
sns.set()
pair_frame = ttm[columns1]
sns.pairplot(pair_frame, height=2, kind='scatter', diag_kind='kde')
plt.show()

# BIVARIATE ANALYSIS

In [None]:
# Heatmap of the pairwise correlations between the numeric columns
plt.figure(figsize=(30, 12))
# Correlate numeric columns only: since pandas 2.0 `DataFrame.corr()` no longer drops
# non-numeric columns by default, and `ttm` carries the datetime 'Date' column.
vg_corr = ttm.select_dtypes(include='number').corr()
sns.heatmap(vg_corr, 
            xticklabels = vg_corr.columns.values,
            yticklabels = vg_corr.columns.values,
            annot = True);

In [None]:
# Map every numeric column to its 1-based position in the subplot grid
columns_dict ={ 'Open':1, 'High':2, 'Low':3, 'Close':4, 'Adj Close':5, 'Volume':6}

# Plot a histogram for every numeric column on one shared figure.
# sns.histplot (axes-level) is used instead of sns.displot: displot is figure-level and
# opens its own figure, so it ignores the subplot selected with plt.subplot.
plt.figure(figsize = (30,50))
for variable, i in columns_dict.items():
    ax = plt.subplot(5, 4, i)
    sns.histplot(ttm[variable], ax=ax)
    ax.set_title(variable)

# Show once, after every panel has been drawn
plt.show()

### Data Correlation

In [None]:
# Print the correlation matrix of the numeric columns only: since pandas 2.0
# `DataFrame.corr()` no longer drops non-numeric columns (here the datetime 'Date') by default.
print(ttm.select_dtypes(include='number').corr())

### Price prediction

In [None]:
!pip install autots
# Install the autots library, intended for forecasting the Tata Motors stock price for the next 5 days.
# NOTE(review): the AutoTS code in the following cell is commented out, so this install is currently unused.

In [None]:
#from autots import AutoTS
#model = AutoTS(forecast_length=5, frequency='infer', ensemble='simple')
#model = model.fit(ttm, date_col='Date', value_col='Close', id_col=None)
#prediction = model.predict()
#forecast = prediction.forecast
#print(forecast)

# RANDOM FOREST REGRESSION

### Normalizing / scaling close value between 0 to 1

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Keep an unscaled copy (dates included) before closedf is replaced by the scaled array
close_stock = closedf.copy()

# Scale the closing prices into [0, 1]. The 'Close' column is selected directly instead of
# deleting 'Date' in place, which mutated a filtered slice and can trigger pandas'
# SettingWithCopyWarning.
# NOTE(review): the scaler is fitted on the whole series before the train/test split, so
# test-period values influence the scaling — confirm this is acceptable.
scaler=MinMaxScaler(feature_range=(0,1))
closedf=scaler.fit_transform(closedf['Close'].to_numpy().reshape(-1,1))
print(closedf.shape)

### Split data for training and testing

In [None]:
# Chronological split: the first 65% of the rows train the model, the remainder tests it
training_size = int(len(closedf) * 0.65)
test_size = len(closedf) - training_size

train_data = closedf[:training_size, :]
test_data = closedf[training_size:, :1]

print("train_Data: ", train_data.shape)
print("test_Data: ", test_data.shape)

In [None]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, time_step=1):
    """Build sliding-window samples from the first column of a 2-D array.

    Each sample is ``time_step`` consecutive values of column 0 and its target
    is the value that immediately follows the window.

    Args:
        dataset: 2-D array indexable as ``dataset[i:j, 0]``.
        time_step: number of consecutive values in every input window.

    Returns:
        A pair ``(X, y)`` of NumPy arrays. ``X`` has one row per window and
        ``y`` holds the matching next values. Both are empty when ``dataset``
        has ``time_step`` rows or fewer.
    """
    dataX, dataY = [], []
    # The last usable window starts at len(dataset) - time_step - 1, so that its target
    # is the final row. The former bound (len - time_step - 1) stopped one window early
    # and silently dropped the most recent sample.
    for i in range(len(dataset) - time_step):
        dataX.append(dataset[i:(i + time_step), 0])
        dataY.append(dataset[i + time_step, 0])
    return np.array(dataX), np.array(dataY)

In [None]:
# Build supervised samples: every X row holds `time_step` consecutive scaled closes
# and the matching y is the close that follows them
time_step = 15
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

# Report the shape of each array
for label, samples in (
    ("X_train: ", X_train),
    ("y_train: ", y_train),
    ("X_test: ", X_test),
    ("y_test", y_test),
):
    print(label, samples.shape)

# KNN

In [None]:
from sklearn import neighbors

# The number of neighbours is set to the window length
K = time_step
# fit() returns the estimator itself, so construction and training can be chained
neighbor = neighbors.KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
neighbor

In [None]:
# Predict both splits and reshape the results into single-column arrays

train_predict = neighbor.predict(X_train).reshape(-1, 1)
test_predict = neighbor.predict(X_test).reshape(-1, 1)

print("Train data prediction:", train_predict.shape)
print("Test data prediction:", test_predict.shape)

In [None]:
# Map predictions and targets from the [0, 1] scale back to prices;
# every array is passed to the scaler as a single column

train_predict, test_predict, original_ytrain, original_ytest = (
    scaler.inverse_transform(scaled_values.reshape(-1, 1))
    for scaled_values in (train_predict, test_predict, y_train, y_test)
)

In [None]:
import math

In [None]:
# Evaluation metrics (RMSE, MSE and MAE) on the original price scale
train_mse = mean_squared_error(original_ytrain, train_predict)
test_mse = mean_squared_error(original_ytest, test_predict)

print("Train data RMSE: ", math.sqrt(train_mse))
print("Train data MSE: ", train_mse)
# Label corrected: this value is computed on the training split but was printed as "Test data MAE"
print("Train data MAE: ", mean_absolute_error(original_ytrain,train_predict))
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", math.sqrt(test_mse))
print("Test data MSE: ", test_mse)
print("Test data MAE: ", mean_absolute_error(original_ytest,test_predict))

# CONCLUSION

In [None]:
# Explained variance regression score
# The explained variance score explains the dispersion of errors of a given dataset, and the formula is written as follows:
#   explained_variance(y, \hat{y}) = 1 - Var(y - \hat{y}) / Var(y)
# Here Var(y - \hat{y}) and Var(y) are the variances of the prediction errors and of the actual values, respectively.
# Scores close to 1.0 are highly desired, indicating that the variance of the errors is small compared with the variance of the actual values.

In [None]:
# Explained variance of the predictions, reported for each split
for label, actual, predicted in (
    ("Train data explained variance regression score:", original_ytrain, train_predict),
    ("Test data explained variance regression score:", original_ytest, test_predict),
):
    print(label, explained_variance_score(actual, predicted))

In [None]:
# R2 score for regression
# R-squared is the share of the variance of the dependent variable that the
# regression model explains from its independent variable(s).
# 1 = best; 0 or below = worse

for label, actual, predicted in (
    ("Train data R2 score:", original_ytrain, train_predict),
    ("Test data R2 score:", original_ytest, test_predict),
):
    print(label, r2_score(actual, predicted))

In [None]:
from numpy import array

# Seed the rolling history with the last `time_step` scaled closes of the test split
temp_input = test_data[len(test_data)-time_step:].reshape(1, -1)[0].tolist()

lst_output = []
n_steps = time_step
pred_days = 10

# Recursive forecast: predict one step from the most recent `time_step` values,
# then append the prediction so it becomes part of the next window
for i in range(pred_days):
    x_input = np.array(temp_input[-time_step:]).reshape(1, -1)
    yhat = neighbor.predict(x_input)
    temp_input.extend(yhat.tolist())
    lst_output.extend(yhat.tolist())

print("Output of predicted next days: ", len(lst_output))

### Plotting last 15 days and next predicted 10 days

In [None]:
# 1-based day numbers for the observed window (1..time_step)
last_days=np.arange(1,time_step+1)
# Day numbers for the forecast horizon, continuing right after the observed window
day_pred=np.arange(time_step+1,time_step+pred_days+1)
print(last_days)
print(day_pred)

In [None]:
# Two separate NaN-filled columns covering the observed window followed by the forecast
# horizon. Previously both names were bound to the same `temp_mat` list, so each slice
# assignment changed "both" columns and the two plotted series were identical.
total_days = time_step + pred_days
last_original_days_value = [np.nan] * total_days
next_predicted_days_value = [np.nan] * total_days

# Observed closes fill the first `time_step` slots (back on the price scale)
last_original_days_value[:time_step] = (
    scaler.inverse_transform(closedf[len(closedf)-time_step:]).reshape(-1).tolist()
)
# Forecast closes fill the remaining `pred_days` slots
next_predicted_days_value[time_step:] = (
    scaler.inverse_transform(np.array(lst_output).reshape(-1,1)).reshape(-1).tolist()
)

new_pred_plot = pd.DataFrame({
    'last_original_days_value':last_original_days_value,
    'next_predicted_days_value':next_predicted_days_value
})

# Legend labels, consumed in the same order as the traces are created
names = cycle(['Last 15 days close price','Predicted next 10 days close price'])

# Plot the observed and the predicted column against the row index
plotted_series = [
    new_pred_plot[column_name]
    for column_name in ('last_original_days_value', 'next_predicted_days_value')
]
fig = px.line(
    new_pred_plot,
    x=new_pred_plot.index,
    y=plotted_series,
    labels={'value': 'Stock price','index': 'Timestamp'},
)
fig.update_layout(
    title_text='Compare last 15 days vs next 10 days',
    plot_bgcolor='white',
    font_size=15,
    font_color='black',
    legend_title_text='Close Price',
)
fig.for_each_trace(lambda trace: trace.update(name=next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

In [None]:
# Append the scaled forecast to the scaled history, then convert the whole
# series back to prices as a flat list
forecast_column = np.array(lst_output).reshape(-1, 1)
knndf = scaler.inverse_transform(np.vstack((closedf, forecast_column))).ravel().tolist()

names = cycle(['Close price'])

fig = px.line(knndf, labels={'value': 'Stock price','index': 'Timestamp'})
fig.update_layout(
    title_text='Plotting whole closing stock price with prediction',
    plot_bgcolor='white',
    font_size=15,
    font_color='black',
    legend_title_text='Stock',
)
fig.for_each_trace(lambda trace: trace.update(name=next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()