In [1]:
# Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import plot

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
# reading the dataset

TCS = pd.read_csv('/Users/apple/Downloads/TCS.NSE.csv')

In [3]:
# head of dataset

TCS.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-01-01,1283.5,1283.5,1270.5,1272.775024,1114.909302,366830.0
1,2015-01-02,1275.5,1295.474976,1275.300049,1289.724976,1129.757202,925740.0
2,2015-01-05,1290.5,1299.949951,1262.324951,1270.125,1112.587769,1754242.0
3,2015-01-06,1264.550049,1264.550049,1220.0,1223.300049,1071.570801,2423784.0
4,2015-01-07,1235.0,1239.574951,1203.724976,1208.849976,1058.912842,2636332.0


In [4]:
print(TCS['Volume'])

0        366830.0
1        925740.0
2       1754242.0
3       2423784.0
4       2636332.0
          ...    
1661    2320754.0
1662    1673362.0
1663    2253075.0
1664    2489161.0
1665    2252412.0
Name: Volume, Length: 1666, dtype: float64


In [5]:
TCS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1666 entries, 0 to 1665
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1666 non-null   object 
 1   Open       1663 non-null   float64
 2   High       1663 non-null   float64
 3   Low        1663 non-null   float64
 4   Close      1663 non-null   float64
 5   Adj Close  1663 non-null   float64
 6   Volume     1663 non-null   float64
dtypes: float64(6), object(1)
memory usage: 91.2+ KB


In [6]:
TCS.isnull().sum()

Date         0
Open         3
High         3
Low          3
Close        3
Adj Close    3
Volume       3
dtype: int64

In [7]:
TCS['Volume'].describe()

count    1.663000e+03
mean     2.871305e+06
std      2.752632e+06
min      8.682200e+04
25%      1.738917e+06
50%      2.404448e+06
75%      3.328333e+06
max      8.806715e+07
Name: Volume, dtype: float64

In [8]:
# filling the null values

TCS['Volume'].fillna(TCS['Volume'].mean(),inplace=True)
TCS['Open'].fillna(TCS['Volume'].mean(),inplace=True)
TCS['High'].fillna(TCS['High'].mean(),inplace=True)
TCS['Low'].fillna(TCS['Low'].mean(),inplace=True)
TCS['Close'].fillna(TCS['Close'].mean(),inplace=True)
TCS['Adj Close'].fillna(TCS['Adj Close'].mean(),inplace=True)

In [9]:
TCS.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [10]:
# changing the datatype of volume column

TCS['Volume'] = TCS['Volume'].astype('int64')

In [11]:
TCS.Volume.dtypes

dtype('int64')

In [12]:
# converting Date column to Datetime

TCS['Date'] = pd.to_datetime(TCS['Date'])

In [13]:
print(f'Dataframe contains stock prices between {TCS.Date.min()} {TCS.Date.max()}') 
print(f'Total days = {(TCS.Date.max()  - TCS.Date.min()).days} days')

Dataframe contains stock prices between 2015-01-01 00:00:00 2021-09-30 00:00:00
Total days = 2464 days


In [14]:
# Describe method

TCS.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1666.0,1666.0,1666.0,1666.0,1666.0,1666.0
mean,6993.156,1845.240901,1805.624554,1825.2264,1733.238404,2871305.0
std,121694.8,685.696616,671.497093,678.378615,714.057642,2750151.0
min,1058.45,1070.0,1025.949951,1050.574951,955.857666,86822.0
25%,1256.0,1265.0,1243.018738,1254.812531,1129.929931,1741054.0
50%,1726.537,1753.200012,1705.200012,1725.862488,1623.03064,2406452.0
75%,2171.938,2189.400086,2145.012574,2164.487488,2078.331482,3324693.0
max,2871305.0,3981.75,3892.100098,3954.550049,3954.550049,88067150.0


In [15]:
# Setting the layout for our plot

layout = go.Layout(
    title='Stock Prices of TCS',
    xaxis=dict(
        title='Date',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Price',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

TCS_data = [{'x':TCS['Date'], 'y':TCS['Close']}]
plot = go.Figure(data=TCS_data, layout=layout)

In [16]:
iplot(plot)

In [17]:
# Building regression model

from sklearn.model_selection import train_test_split

# For preprocessing

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# For model evaluation

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

In [18]:
# split the data into training and testing 

X = np.array(TCS.index).reshape(-1,1)
y = TCS['Close']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [19]:
# Feature scaling

scaler = StandardScaler().fit(X_train)

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
# Creating a linear model

lm = LinearRegression()
lm.fit(X_train, y_train)

LinearRegression()

In [22]:
#Plot actual and predicted values for train dataset

trace0 = go.Scatter(
    x = X_train.T[0],
    y = y_train,
    mode = 'markers',
    name = 'Actual'
)
trace1 = go.Scatter(
    x = X_train.T[0],
    y = lm.predict(X_train).T,
    mode = 'lines',
    name = 'Predicted'
)
TCS_data = [trace0,trace1]
layout.xaxis.title.text = 'Day'
plot2 = go.Figure(data=TCS_data, layout=layout)

In [23]:
iplot(plot2)