# Name : Shanthan Rao

# Task 7 : Stock Market Prediction 

In [1]:
# importing libraries
import warnings
warnings.filterwarnings('ignore')
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, metrics
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Dense, Activation
import nltk
nltk.download('vader_lexicon')
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# reading the datasets into pandas
stock_price = pd.read_csv('BSESN.csv')

In [3]:
stock_headlines = pd.read_csv('india-news-headlines.csv')

In [4]:
stock_price.head()

Unnamed: 0,Date,Open,High,Low,Close*,Adj. close**,Volume
0,7-May-21,49169.14,49417.64,49036.38,49206.47,49206.47,-
1,6-May-21,48877.78,49011.31,48614.11,48949.76,48949.76,9500
2,5-May-21,48569.12,48742.72,48254.32,48677.55,48677.55,12700
3,4-May-21,48881.63,48996.53,48149.45,48253.51,48253.51,13200
4,3-May-21,48356.01,48863.23,48028.07,48718.52,48718.52,14700


In [5]:
len(stock_price), len(stock_headlines)

(100, 3424067)

In [6]:
stock_price.isna().any(), stock_headlines.isna().any()

(Date            False
 Open            False
 High            False
 Low             False
 Close*          False
 Adj. close**    False
 Volume          False
 dtype: bool,
 publish_date         False
 headline_category    False
 headline_text        False
 dtype: bool)

In [7]:
#dropping any null entries in our data
stock_price.dropna(axis=0, inplace=True)

In [8]:
# dropping duplicates in data
stock_price = stock_price.drop_duplicates()

# coverting the datatype of column 'Date' from type object to type 'datetime'
stock_price['Date'] = pd.to_datetime(stock_price['Date']).dt.normalize()

# filtering the important columns
stock_price = stock_price.filter(['Date', 'Close', 'Open', 'High', 'Low', 'Volume'])

# setting column 'Date' as the index column
stock_price.set_index('Date', inplace= True)

# sorting the data according to Date
stock_price = stock_price.sort_index(ascending=True, axis=0)
stock_price

Unnamed: 0_level_0,Open,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-12-10,45999.42,46043.97,45685.87,12700
2020-12-11,46060.32,46309.63,45706.22,26300
2020-12-14,46284.70,46373.34,45951.53,18400
2020-12-15,46287.39,46350.30,45841.67,16400
2020-12-16,46573.31,46704.97,46402.20,13100
...,...,...,...,...
2021-05-03,48356.01,48863.23,48028.07,14700
2021-05-04,48881.63,48996.53,48149.45,13200
2021-05-05,48569.12,48742.72,48254.32,12700
2021-05-06,48877.78,49011.31,48614.11,9500


In [9]:
# dropping duplicates
stock_headlines = stock_headlines.drop_duplicates()

# coverting the datatype of column 'Date' from type string to type 'datetime'
stock_headlines['publish_date'] = stock_headlines['publish_date'].astype(str)
stock_headlines['publish_date'] = stock_headlines['publish_date'].apply(lambda x: x[0:4]+'-'+x[4:6]+'-'+x[6:8])
stock_headlines['publish_date'] = pd.to_datetime(stock_headlines['publish_date']).dt.normalize()

# filtering the important columns 
stock_headlines = stock_headlines.filter(['publish_date', 'headline_text'])

# grouping the news headlines according to Date
stock_headlines = stock_headlines.groupby(['publish_date'])['headline_text'].apply(lambda x: ','.join(x)).reset_index()

# setting Date as the index column
stock_headlines.set_index('publish_date', inplace= True)

# sorting the data according to the Date
stock_headlines = stock_headlines.sort_index(ascending=True, axis=0)
stock_headlines

Unnamed: 0_level_0,headline_text
publish_date,Unnamed: 1_level_1
2001-01-02,Status quo will not be disturbed at Ayodhya; s...
2001-01-03,"Powerless north India gropes in the dark,Think..."
2001-01-04,The string that pulled Stephen Hawking to Indi...
2001-01-05,Light combat craft takes India into club class...
2001-01-06,Light combat craft takes India into club class...
...,...
2020-12-27,#BigInterview! Dhritiman Chatterjee: Nobody da...
2020-12-28,Horoscope Today; 28 December 2020: Check astro...
2020-12-29,Man recovers charred remains of 'thief' from h...
2020-12-30,Numerology Readings 30 December 2020: Predicti...


In [10]:
# concatenation of the datasets stock_price and stock_headlines
stock_data = pd.concat([stock_price, stock_headlines], axis=1)

stock_data.dropna(axis=0, inplace=True)

# displaying the combined stock_data
stock_data

Unnamed: 0,Open,High,Low,Volume,headline_text
2020-12-10,45999.42,46043.97,45685.87,12700,"Gold imports plunge; demand dips,Horoscope Tod..."
2020-12-11,46060.32,46309.63,45706.22,26300,Ways to get you in the mood for sex when you'r...
2020-12-14,46284.7,46373.34,45951.53,18400,"No threat from Covid claims to balance sheets,..."
2020-12-15,46287.39,46350.3,45841.67,16400,1;147 RT-PCR tests in a day; only 3 found +ve ...
2020-12-16,46573.31,46704.97,46402.2,13100,Asmita Sood's favourite holiday destinations a...
2020-12-17,46774.32,46992.57,46627.6,17200,Horoscope Today; 17 December 2020: Check astro...
2020-12-18,47026.02,47026.02,46630.31,12800,Cops; panch witnesses sniffed suspects' mouths...
2020-12-21,46932.18,47055.69,44923.08,24400,Horoscope Today; 21 December 2020: Check astro...
2020-12-22,45529.61,46080.18,45112.19,25100,Addite and Mohit Malik: We are looking forward...
2020-12-23,46072.3,46513.32,45899.1,10500,"Sawmill in Makarpura GIDC gutted; none hurt,Ci..."


In [11]:
#Sentiment Analysis
stock_data['compound'] = ''
stock_data['negative'] = ''
stock_data['neutral'] = ''
stock_data['positive'] = ''
stock_data.head()

Unnamed: 0,Open,High,Low,Volume,headline_text,compound,negative,neutral,positive
2020-12-10,45999.42,46043.97,45685.87,12700,"Gold imports plunge; demand dips,Horoscope Tod...",,,,
2020-12-11,46060.32,46309.63,45706.22,26300,Ways to get you in the mood for sex when you'r...,,,,
2020-12-14,46284.7,46373.34,45951.53,18400,"No threat from Covid claims to balance sheets,...",,,,
2020-12-15,46287.39,46350.3,45841.67,16400,1;147 RT-PCR tests in a day; only 3 found +ve ...,,,,
2020-12-16,46573.31,46704.97,46402.2,13100,Asmita Sood's favourite holiday destinations a...,,,,


In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata

#Sentiment Analyzer
sid = SentimentIntensityAnalyzer()
print('Start calculating sentiment scores:')

stock_data['compound'] = stock_data['headline_text'].apply(lambda x: sid.polarity_scores(x)['compound'])
print('Compound Done')
stock_data['negative'] = stock_data['headline_text'].apply(lambda x: sid.polarity_scores(x)['neg'])
print('Negative Done')
stock_data['neutral'] = stock_data['headline_text'].apply(lambda x: sid.polarity_scores(x)['neu'])
print('Neutral Done')
stock_data['positive'] = stock_data['headline_text'].apply(lambda x: sid.polarity_scores(x)['pos'])
print('Positive Done')
print('Stop')

stock_data.head()

Start calculating sentiment scores:
Compound Done
Negative Done
Neutral Done
Positive Done
Stop


Unnamed: 0,Open,High,Low,Volume,headline_text,compound,negative,neutral,positive
2020-12-10,45999.42,46043.97,45685.87,12700,"Gold imports plunge; demand dips,Horoscope Tod...",-0.9996,0.12,0.802,0.078
2020-12-11,46060.32,46309.63,45706.22,26300,Ways to get you in the mood for sex when you'r...,-0.9999,0.147,0.772,0.081
2020-12-14,46284.7,46373.34,45951.53,18400,"No threat from Covid claims to balance sheets,...",-0.9993,0.112,0.808,0.081
2020-12-15,46287.39,46350.3,45841.67,16400,1;147 RT-PCR tests in a day; only 3 found +ve ...,-0.9998,0.123,0.802,0.074
2020-12-16,46573.31,46704.97,46402.2,13100,Asmita Sood's favourite holiday destinations a...,-0.9994,0.105,0.82,0.075


In [13]:
stock_data.isna().any()

Open             False
High             False
Low              False
Volume           False
headline_text    False
compound         False
negative         False
neutral          False
positive         False
dtype: bool

In [14]:
stock_data.describe(include='all')

Unnamed: 0,Open,High,Low,Volume,headline_text,compound,negative,neutral,positive
count,15.0,15.0,15.0,15.0,15,15.0,15.0,15.0,15.0
unique,15.0,15.0,15.0,14.0,15,,,,
top,47466.62,46704.97,46539.02,12800.0,Asmita Sood's favourite holiday destinations a...,,,,
freq,1.0,1.0,1.0,2.0,1,,,,
mean,,,,,,-0.9995,0.122467,0.797067,0.080467
std,,,,,,0.000687,0.018146,0.026628,0.009694
min,,,,,,-0.9999,0.082,0.742,0.064
25%,,,,,,-0.9998,0.116,0.7815,0.0755
50%,,,,,,-0.9997,0.123,0.8,0.08
75%,,,,,,-0.99955,0.129,0.805,0.0865


In [15]:

# displaying stock_data information
stock_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15 entries, 2020-12-10 to 2020-12-31
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Open           15 non-null     object 
 1   High           15 non-null     object 
 2   Low            15 non-null     object 
 3   Volume         15 non-null     object 
 4   headline_text  15 non-null     object 
 5   compound       15 non-null     float64
 6   negative       15 non-null     float64
 7   neutral        15 non-null     float64
 8   positive       15 non-null     float64
dtypes: float64(4), object(5)
memory usage: 1.2+ KB


In [16]:

# calculating 7 day rolling mean
stock_data.rolling(7).mean().head(20)

Unnamed: 0,compound,negative,neutral,positive
2020-12-10,,,,
2020-12-11,,,,
2020-12-14,,,,
2020-12-15,,,,
2020-12-16,,,,
2020-12-17,,,,
2020-12-18,-0.999643,0.123143,0.797857,0.079
2020-12-21,-0.999629,0.120714,0.802143,0.077143
2020-12-22,-0.999629,0.122286,0.797857,0.079714
2020-12-23,-0.999671,0.123429,0.796714,0.079571


In [17]:
# displaying stock_data
stock_data

Unnamed: 0,Open,High,Low,Volume,headline_text,compound,negative,neutral,positive
2020-12-10,45999.42,46043.97,45685.87,12700,"Gold imports plunge; demand dips,Horoscope Tod...",-0.9996,0.12,0.802,0.078
2020-12-11,46060.32,46309.63,45706.22,26300,Ways to get you in the mood for sex when you'r...,-0.9999,0.147,0.772,0.081
2020-12-14,46284.7,46373.34,45951.53,18400,"No threat from Covid claims to balance sheets,...",-0.9993,0.112,0.808,0.081
2020-12-15,46287.39,46350.3,45841.67,16400,1;147 RT-PCR tests in a day; only 3 found +ve ...,-0.9998,0.123,0.802,0.074
2020-12-16,46573.31,46704.97,46402.2,13100,Asmita Sood's favourite holiday destinations a...,-0.9994,0.105,0.82,0.075
2020-12-17,46774.32,46992.57,46627.6,17200,Horoscope Today; 17 December 2020: Check astro...,-0.9997,0.123,0.801,0.076
2020-12-18,47026.02,47026.02,46630.31,12800,Cops; panch witnesses sniffed suspects' mouths...,-0.9998,0.132,0.78,0.088
2020-12-21,46932.18,47055.69,44923.08,24400,Horoscope Today; 21 December 2020: Check astro...,-0.9995,0.103,0.832,0.065
2020-12-22,45529.61,46080.18,45112.19,25100,Addite and Mohit Malik: We are looking forward...,-0.9999,0.158,0.742,0.099
2020-12-23,46072.3,46513.32,45899.1,10500,"Sawmill in Makarpura GIDC gutted; none hurt,Ci...",-0.9996,0.12,0.8,0.08


In [18]:

# calculating data_to_use
percentage_of_data = 1.0
data_to_use = int(percentage_of_data*(len(stock_data)-1))

# using 80% of data for training
train_end = int(data_to_use*0.8)
total_data = len(stock_data)
start = total_data - data_to_use

# printing number of records in the training and test datasets
print("Number of records in Training Data:", train_end)
print("Number of records in Test Data:", total_data - train_end)

Number of records in Training Data: 11
Number of records in Test Data: 4
