# The Sparks Foundation - GRIP Data Science and Business Analytics Internship

## Task: Stock Market Prediction using Numerical and Textual Analysis 

## Objective: 
### Create a hybrid model for stock price/performance prediction using numerical analysis of historical stock prices and sentiment analysis of news headlines

#### Stock to analyze and predict - Reliance (RIL)
#### Historical Stock Price Data - Yahoo Finance API: yfinance
#### Textual Data - https://bit.ly/36fFPI6

In [1]:
# Importing required libraries
import pandas as pd
!pip install yfinance
import yfinance as yf
import datetime as dt

# Importing NLTK Vader for sentiment Analysis
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Mounting google drive
from google.colab import drive
drive.mount('/content/gdrive')

Collecting yfinance
  Downloading https://files.pythonhosted.org/packages/a7/ee/315752b9ef281ba83c62aa7ec2e2074f85223da6e7e74efb4d3e11c0f510/yfinance-0.1.59.tar.gz
Collecting lxml>=4.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/30/c0/d0526314971fc661b083ab135747dc68446a3022686da8c16d25fcf6ef07/lxml-4.6.3-cp37-cp37m-manylinux2014_x86_64.whl (6.3MB)
[K     |████████████████████████████████| 6.3MB 9.2MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.59-py2.py3-none-any.whl size=23455 sha256=e31b5f93986a47520f83c2b8caf0d4b1b22334ff5513cd51b71fc0034dfdfc47
  Stored in directory: /root/.cache/pip/wheels/f8/2a/0f/4b5a86e1d52e451757eb6bc17fd899629f0925c777741b6d04
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfu



Mounted at /content/gdrive


In [2]:
# Helper that loads the news headlines and parses their publish dates
def get_news_data(link):
  """Read the news headlines CSV and convert its dates to datetimes.

  Args:
    link: Path, URL or buffer handed straight to `pd.read_csv`.

  Returns:
    The loaded DataFrame, with `publish_date` parsed from YYYYMMDD
    values into pandas datetimes.
  """
  headlines = pd.read_csv(link)

  # The dates are cast to strings first so they can be parsed
  # with the explicit '%Y%m%d' format.
  date_strings = headlines['publish_date'].astype('str')
  headlines['publish_date'] = pd.to_datetime(date_strings, format='%Y%m%d')

  return headlines

In [3]:
# Loading the headlines CSV from the mounted Drive and previewing the first rows
link = '/content/gdrive/MyDrive/Colab Notebooks/india-news-headlines.csv'
news_data = get_news_data(link=link)
news_data.head(n=5)

Unnamed: 0,publish_date,headline_category,headline_text
0,2001-01-02,unknown,Status quo will not be disturbed at Ayodhya; s...
1,2001-01-02,unknown,Fissures in Hurriyat over Pak visit
2,2001-01-02,unknown,America's unwanted heading for India?
3,2001-01-02,unknown,For bigwigs; it is destination Goa
4,2001-01-02,unknown,Extra buses to clear tourist traffic


##### Inference:
1. The publish_date column has been converted to DateTime Format.
2. We can observe that there are multiple headlines for a particular day.

In [14]:
# Calculating polarity scores for headlines
analyzer = SentimentIntensityAnalyzer() #initializing Sentiment Intensity Analyzer

# Score every headline exactly once; each call returns a dict holding the
# 'compound', 'neu', 'pos' and 'neg' values, so the four columns below are
# read from the same cached result instead of re-running the analyzer.
headline_scores = [analyzer.polarity_scores(h) for h in news_data['headline_text']]

news_data['Compound'] = [scores['compound'] for scores in headline_scores]
news_data['Neutral'] = [scores['neu'] for scores in headline_scores]
news_data['Positive'] = [scores['pos'] for scores in headline_scores]
news_data['Negative'] = [scores['neg'] for scores in headline_scores]

In [18]:
# Calculating the average polarity scores for each publish date
score_columns = ['Compound', 'Neutral', 'Positive', 'Negative']

# A single groupby pass replaces re-filtering the whole frame for every date.
# sort=False keeps the dates in order of first appearance, matching the order
# that news_data['publish_date'].unique() would give.
sentiment_df = (
  news_data
  .groupby('publish_date', sort=False)[score_columns]
  .mean()
  .reset_index()
  .rename(columns={'publish_date': 'Date'})
)

# Per-date lookup of the averaged scores: {date: {'Compound': ..., 'Neutral': ...,
# 'Positive': ..., 'Negative': ...}}, keyed by the datetime64 values of the dates.
sentiments = dict(zip(sentiment_df['Date'].to_numpy(),
                      sentiment_df[score_columns].to_dict('records')))

sentiment_df.to_csv('sentiments.csv') #exporting to csv


In [17]:
# Previewing the per-date average sentiment scores
# NOTE(review): re-run this cell after the aggregation cell; the saved preview
# shows identical Positive and Negative values, which looks stale - confirm.
sentiment_df.head(n=5)

Unnamed: 0,Date,Compound,Neutral,Positive,Negative
0,2001-01-02,-0.018231,0.831279,0.071965,0.071965
1,2001-01-03,-0.0148,0.812244,0.086976,0.086976
2,2001-01-04,0.009842,0.838464,0.085294,0.085294
3,2001-01-05,0.028646,0.772841,0.128333,0.128333
4,2001-01-06,-0.019683,0.768917,0.122917,0.122917


In [19]:
# Helper that downloads a stock's price history and stores it as a csv
def get_historical_data(ticker, start, end):
  """Download price history with yfinance and export it to CSV.

  Args:
    ticker: Ticker string passed to `yf.download` as `tickers`.
    start: Start date forwarded to `yf.download`.
    end: End date forwarded to `yf.download`.

  Returns:
    The data returned by `yf.download`. It is also written to
    '<first whitespace-separated token of ticker>_historical_data.csv'.
  """
  prices = yf.download(tickers=ticker, start=start, end=end)

  # The file name is built from the first whitespace-separated token of the ticker
  csv_name = ticker.split()[0] + '_historical_data.csv'
  prices.to_csv(csv_name)

  return prices

In [21]:
# Getting Stock Data for Reliance over the full span of the news dates
ticker = 'RELIANCE.NS'

# min()/max() do not depend on sentiment_df being sorted by date
first_news_date = sentiment_df['Date'].min()
last_news_date = sentiment_df['Date'].max()

start = first_news_date.strftime('%Y-%m-%d')
# yfinance treats `end` as exclusive, so step one day past the last news date
# to keep that day's prices in the download.
end = (last_news_date + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

historical_data = get_historical_data(ticker, start, end)
historical_data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-02,52.661041,55.546684,52.645569,54.819473,42.817074,42810928
2001-01-03,54.92778,55.964447,54.618328,55.709148,43.511967,57159622
2001-01-04,55.562157,56.769024,55.113453,55.361015,43.240051,38666386
2001-01-05,55.15987,56.753551,55.15987,56.366734,44.02557,48215173
2001-01-08,56.088226,56.854122,55.275913,55.732357,43.53009,44224788


In [25]:
# Joining the stock prices with the daily sentiment scores on their shared 'Date'
data = historical_data.merge(sentiment_df, on='Date')
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Compound,Neutral,Positive,Negative
0,2001-01-02,52.661041,55.546684,52.645569,54.819473,42.817074,42810928,-0.018231,0.831279,0.071965,0.096756
1,2001-01-03,54.92778,55.964447,54.618328,55.709148,43.511967,57159622,-0.0148,0.812244,0.086976,0.100805
2,2001-01-04,55.562157,56.769024,55.113453,55.361015,43.240051,38666386,0.009842,0.838464,0.085294,0.076235
3,2001-01-05,55.15987,56.753551,55.15987,56.366734,44.02557,48215173,0.028646,0.772841,0.128333,0.09881
4,2001-01-08,56.088226,56.854122,55.275913,55.732357,43.53009,44224788,-0.055812,0.819682,0.08197,0.098348


In [24]:
# Saving the merged price and sentiment table to csv
output_csv = 'ril_data.csv'
data.to_csv(output_csv)