<a href="https://colab.research.google.com/github/sheshrajkandel/news-sentiment-stock-price-prediction/blob/main/GenerateNewsSentimentScores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Import the required packages**

In [16]:
import nltk
import pandas as pd
import datetime
# Fetch the VADER lexicon (reported as already up to date when it is present).
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The CSV paths used in this notebook live under /content/drive/MyDrive.
# Uncomment the two lines below to mount Google Drive when running in Colab
# and the drive is not already mounted.
#from google.colab import drive
#drive.mount('/content/drive')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [17]:
# Raw NABIL news headlines (one row per headline, with its publication date).
news_data = "/content/drive/MyDrive/Dissertation/NABIL_raw_news_data.csv"
df_news = pd.read_csv(filepath_or_buffer=news_data)

# Preview the first rows to confirm the Date / Headline columns loaded.
df_news.head(n=5)

Unnamed: 0,Date,Headline
0,5/27/2022,IJ Global Magazine lists Nabil Bank as TOP 20 ...
1,5/26/2022,Nabil Bank partners with Nepal Medical Associa...
2,5/24/2022,Mr. Gyanendra Prasad Dhungana To Be Appointed ...
3,5/22/2022,Details Regarding The Press Release Of Nabil B...
4,5/19/2022,Nabil Bank signs MoU with Office of Investment...


In [18]:
# Headline text column; this is the input to the sentiment scoring below.
headlines = df_news.loc[:, "Headline"]

### **Generate Sentiment Analysis Scores from News Headlines**

In [19]:
output_file = "/content/drive/MyDrive/Dissertation/nabil_news_score_data.csv"

# Build the analyzer once and reuse it for every headline; it does not depend
# on the headline, so re-creating it inside the loop is wasted work.
sid = SentimentIntensityAnalyzer()

# Keep only VADER's 'compound' value for each headline.
score_list = [sid.polarity_scores(headline)['compound'] for headline in headlines]

# Attach the scores using the headlines' own index so each score lands on the
# row it was computed from; dtype=float also keeps an empty input well-defined.
df_news['SA_Score'] = pd.Series(score_list, index=headlines.index, dtype=float)
df_news.to_csv(output_file, index = False)

In [20]:
# Preview the headlines together with their new SA_Score column.
df_news.head(n=5)

Unnamed: 0,Date,Headline,SA_Score
0,5/27/2022,IJ Global Magazine lists Nabil Bank as TOP 20 ...,0.368
1,5/26/2022,Nabil Bank partners with Nepal Medical Associa...,0.4019
2,5/24/2022,Mr. Gyanendra Prasad Dhungana To Be Appointed ...,0.0
3,5/22/2022,Details Regarding The Press Release Of Nabil B...,0.0
4,5/19/2022,Nabil Bank signs MoU with Office of Investment...,0.3818


### **Find the date gaps and fill the sentiment analysis scores with 0**

In [21]:
# Raw NABIL stock data.
input_stock_data = "/content/drive/MyDrive/Dissertation/NABIL_raw_stock_data.csv"
df_stock_data = pd.read_csv(input_stock_data)

# Per-headline sentiment scores (same path the scoring cell writes to).
input_news_score = "/content/drive/MyDrive/Dissertation/nabil_news_score_data.csv"
df_news_score = pd.read_csv(input_news_score)

# Create a column SA_Score with a float default. Using 0.0 (not 0) makes the
# column float64 from the start, so the fractional scores written into it row
# by row later do not require an int -> float upcast (deprecated in recent
# pandas versions).
df_stock_data['SA_Score'] = 0.0

# Parse the m/d/Y strings into datetime.date objects so that dates from both
# frames can be compared and stepped with datetime.timedelta.
df_news_score['Date'] = pd.to_datetime(df_news_score['Date'], format='%m/%d/%Y').dt.date
df_stock_data['Date'] = pd.to_datetime(df_stock_data['Date'], format='%m/%d/%Y').dt.date

In [22]:
# Preview the stock rows with the default SA_Score column in place.
df_stock_data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,%_Change,Volume,SA_Score
0,2022-05-31,890.0,890.0,865.0,875.0,-1.69,45502,0
1,2022-05-30,900.0,901.0,881.0,890.0,-0.89,44382,0
2,2022-05-27,900.0,900.0,896.0,898.0,-0.07,39790,0
3,2022-05-26,918.0,918.0,897.0,898.6,-0.16,39717,0
4,2022-05-25,915.0,915.0,898.0,900.0,0.32,38263,0


In [23]:
# Preview the news scores after the Date column was converted to dates.
df_news_score.head(n=5)

Unnamed: 0,Date,Headline,SA_Score
0,2022-05-27,IJ Global Magazine lists Nabil Bank as TOP 20 ...,0.368
1,2022-05-26,Nabil Bank partners with Nepal Medical Associa...,0.4019
2,2022-05-24,Mr. Gyanendra Prasad Dhungana To Be Appointed ...,0.0
3,2022-05-22,Details Regarding The Press Release Of Nabil B...,0.0
4,2022-05-19,Nabil Bank signs MoU with Office of Investment...,0.3818


### **Assign the SA_Score of the previous date to the current date and prepare the final stock dataset with SA_Score**

In [24]:
def get_score(previous_date):
  """Return the sentiment score recorded in df_news_score for a date.

  Args:
    previous_date: the date to look up in df_news_score['Date'].

  Returns:
    The SA_Score of the first row whose Date equals previous_date, or 0.0
    when no row has that date. Only the first matching row is used, so any
    further headlines on the same date do not contribute.
  """
  matches = df_news_score.loc[df_news_score['Date'] == previous_date]
  if matches.empty:
    return 0.0
  return matches.iloc[0]['SA_Score']

# Fractional scores are written into SA_Score row by row below, so make sure
# the column is float before the loop starts (no-op if it already is).
df_stock_data['SA_Score'] = df_stock_data['SA_Score'].astype(float)

# Rows are paired by label (index and index + 1), so this relies on the frame
# having its default 0..n-1 index, with the row at index + 1 holding the
# earlier date.
row_count = len(df_stock_data.index)

for index, row in df_stock_data.iterrows():
  if index + 1 == row_count:
    # The last row has no following row to pair with; it keeps its default.
    continue

  current_date = row['Date']
  previous_date = df_stock_data.loc[index + 1, 'Date']

  score_counter = 0
  total_score = 0.0

  # Walk every calendar day from the date of the row at index + 1 up to (but
  # not including) the current row's date, collecting the non-zero scores.
  while current_date > previous_date:
    sa_score = get_score(previous_date)
    if sa_score != 0:
      total_score += sa_score
      score_counter += 1
    previous_date = previous_date + datetime.timedelta(days = 1)

  # Compute the average once, after the walk. Doing it here (rather than
  # inside the while body) also guarantees a value when the walk is empty,
  # instead of reusing the score left over from the previous row.
  if score_counter > 0:
    final_score = total_score / score_counter
  else:
    final_score = 0.0

  df_stock_data.loc[index, 'SA_Score'] = final_score

In [25]:
# Preview the stock rows after the SA_Score column has been filled in.
df_stock_data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,%_Change,Volume,SA_Score
0,2022-05-31,890.0,890.0,865.0,875.0,-1.69,45502,0.0
1,2022-05-30,900.0,901.0,881.0,890.0,-0.89,44382,0.368
2,2022-05-27,900.0,900.0,896.0,898.0,-0.07,39790,0.4019
3,2022-05-26,918.0,918.0,897.0,898.6,-0.16,39717,0.0
4,2022-05-25,915.0,915.0,898.0,900.0,0.32,38263,0.0


In [26]:
# Save the stock data, now carrying SA_Score, to a new CSV file (the row
# index is not written).
stock_score_data = "/content/drive/MyDrive/Dissertation/nabil_stock_score_data.csv"
df_stock_data.to_csv(path_or_buf=stock_score_data, index=False)