In [1]:
import numpy as np
import pandas as pd
import torch 
import torchvision
import transformers 
import datetime as dt

In [4]:
data = pd.read_csv('~/Downloads/MyCourses/StockMarketSentimentAnalysis/AMZN_prerob.csv')

In [5]:
data.head()

Unnamed: 0,Publishing Time,Ticker,Sector,Source,Headline,Text,URL,fin_sentiment,finvader_tot,rob_sentiment,rob_score,stock_time,open,high,low,close,volume,numtrades,vwap
0,,,,,,,,,,,,2019-03-15 08:00:00+00:00,1706.0,1711.0,1706.0,1711.0,1497,61,1707.697582
1,,,,,,,,,,,,2019-03-15 08:15:00+00:00,1708.67,1708.67,1708.67,1708.67,230,4,1708.561217
2,,,,,,,,,,,,2019-03-15 09:00:00+00:00,1709.24,1709.24,1709.11,1709.11,605,7,1709.173884
3,,,,,,,,,,,,2019-03-15 09:15:00+00:00,1708.0,1708.0,1708.0,1708.0,200,1,1708.0
4,,,,,,,,,,,,2019-03-15 09:30:00+00:00,1708.0,1708.0,1707.98,1707.98,201,3,1707.987413


In [6]:
data.columns

Index(['Publishing Time', 'Ticker', 'Sector', 'Source', 'Headline', 'Text',
       'URL', 'fin_sentiment', 'finvader_tot', 'rob_sentiment', 'rob_score',
       'stock_time', 'open', 'high', 'low', 'close', 'volume', 'numtrades',
       'vwap'],
      dtype='object')

In [7]:
# Converting the stock_times into datetime objects
data['stock_time'] = pd.to_datetime(data['stock_time'])

In [8]:
# extracting indices of times immediately after a news article was published
publishing_indices = data[data['Publishing Time'].notna()].index

In [9]:
averages = []
for i in range(len(publishing_indices) + 1):
    if i == 0:
        start, end = 0 , publishing_indices[0]
    elif i == len(publishing_indices):
        start, end = publishing_indices[-1], len(data) - 1
    else:
        start, end = publishing_indices[i - 1], publishing_indices[i]
#     print(f'start is {start} end is {end}')
    averages.append(data['open'][start:end].mean())

In [10]:
len(publishing_indices)

14028

In [10]:
len(averages)

14029

In [11]:
unique_sentiments = data['rob_sentiment'][data['rob_sentiment'].notna()].unique()

In [12]:
unique_sentiments

array(['positive', 'neutral', 'negative'], dtype=object)

In [13]:
rob_sentiment = data['rob_sentiment'][data['rob_sentiment'].notna()]
rob_sentiment = list(rob_sentiment)

In [14]:
matching_with_sentiment = []
for i in range(1 , len(averages)-1):
    diff = averages[i] - averages[i - 1]
    if (diff > 0 and rob_sentiment[i] == 'positive') or (diff < 0 and rob_sentiment[i] == 'negative'):
        matching_with_sentiment.append([True , i , diff , rob_sentiment[i]])
    elif rob_sentiment[i] == 'neutral':
        continue
    else:
        matching_with_sentiment.append([False , i , diff , rob_sentiment[i]])

matching_with_sentiment

[[False, 1, -2.24048461538473, 'positive'],
 [True, 2, 5.295857142857358, 'positive'],
 [False, 4, -2.8864999999998417, 'positive'],
 [False, 5, -7.229000000000042, 'positive'],
 [True, 6, 4.186388888888814, 'positive'],
 [True, 7, 3.2386111111111404, 'positive'],
 [True, 10, 15.861825806451634, 'positive'],
 [True, 11, 16.39521705069137, 'positive'],
 [True, 12, -3.9820428571429147, 'negative'],
 [True, 16, -2.32204999999999, 'negative'],
 [False, 18, -7.677379679144224, 'positive'],
 [True, 20, 3.2228333333330283, 'positive'],
 [True, 23, 2.3200000000001637, 'positive'],
 [True, 24, 4.95844444444424, 'positive'],
 [True, 26, 2.3102957142855303, 'positive'],
 [True, 27, -4.572063492063307, 'negative'],
 [True, 29, 3.6520799999998417, 'positive'],
 [True, 30, 15.003474761904727, 'positive'],
 [False, 31, -7.7426297619049365, 'positive'],
 [False, 32, -6.249991666666347, 'positive'],
 [True, 33, -10.006233333333284, 'negative'],
 [False, 34, -11.440000000000055, 'positive'],
 [True, 35,

In [22]:
# Percentage of the average of the stock price differences (before and after publish) that 
# is supported by the sentiment of the roberta model
avg = sum([matching_with_sentiment[i][0] == True for i in range(len(matching_with_sentiment))]) / len(matching_with_sentiment)
print(f'Average percentage of times the articles\' sentiment matched the average stock price movement for Amazon is:\n {avg *100}')

Average percentage of times the articles' sentiment matched the average stock price movement for Amazon is:
 43.159013103529695


In [16]:
[element for element in matching_with_sentiment if element[0] == False]

[[False, 1, -2.24048461538473, 'positive'],
 [False, 4, -2.8864999999998417, 'positive'],
 [False, 5, -7.229000000000042, 'positive'],
 [False, 18, -7.677379679144224, 'positive'],
 [False, 31, -7.7426297619049365, 'positive'],
 [False, 32, -6.249991666666347, 'positive'],
 [False, 34, -11.440000000000055, 'positive'],
 [False, 42, -3.8013947981362435, 'positive'],
 [False, 43, -21.05434396914461, 'positive'],
 [False, 50, 0.0, 'positive'],
 [False, 51, 0.0, 'positive'],
 [False, 52, -2.986666666666679, 'positive'],
 [False, 58, -0.27749999999991815, 'positive'],
 [False, 62, -1.1700000000000728, 'positive'],
 [False, 63, -2.7940000000000964, 'positive'],
 [False, 64, 3.3756750000002285, 'negative'],
 [False, 65, -1.1966750000001412, 'positive'],
 [False, 68, -0.8638699999999062, 'positive'],
 [False, 73, 0.0, 'positive'],
 [False, 76, 1.71273846153872, 'negative'],
 [False, 77, -7.256026470588267, 'positive'],
 [False, 79, -2.2316666666663423, 'positive'],
 [False, 83, -0.866499999999

It turns out the average from the time an article is published until the next publishing article is not an accurate measure of a news. Let's focus on the ***'close' - 'open'*** in the time slot a news is published

In [23]:
matching_with_sentiment_diff = []
for i in range(len(publishing_indices)):
    diff = data['close'][i] - data['open'][i]
    if (diff > 0 and rob_sentiment[i] == 'positive') or (diff < 0 and rob_sentiment[i] == 'negative'):
        matching_with_sentiment_diff.append([True , i , diff , rob_sentiment[i]])
    elif rob_sentiment[i] == 'neutral':
        continue
    else:
        matching_with_sentiment_diff.append([False , i , diff , rob_sentiment[i]])
avg = sum([matching_with_sentiment_diff[i][0] == True for i in range(len(matching_with_sentiment_diff))]) / len(matching_with_sentiment_diff)
print(f'Average percentage of times the articles\' sentiment matches close - open price movement for Amazon is:\n {avg *100}')

Average percentage of times the articles' sentiment matches close - open price movement for Amazon is:
 44.405099150141645


What if we let the news sink a bit and look at the price some 2 hours later?! We get slightly better results than flipping of a coin but still very much looks like a gamble.

In [24]:
matching_with_sentiment_diff = []
for i in range(len(publishing_indices)):
    diff = data['close'][i+9] - data['open'][i]
    if (diff > 0 and rob_sentiment[i] == 'positive') or (diff < 0 and rob_sentiment[i] == 'negative'):
        matching_with_sentiment_diff.append([True , i , diff , rob_sentiment[i]])
    elif rob_sentiment[i] == 'neutral':
        continue
    else:
        matching_with_sentiment_diff.append([False , i , diff , rob_sentiment[i]])
avg = sum([matching_with_sentiment_diff[i][0] == True for i in range(len(matching_with_sentiment_diff))]) / len(matching_with_sentiment_diff)
print(f'The result of waiting for a while (2 hours in this case) is {avg * 100}, slightly better!')

The result of waiting for a while (2 hours in this case) is 50.849858356940516


Let's look at Apple

In [25]:
data = pd.read_csv('~/Downloads/MyCourses/StockMarketSentimentAnalysis/AAPL_prerob.csv')

# Converting the stock_times into datetime objects
data['stock_time'] = pd.to_datetime(data['stock_time'])

# extracting indices of times immediately after a news article was published
publishing_indices = data[data['Publishing Time'].notna()].index

averages = []
for i in range(len(publishing_indices) + 1):
    if i == 0:
        start, end = 0 , publishing_indices[0]
    elif i == len(publishing_indices):
        start, end = publishing_indices[-1], len(data) - 1
    else:
        start, end = publishing_indices[i - 1], publishing_indices[i]
#     print(f'start is {start} end is {end}')
    averages.append(data['open'][start:end].mean())
    
rob_sentiment = data['rob_sentiment'][data['rob_sentiment'].notna()]
rob_sentiment = list(rob_sentiment)

In [26]:
matching_with_sentiment = []
for i in range(1 , len(averages)-1):
    diff = averages[i] - averages[i - 1]
    if (diff > 0 and rob_sentiment[i] == 'positive') or (diff < 0 and rob_sentiment[i] == 'negative'):
        matching_with_sentiment.append([True , i , diff , rob_sentiment[i]])
    elif rob_sentiment[i] == 'neutral':
        continue
    else:
        matching_with_sentiment.append([False , i , diff , rob_sentiment[i]])

avg = sum([matching_with_sentiment[i][0] == True for i in range(len(matching_with_sentiment))]) / len(matching_with_sentiment)
print(f'Average percentage of times the articles\' sentiment matched the average stock price movement is {avg *100}')

Average percentage of times the articles' sentiment matched the average stock price movement is 44.91823899371069


In [27]:
matching_with_sentiment_diff = []
for i in range(len(publishing_indices)):
    diff = data['close'][i] - data['open'][i]
    if (diff > 0 and rob_sentiment[i] == 'positive') or (diff < 0 and rob_sentiment[i] == 'negative'):
        matching_with_sentiment_diff.append([True , i , diff , rob_sentiment[i]])
    elif rob_sentiment[i] == 'neutral':
        continue
    else:
        matching_with_sentiment_diff.append([False , i , diff , rob_sentiment[i]])
open_close = sum([matching_with_sentiment_diff[i][0] == True for i in range(len(matching_with_sentiment_diff))]) / len(matching_with_sentiment_diff)
print(f'Average percentage of times the articles\' sentiment matched the stock price movement at the close relative to the opening price {open_close *100}')

Average percentage of times the articles' sentiment matched the stock price movement at the close relative to the opening price 46.723682555653376


In [28]:
matching_with_sentiment_diff = []
for i in range(len(publishing_indices)):
    diff = data['close'][i + 14] - data['open'][i]
    if (diff > 0 and rob_sentiment[i] == 'positive') or (diff < 0 and rob_sentiment[i] == 'negative'):
        matching_with_sentiment_diff.append([True , i , diff , rob_sentiment[i]])
    elif rob_sentiment[i] == 'neutral':
        continue
    else:
        matching_with_sentiment_diff.append([False , i , diff , rob_sentiment[i]])
open_close = sum([matching_with_sentiment_diff[i][0] == True for i in range(len(matching_with_sentiment_diff))]) / len(matching_with_sentiment_diff)
print(f'Average percentage of times the articles\' sentiment matched the stock price movement at the close some three hours after the publishing of the article relative to the opening price {open_close *100}')

Average percentage of times the articles' sentiment matched the stock price movement at the close some three hours after the publishing of the article relative to the opening price 50.106904791850084


### The news sentiment was more correlated with the price change of a less-known stock as in Eli Lilly

In [29]:
data = pd.read_csv('~/Downloads/MyCourses/StockMarketSentimentAnalysis/LLY_prerob.csv')

# Converting the stock_times into datetime objects
data['stock_time'] = pd.to_datetime(data['stock_time'])

# extracting indices of times immediately after a news article was published
publishing_indices = data[data['Publishing Time'].notna()].index

averages = []
for i in range(len(publishing_indices) + 1):
    if i == 0:
        start, end = 0 , publishing_indices[0]
    elif i == len(publishing_indices):
        start, end = publishing_indices[-1], len(data) - 1
    else:
        start, end = publishing_indices[i - 1], publishing_indices[i]
#     print(f'start is {start} end is {end}')
    averages.append(data['open'][start:end].mean())
    
rob_sentiment = data['rob_sentiment'][data['rob_sentiment'].notna()]
rob_sentiment = list(rob_sentiment)

In [30]:
matching_with_sentiment = []
for i in range(1 , len(averages)-1):
    diff = averages[i] - averages[i - 1]
    if (diff > 0 and rob_sentiment[i] == 'positive') or (diff < 0 and rob_sentiment[i] == 'negative'):
        matching_with_sentiment.append([True , i , diff , rob_sentiment[i]])
    elif rob_sentiment[i] == 'neutral':
        continue
    else:
        matching_with_sentiment.append([False , i , diff , rob_sentiment[i]])

avg = sum([matching_with_sentiment[i][0] == True for i in range(len(matching_with_sentiment))]) / len(matching_with_sentiment)
print(f'Average percentage of times the articles\' sentiment matched the average stock price movement is {avg *100}')

Average percentage of times the articles' sentiment matched the average stock price movement is 54.9717057396928


In [31]:
data = pd.read_csv('~/Downloads/MyCourses/StockMarketSentimentAnalysis/news_openai_final.csv')
data.head()

Unnamed: 0,Publishing Time,Ticker,Sector,Source,Headline,Text,openai_sentiment,openai_score
0,2019-03-15 10:46:42+00:00,WFC,Finance,The Motley Fool,Did Wells Fargo CEO Tim Sloan Earn His $1 Mill...,We learned this week that the scandal-plagued ...,-1.0,-0.5
1,2019-03-15 10:47:26+00:00,AAPL,Technology,The Motley Fool,Don't Underestimate Apple's iPhone Business,The segment is an invaluable asset to Apple's ...,1.0,0.75
2,2019-03-15 11:33:00+00:00,MA,Finance,Forbes,A Closer Look At Mastercard's Key Value Drivers,Mastercard has consistently beat street estima...,1.0,0.8
3,2019-03-15 11:52:45+00:00,BAC,Finance,Benzinga,Jim Cramer Gives His Opinion On Bank Of Americ...,"On CNBC's ""Mad Money Lightning Round"", Jim Cra...",1.0,0.5
4,2019-03-15 13:29:39+00:00,GOOGL,Technology,Benzinga,Uber And Waymo Seeking Outside Funding For Aut...,Commercially viable autonomous vehicle (AV) te...,0.0,0.1
