In [623]:
import numpy as np
import pandas as pd
import os
import nltk

In [624]:
# Show full headline text in DataFrame output instead of truncating it.
# None is the supported "no limit" value; the old -1 sentinel was deprecated
# in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

In [625]:
# Read headlines2.csv (resolved to an absolute path from the current
# working directory) into a DataFrame and display it
path2 = os.path.abspath('headlines2.csv')
df_headlines = pd.read_csv(path2)
df_headlines

Unnamed: 0,date,source,headline
0,2020-04-14,Bloomberg,Tesla Wooed by $1 Billion Missouri Package for Cybertruck Plant
1,2020-04-14,CNBC,Tesla shares pop after Credit Suisse upgrades stock says company will increase electric car lead
2,2020-04-14,CNBC,Tesla shares jump after hours as Goldman initiates with buy rating sees rally above $800
3,2020-04-14,CNBC,Analysts are bullish on stocks like Tesla Netflix and Zoom as earnings season gets underway
4,2020-04-14,CNBC,What to watch today: Dow set to jump at open as earnings seasons kicks off
...,...,...,...
162,2020-05-14,CNBC,Tesla's secret batteries aim to rework the math for electric cars and the grid
163,2020-05-14,CNBC,Ex-Google CEO Eric Schmidt: Employers shouldn't force workers to return 'under fear of losing their jobs'
164,2020-05-14,CNBC,Elon Musk appears to be selling more California properties after pledging to 'own no house'
165,2020-05-14,CNBC,Coronavirus live updates: Japan's Takeda treatment trial could start in July global cases top 4.3 million


In [626]:
# Add a 'score' column, initialised to NaN, that will later hold the
# sentiment score computed for each headline
df_headlines['score'] = np.nan
df_headlines

Unnamed: 0,date,source,headline,score
0,2020-04-14,Bloomberg,Tesla Wooed by $1 Billion Missouri Package for Cybertruck Plant,
1,2020-04-14,CNBC,Tesla shares pop after Credit Suisse upgrades stock says company will increase electric car lead,
2,2020-04-14,CNBC,Tesla shares jump after hours as Goldman initiates with buy rating sees rally above $800,
3,2020-04-14,CNBC,Analysts are bullish on stocks like Tesla Netflix and Zoom as earnings season gets underway,
4,2020-04-14,CNBC,What to watch today: Dow set to jump at open as earnings seasons kicks off,
...,...,...,...,...
162,2020-05-14,CNBC,Tesla's secret batteries aim to rework the math for electric cars and the grid,
163,2020-05-14,CNBC,Ex-Google CEO Eric Schmidt: Employers shouldn't force workers to return 'under fear of losing their jobs',
164,2020-05-14,CNBC,Elon Musk appears to be selling more California properties after pledging to 'own no house',
165,2020-05-14,CNBC,Coronavirus live updates: Japan's Takeda treatment trial could start in July global cases top 4.3 million,


In [627]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [628]:
# Fetch the 'punkt' tokenizer data into the local nltk_data directory
# (the log below shows the download is skipped when it is already up to date)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/asamra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [629]:
# word_tokenize breaks a sentence into word and punctuation tokens;
# try it on the first headline (row label 0)
headline_0 = word_tokenize(df_headlines.at[0, 'headline'])
print(headline_0)

['Tesla', 'Wooed', 'by', '$', '1', 'Billion', 'Missouri', 'Package', 'for', 'Cybertruck', 'Plant']


In [630]:
from nltk.corpus import stopwords

In [631]:
# Fetch the 'stopwords' corpus into the local nltk_data directory
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/asamra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [632]:
# Stop words are common words that carry no sentiment meaning.
# NLTK's English list is all lowercase (see the printed output below).
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [633]:
# Treat ':' as a stop word as well, so the tokenizer's colon tokens are
# filtered out. The membership check keeps this cell idempotent:
# re-running it does not add ':' a second time.
if ':' not in stop_words:
    stop_words.append(':')

In [634]:
# Remove stop words from headline at index 0.
# stop_words contains lowercase entries only, so each token is lowercased
# for the membership test; otherwise capitalised stop words such as
# "What" or "The" would be kept. The original casing is preserved in the result.
filtered_headline_0 = [word for word in headline_0 if word.lower() not in stop_words]
print(filtered_headline_0)

['Tesla', 'Wooed', '$', '1', 'Billion', 'Missouri', 'Package', 'Cybertruck', 'Plant']


In [635]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [636]:
# Fetch the 'vader_lexicon' data into the local nltk_data directory
# NOTE(review): presumably required by SentimentIntensityAnalyzer — confirm
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/asamra/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [637]:
# Single VADER analyzer instance, reused by every scoring cell below
sid = SentimentIntensityAnalyzer()

In [638]:
# One empty bucket per word class: positive, negative and neutral
headline_0_pos, headline_0_neg, headline_0_neu = [], [], []

In [639]:
# Classify every remaining word of headline 0 by its VADER compound score:
# >= 0.5 is positive, <= -0.5 is negative, anything in between is neutral.
# https://stackoverflow.com/questions/43646877/python-extract-positive-words-from-a-string-using-sentiment-vader/43647056

# Empty the buckets first so re-running this cell does not append the
# same words a second time (the lists are created in an earlier cell).
headline_0_pos.clear()
headline_0_neg.clear()
headline_0_neu.clear()

for word in filtered_headline_0:
    # Score each word once and reuse the value for both comparisons
    compound = sid.polarity_scores(word)['compound']
    if compound >= 0.5:
        headline_0_pos.append(word)
    elif compound <= -0.5:
        headline_0_neg.append(word)
    else:
        headline_0_neu.append(word)

In [640]:
# Words of headline 0 classified as positive
print(headline_0_pos)

[]


In [641]:
# Words of headline 0 classified as negative
print(headline_0_neg)

[]


In [642]:
# Words of headline 0 classified as neutral
print(headline_0_neu)

['Tesla', 'Wooed', '$', '1', 'Billion', 'Missouri', 'Package', 'Cybertruck', 'Plant']


In [672]:
# Formula that assigns a score to a headline:
#   (number of positive words - number of negative words) / number of words kept
# Neutral words contribute 0 to the numerator but still count in the
# denominator, so the score lies in [-1, 1]. It is rounded to 2 decimals.
if filtered_headline_0:
    score_0 = round((len(headline_0_pos) - len(headline_0_neg)) / len(filtered_headline_0), 2)
else:
    # Nothing left after stop-word removal: score as neutral instead of
    # raising ZeroDivisionError
    score_0 = 0.0

In [670]:
# Score computed for headline 0
print(score_0)

0.0


In [645]:
# Below, we perform the operations above for every row in the dataframe

In [646]:
# Number of headline rows that the scoring loop below will process
len(df_headlines)

167

In [647]:
def score_headline(headline, threshold=0.5):
    """Return the sentiment score of one headline, rounded to 2 decimals.

    The headline is tokenized, stop words are dropped, and every remaining
    word is classified by its VADER compound score: positive when
    ``compound >= threshold``, negative when ``compound <= -threshold``,
    neutral otherwise. The score is
    ``(n_positive - n_negative) / n_words_kept``, so it lies in [-1, 1].

    Parameters
    ----------
    headline : str
        Raw headline text.
    threshold : float, default 0.5
        Absolute compound score a word must reach to count as
        positive or negative.

    Returns
    -------
    float
        The rounded score, or 0.0 when no word is left after stop-word
        removal (instead of raising ZeroDivisionError).
    """
    # stop_words holds lowercase entries, so compare the lowercased token;
    # otherwise capitalised stop words ("What", "The") would be kept.
    kept_words = [word for word in word_tokenize(headline)
                  if word.lower() not in stop_words]
    if not kept_words:
        return 0.0

    n_pos = 0
    n_neg = 0
    for word in kept_words:
        # Score each word once and reuse the value for both comparisons
        compound = sid.polarity_scores(word)['compound']
        if compound >= threshold:
            n_pos += 1
        elif compound <= -threshold:
            n_neg += 1
        # neutral words add nothing to the numerator

    return round((n_pos - n_neg) / len(kept_words), 2)


# Iterate through df_headlines to determine sentiment score for each headline
for index, row in df_headlines.iterrows():
    score = score_headline(row['headline'])
    df_headlines.at[index, 'score'] = score
    print(index, row['date'], score)

0 2020-04-14 0.0
1 2020-04-14 0.0
2 2020-04-14 0.0
3 2020-04-14 0.0
4 2020-04-14 0.0
5 2020-04-14 0.0
6 2020-04-14 0.0
7 2020-04-14 0.0
8 2020-04-14 0.0
9 2020-04-14 0.0
10 2020-04-14 0.0
11 2020-04-14 0.0
12 2020-04-15 0.0
13 2020-04-15 0.0
14 2020-04-15 0.0
15 2020-04-15 0.0
16 2020-04-15 0.0
17 2020-04-15 0.0
18 2020-04-15 0.0
19 2020-04-15 0.0
20 2020-04-16 0.0
21 2020-04-17 0.0
22 2020-04-17 0.0
23 2020-04-21 0.0
24 2020-04-22 0.0
25 2020-04-22 0.0
26 2020-04-22 -0.09
27 2020-04-22 0.0
28 2020-04-22 0.0
29 2020-04-22 -0.08
30 2020-04-23 0.0
31 2020-04-24 0.0
32 2020-04-24 0.0
33 2020-04-24 0.0
34 2020-04-25 0.0
35 2020-04-25 0.0
36 2020-04-25 0.0
37 2020-04-26 0.0
38 2020-04-27 0.0
39 2020-04-27 0.0
40 2020-04-27 0.0
41 2020-04-27 0.0
42 2020-04-27 0.0
43 2020-04-27 0.0
44 2020-04-27 0.0
45 2020-04-28 0.0
46 2020-04-28 0.0
47 2020-04-28 0.0
48 2020-04-28 0.0
49 2020-04-28 0.0
50 2020-04-28 0.0
51 2020-04-28 0.0
52 2020-04-28 0.0
53 2020-04-28 0.0
54 2020-04-29 0.0
55 2020-04-29 0.

In [648]:
# Display the frame again, now with the 'score' column filled in by the loop above
df_headlines

Unnamed: 0,date,source,headline,score
0,2020-04-14,Bloomberg,Tesla Wooed by $1 Billion Missouri Package for Cybertruck Plant,0.0
1,2020-04-14,CNBC,Tesla shares pop after Credit Suisse upgrades stock says company will increase electric car lead,0.0
2,2020-04-14,CNBC,Tesla shares jump after hours as Goldman initiates with buy rating sees rally above $800,0.0
3,2020-04-14,CNBC,Analysts are bullish on stocks like Tesla Netflix and Zoom as earnings season gets underway,0.0
4,2020-04-14,CNBC,What to watch today: Dow set to jump at open as earnings seasons kicks off,0.0
...,...,...,...,...
162,2020-05-14,CNBC,Tesla's secret batteries aim to rework the math for electric cars and the grid,0.0
163,2020-05-14,CNBC,Ex-Google CEO Eric Schmidt: Employers shouldn't force workers to return 'under fear of losing their jobs',0.0
164,2020-05-14,CNBC,Elon Musk appears to be selling more California properties after pledging to 'own no house',0.0
165,2020-05-14,CNBC,Coronavirus live updates: Japan's Takeda treatment trial could start in July global cases top 4.3 million,0.0


In [649]:
# Spot-check a single row (position 28) to judge whether the score
# assigned to that headline looks accurate
df_headlines.iloc[28]

date        2020-04-22                                                                                   
source      CNBC                                                                                         
headline    Chamath Palihapitiya: We've 'ripped the philosophical band-aid off' on universal basic income
score       0                                                                                            
Name: 28, dtype: object

In [662]:
# Build a separate frame for the grouping step without the text columns;
# drop() returns a new DataFrame, so df_headlines itself is left untouched
df_headlines2 = df_headlines.drop(columns=['source', 'headline'])

In [663]:
#df_group = df_headlines2.groupby('date')
#df_group

In [664]:
# Disabled alternative, kept only for reference: count the headlines per date
# whose score is exactly 1, -1 or 0. It applies to discrete scores only, not
# to the continuous score used in this notebook.
# The loop is wrapped in a string literal, so it is never executed; the cell
# merely echoes the string as its output.
# https://stackoverflow.com/questions/61806725/iterate-over-a-pandas-data-frame-or-groupby-object?noredirect=1#comment109323032_61806725
"""for name, group in df_headlines2.groupby('date'):
    daily_pos = len(group[group['score'] == 1])
    daily_neg = len(group[group['score'] == -1])
    daily_neu = len(group[group['score'] == 0])
    print(name, daily_pos, daily_neg, daily_neu)"""

"for name, group in df_headlines2.groupby('date'):\n    daily_pos = len(group[group['score'] == 1])\n    daily_neg = len(group[group['score'] == -1])\n    daily_neu = len(group[group['score'] == 0])\n    print(name, daily_pos, daily_neg, daily_neu)"

In [668]:
# Average the headline scores within each date; the averaged column is
# exposed under the name 'daily_score'
df_group = (
    df_headlines2
    .groupby('date')
    .mean()
    .rename(columns={'score': 'daily_score'})
)
df_group

Unnamed: 0_level_0,daily_score
date,Unnamed: 1_level_1
2020-04-14,0.0
2020-04-15,0.0
2020-04-16,0.0
2020-04-17,0.0
2020-04-21,0.0
2020-04-22,-0.028333
2020-04-23,0.0
2020-04-24,0.0
2020-04-25,0.0
2020-04-26,0.0


In [675]:
# Write the per-date average scores to sentiment_score.csv in the current
# working directory (the 'date' index is written along with 'daily_score')
df_group.to_csv('sentiment_score.csv')