generated from sbreitenbach/python-quick-start
-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyzer.py
107 lines (86 loc) · 3.4 KB
/
analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import demoji
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def count_tickers(tickers):
    """Tally how many times each ticker appears in ``tickers``.

    Returns a ``collections.Counter`` mapping each ticker to its count.
    """
    return Counter(tickers)
def most_common_tickers(tickers, maximum=10):
    """Return up to ``maximum`` of the most frequent entries in ``tickers``.

    ``tickers`` must provide a ``most_common`` method (for example a
    ``collections.Counter``); its return value is passed through unchanged.
    """
    return tickers.most_common(maximum)
def preprocess_and_split_text(text):
    """Turn ``text`` into a list of lowercase, whitespace-separated words.

    Emoji are first replaced with their textual descriptions through
    ``demoji.replace_with_desc`` (using a space as the separator), so they
    become ordinary words in the returned list.
    """
    described = demoji.replace_with_desc(text, sep=" ")
    return [token.lower() for token in described.split()]
def has_positive_words(text):
    """Report whether ``text`` contains at least one positive keyword.

    The text is tokenized with ``preprocess_and_split_text`` and matched
    word-for-word against a fixed keyword set.
    """
    bullish_terms = {
        'call', 'calls', 'bull', 'bulls', 'bullish', 'diamond',
        'gem', 'hold', 'holding', 'liftoff', 'moon', 'rocket', 'yolo',
    }
    tokens = preprocess_and_split_text(text)
    # The sets overlap exactly when some keyword occurs among the tokens.
    return not bullish_terms.isdisjoint(tokens)
def has_negative_words(text):
    """Report whether ``text`` contains at least one negative keyword.

    The text is tokenized with ``preprocess_and_split_text`` and matched
    word-for-word against a fixed keyword set.
    """
    bearish_terms = {
        'bear', 'bears', 'bearish', 'imagine',
        'put', 'puts', 'rainbow', 'sell', 'short',
    }
    tokens = preprocess_and_split_text(text)
    # The sets overlap exactly when some keyword occurs among the tokens.
    return not bearish_terms.isdisjoint(tokens)
def get_vader_score(text):
    """Return the VADER ``compound`` sentiment score for ``text``.

    The score is the ``'compound'`` entry of
    ``SentimentIntensityAnalyzer.polarity_scores(text)``.
    """
    # Building the analyzer loads its lexicon, and this function is called
    # once per post, so create the instance lazily and reuse it afterwards.
    analyzer = getattr(get_vader_score, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_vader_score._analyzer = analyzer
    return analyzer.polarity_scores(text)['compound']
# This may not be accurate and could easily be manipulated.
# VADER is trained using social media data, and the custom scoring for certain words may help.
# The best way to approach this would likely be a custom-trained model for each data source.
def determine_sentiment(text):
    """Return the VADER compound score of ``text``, nudged by keyword hits.

    The score is raised by 0.02 when the text contains positive keywords
    only, lowered by 0.02 when it contains negative keywords only, and
    returned unchanged when it contains both kinds or neither.
    """
    vader_score = get_vader_score(text)
    # Each keyword check re-tokenizes the text, so evaluate each one once
    # instead of repeating the calls in every branch.
    positive = has_positive_words(text)
    negative = has_negative_words(text)
    if positive and not negative:
        return vader_score + .02
    if negative and not positive:
        return vader_score - .02
    return vader_score
def trim_post_list(most_common_tickers, post_list):
    """Keep only the posts whose ticker is one of the most common tickers.

    Args:
        most_common_tickers: Iterable of entries whose index 0 is a ticker
            (index 1 is not used here).
        post_list: Iterable of posts where index 0 is the ticker and
            index 1 is the post text.

    Returns:
        A new list of ``[ticker, text]`` lists, in the order of ``post_list``.
    """
    kept_tickers = [entry[0] for entry in most_common_tickers]
    # Read both fields of every post up front, before filtering.
    ticker_text_pairs = ((post[0], post[1]) for post in post_list)
    return [
        [ticker, body]
        for ticker, body in ticker_text_pairs
        if ticker in kept_tickers
    ]
def calculate_net_sentiment(most_common_tickers, post_list):
    """Compute the mention count and average sentiment for each ticker.

    Args:
        most_common_tickers: Iterable of entries where index 0 is the ticker
            and index 1 is its occurrence count.
        post_list: Iterable of posts where index 0 is the ticker and
            index 1 is the post text.

    Returns:
        A list with one dict per ticker, shaped as
        ``{ticker: {'mentions': count, 'sentiment': average}}``. The average
        is the summed ``determine_sentiment`` score of the ticker's posts
        divided by the occurrence count, rounded to 4 decimal places. It is
        0 when the occurrence count is 0.
    """
    result = []
    for entry in most_common_tickers:
        common_ticker = entry[0]
        occurances = entry[1]
        sentiment_sum = 0
        for post in post_list:
            if post[0] == common_ticker:
                sentiment_sum = sentiment_sum + determine_sentiment(post[1])
                # This could count the same ticker again if it's mentioned
                # multiple times in a post.
                # E.g. $GME $GME $GME to the moon!!! would be triple counted...
                # Could possibly use some kind of post id to prevent this.
        if occurances:
            average_sentiment = round(sentiment_sum / occurances, 4)
        else:
            # Avoid ZeroDivisionError for a ticker with no recorded mentions.
            average_sentiment = 0
        result.append({common_ticker: {'mentions': occurances,
                                       'sentiment': average_sentiment}})
    return result