# helpers.py — scraping, sentiment-analysis, and plotly-table helpers
import plotly.graph_objs as go
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
########### Set up the default figures ######
def base_fig():
    """Build the placeholder table figure shown before any data has arrived."""
    placeholder = go.Table(
        columnwidth=[200, 200, 1000],
        header=dict(values=['date', 'time', 'post', 'sentiment'], align=['left']),
        cells=dict(
            align=['left'],
            values=[
                [1, 2, 3],
                [1, 2, 3],
                ['waiting for data', 'waiting for data', 'waiting for data'],
                ['Neutral', 'Neutral', 'Neutral'],
            ],
        ),
    )
    return go.Figure([placeholder])
def error_fig():
    """Build the fallback table figure shown when scraping fails or is throttled."""
    fallback = go.Table(
        columnwidth=[200, 200, 1000],
        header=dict(values=['date', 'time', 'post', 'sentiment'], align=['left']),
        cells=dict(
            align=['left'],
            values=[
                ['whoa!', 'whoa!', 'whoa!'],
                [3, 2, 1],
                ['Slow down!', 'Scraping takes a sec', 'Try back later!'],
                ['Neutral', 'Neutral', 'Neutral'],
            ],
        ),
    )
    return go.Figure([fallback])
def sentiment_scores(sentence):
    """Classify *sentence* with VADER and return a label like "Positive: 0.62".

    Thresholds follow the VADER convention: compound >= 0.05 is positive,
    compound <= -0.05 is negative, anything between is neutral.

    Returns the string "Error" if analysis fails (e.g. non-string input),
    preserving the original best-effort behavior.
    """
    try:
        # Only the analysis itself can realistically raise; keep the try minimal.
        analyzer = SentimentIntensityAnalyzer()
        compound = analyzer.polarity_scores(sentence)['compound']
    except Exception:
        # Was a bare `except:`, which also swallows KeyboardInterrupt/SystemExit.
        return "Error"
    if compound >= 0.05:
        return f"Positive: {round(compound, 2)}"
    if compound <= -0.05:
        return f"Negative: {round(compound, 2)}"
    return f"Neutral: {round(compound, 2)}"
########### Functions ######
# define a scraper function
def lovely_soup(url):
    """Fetch *url* (custom User-agent) and return it parsed with BeautifulSoup/lxml."""
    request_headers = {'User-agent': 'Agent_Smith'}
    response = requests.get(url, headers=request_headers)
    return BeautifulSoup(response.text, 'lxml')
# write a function to clean up the post
def clean_that_post(row):
    """Strip the ' (self.AskReddit)' suffix reddit appends to self-post titles."""
    title, _sep, _rest = row.partition(' (self.AskReddit)')
    return title
# write a function to clean up the date
def parse_that_date(row, year=2020):
    """Turn a scraped timestamp fragment into a string pandas can parse.

    *row* looks like 'Mon Jan 6 12:34:56' (weekday first). The weekday is
    dropped, the year is prepended, and the result is truncated to 20
    characters to drop any trailing junk from the scrape.

    The year was previously hard-coded as '2020'; it is now a parameter
    (default 2020, so existing callers are unaffected) so the helper keeps
    working across year boundaries.
    """
    without_weekday = ' '.join(row.split(' ')[1:])
    stamped = f'{year} {without_weekday}'
    return stamped[:20]
########### Scraping ######
def scrape_reddit():
    """Scrape the r/AskReddit front page and return a plotly table figure.

    The table has four columns — date, time, post, sentiment — sorted
    newest-first by timestamp.
    """
    soup = lovely_soup('https://old.reddit.com/r/AskReddit/')

    # Pull titles and raw timestamps out of old-reddit's markup.
    titleslist = [title.text for title in soup.findAll('p', {'class': 'title'})]
    dateslist = [
        str(stamp).split('title="')[1].split('2020')[0]
        for stamp in soup.findAll('time', {'class': "live-timestamp"})
    ]

    # Assemble and clean up the dataframe.
    pd.set_option('display.max_colwidth', 200)
    working_df = pd.DataFrame({'date': dateslist, 'post': titleslist})
    working_df['date'] = working_df['date'].str.strip()
    working_df['post'] = working_df['post'].apply(clean_that_post)

    # Parse the timestamps and sort newest-first.
    working_df['cleandate'] = working_df['date'].apply(parse_that_date)
    working_df['UTC_date'] = pd.to_datetime(working_df['cleandate'])
    working_df.sort_values('UTC_date', inplace=True, ascending=False)

    # Split the parsed timestamp into separate date/time columns and score posts.
    working_df['date'] = working_df['UTC_date'].dt.date
    working_df['time'] = working_df['UTC_date'].dt.time
    working_df['sentiment'] = working_df['post'].apply(sentiment_scores)

    final_df = working_df[['date', 'time', 'post', 'sentiment']].copy()

    # Render the result as a plotly table.
    table = go.Table(
        columnwidth=[200, 200, 1000],
        header=dict(values=final_df.columns, align=['left']),
        cells=dict(
            align=['left'],
            values=[final_df['date'],
                    final_df['time'],
                    final_df['post'].values,
                    final_df['sentiment'].values],
        ),
    )
    return go.Figure([table])