<h1>Reddit API (Pushshift)</h1>

<h3>Imports</h3>

In [4]:
import pandas as pd
from time import sleep
from datetime import datetime
import requests

# Time window of the data pull, expressed as epoch seconds.
# The datetimes below are naive, so .timestamp() interprets them in the local
# timezone of the machine that runs the notebook.
# NOTE(review): the upper bound is midnight at the start of 2021-10-31, so
# posts created during that day are presumably excluded — confirm intent.
BEGINN_DATUM = int(datetime(2021, 8, 1).timestamp())
ENDE_DATUM = int(datetime(2021, 10, 31).timestamp())

# Subreddits whose submissions are collected.
SUBREDDITS = [
    'CryptoCurrency',
    'Bitcoin',
    'CryptoMarkets',
    'Crypto',
    'CryptoCurrencyTrading',
    'CryptoCurrencies',
]

<h3>Funktion zum Laden der Posts</h3>

In [5]:
def get_posts_for_time_period(sub, beginning, end=None, max_retries=5):
    """Fetch one page of submissions for a subreddit from the Pushshift API.

    Args:
        sub: Name of the subreddit to query.
        beginning: Lower bound of the time window; sent as the ``after``
            query parameter (the callers in this notebook pass epoch seconds).
        end: Upper bound of the time window; sent as the ``before`` query
            parameter. When omitted, the current time is used. It is resolved
            on every call rather than once when the function is defined.
        max_retries: Maximum number of additional attempts after an HTTP 429
            (rate limited) response.

    Returns:
        The value stored under the ``data`` key of the JSON response.

    Raises:
        requests.HTTPError: If the final response carries a 4xx/5xx status
            (including a 429 that persisted through all retries).
        KeyError: If the JSON payload has no ``data`` key.
    """
    if end is None:
        # A default written in the signature would be frozen at definition
        # time; compute "now" per call instead.
        end = int(datetime.now().timestamp())

    url = "https://apiv2.pushshift.io/reddit/submission/search/"
    # Let requests build and URL-encode the query string.
    # NOTE(review): the server may cap the page size below the requested 500
    # — confirm against the API before relying on the page length.
    params = {
        "subreddit": sub,
        "limit": 500,
        "after": beginning,
        "before": end,
    }

    response = requests.get(url, params=params, timeout=30)
    attempt = 0
    while response.status_code == 429 and attempt < max_retries:
        attempt += 1
        # Back off a little longer after each rate-limited attempt.
        sleep(attempt)
        response = requests.get(url, params=params, timeout=30)

    # Surface HTTP errors explicitly instead of failing later while parsing.
    response.raise_for_status()

    return response.json()['data']

<h3>Hole Daten zu Posts</h3>

In [6]:
# Collect the submissions of every configured subreddit for the time window.
# Starting from an empty list (instead of None) keeps all_data iterable even
# when no subreddit yields data and avoids aliasing the first page list.
all_data = []
for subreddit in SUBREDDITS:
    beginning_timestamp = BEGINN_DATUM
    end_timestamp = ENDE_DATUM
    startday = 0  # day of month of the last progress message; 0 = none yet

    data = get_posts_for_time_period(subreddit, beginning_timestamp, end_timestamp)
    all_data.extend(data)

    # Keep paging while the previous page was reasonably full.
    # NOTE(review): the threshold of 50 differs from the limit of 500 requested
    # by get_posts_for_time_period — confirm which page size the API returns.
    while len(data) >= 50:
        # Continue right after the last post of the previous page.
        # Assumes pages are ordered ascending by 'created_utc' — TODO confirm.
        beginning_timestamp = data[-1]['created_utc'] + 1

        # Use a dedicated name here: rebinding `datetime` would replace the
        # imported class with an instance for the rest of the kernel session.
        page_start = datetime.fromtimestamp(beginning_timestamp)
        if page_start.day != startday:
            # Print progress once per calendar day reached.
            print('Hole ' + subreddit + ' für ' + str(page_start))
            startday = page_start.day

        data = get_posts_for_time_period(sub=subreddit, beginning=beginning_timestamp, end=end_timestamp)
        all_data.extend(data)

Hole CryptoCurrency für 2021-08-01 01:47:37
Hole CryptoCurrency für 2021-08-02 01:04:01
Hole CryptoCurrency für 2021-08-03 01:14:45
Hole CryptoCurrency für 2021-08-04 01:14:07
Hole CryptoCurrency für 2021-08-05 00:52:27
Hole CryptoCurrency für 2021-08-06 01:01:23
Hole CryptoCurrency für 2021-08-07 00:44:37
Hole CryptoCurrency für 2021-08-08 00:04:40
Hole CryptoCurrency für 2021-08-09 00:00:29
Hole CryptoCurrency für 2021-08-10 00:20:57
Hole CryptoCurrency für 2021-08-11 00:45:42
Hole CryptoCurrency für 2021-08-12 00:30:47
Hole CryptoCurrency für 2021-08-13 00:32:16
Hole CryptoCurrency für 2021-08-14 00:32:50
Hole CryptoCurrency für 2021-08-15 00:51:47
Hole CryptoCurrency für 2021-08-16 00:05:01
Hole CryptoCurrency für 2021-08-17 00:28:36
Hole CryptoCurrency für 2021-08-18 00:29:04
Hole CryptoCurrency für 2021-08-19 00:41:27
Hole CryptoCurrency für 2021-08-20 00:10:02
Hole CryptoCurrency für 2021-08-21 00:03:04
Hole CryptoCurrency für 2021-08-22 00:35:26
Hole CryptoCurrency für 2021-08-

<h3>Lege benötigte Daten in DataFrame ab</h3>

In [None]:
# Project every submission that has a 'selftext' field onto the columns used
# downstream. The records are gathered in a list first and the DataFrame is
# built once: DataFrame.append copied the whole frame on each call and is no
# longer available in pandas 2.x.
POST_COLUMNS = ['subreddit', 'title', 'selftext', 'upvoteRatio', 'created']

post_records = [
    {
        'subreddit': post['subreddit'],
        'title': post['title'],
        'selftext': post['selftext'],
        'upvoteRatio': post['upvote_ratio'],
        'created': datetime.fromtimestamp(post['created_utc']),
    }
    for post in all_data
    if 'selftext' in post
]

# Passing the columns explicitly keeps them present even if no post qualified.
df = pd.DataFrame(post_records, columns=POST_COLUMNS)

df.head()

<h2> Preprocessing </h2>

<h4> Filtern auf Subreddit </h4>

In [None]:
def _is_tracked_subreddit(name):
    """Return True when *name* is one of the configured SUBREDDITS."""
    return name in SUBREDDITS


# Keep only the rows whose subreddit is in the configured list.
df_filtered = df[df['subreddit'].apply(_is_tracked_subreddit)]

<h4> Entfernen von Posts ohne Text </h4>

In [None]:
# Drop posts whose text is the '[removed]' placeholder or an empty string.
selftext = df_filtered['selftext']
df_filtered = df_filtered[(selftext != '[removed]') & (selftext != '')]

<h4> Entfernen der Zeilenumbrüche </h4>

In [None]:
# Replace line breaks in the post text with spaces.
# df_filtered was produced by slicing another frame, so writing a column into
# it in place can raise pandas' SettingWithCopyWarning; assign() returns a new
# frame instead. The vectorised .str accessor replaces the per-row lambda.
df_filtered = df_filtered.assign(
    selftext=df_filtered['selftext'].str.replace("\n", " ", regex=False)
)

<h4> Schreiben in CSV-Datei </h4>

In [None]:
# Persist the filtered and cleaned posts, including the row index.
df_filtered.to_csv(path_or_buf='redditdaten.csv', index=True)

In [None]:
# Persist the unfiltered posts as well, including the row index.
df.to_csv(path_or_buf='unfiltered.csv', index=True)