<h1>Reddit API</h1>

<h3>Imports</h3>

In [1]:
import pandas as pd
from time import sleep
from datetime import datetime
import requests

BEGINN_DATUM = int(datetime(year=2021, month=8, day=1).timestamp())
ENDE_DATUM = int(datetime(year=2021, month=10, day=31).timestamp())

SUBREDDITS = ['CryptoCurrency','Bitcoin','CryptoMarkets','Crypto','CryptoCurrencyTrading','CryptoCurrencies']

<h3>Funktion zum Laden der Posts</h3>

In [2]:
def get_posts_for_time_period(sub, beginning, end=int(datetime.now().timestamp())):
    url = "https://apiv2.pushshift.io/reddit/submission/search/" \
               "?subreddit={0}" \
               "&limit=500" \
               "&after={1}" \
               "&before={2}".format(sub, beginning, end)

    response = requests.get(url)
    if(response.status_code == 429):
        sleep(0.5)
        response = requests.get(url)

    resp_json = response.json()

    return resp_json['data']

<h3>Hole Daten zu Posts</h3>

In [3]:
all_data = None
for subreddit in SUBREDDITS:
    beginning_timestamp = BEGINN_DATUM
    end_timestamp = ENDE_DATUM
    data = get_posts_for_time_period("CryptoCurrency", beginning_timestamp, end_timestamp)
    if all_data == None:
        all_data= data
    else:
        all_data.extend(data)
    while len(data) >= 50:
        # go back for more data
        last_one = data[len(data)-1]
        beginning_timestamp = last_one['created_utc'] + 1
        print('Hole '+subreddit+' für '+str(datetime.fromtimestamp(beginning_timestamp)))
        data = get_posts_for_time_period(sub=subreddit, beginning=beginning_timestamp, end=end_timestamp)
        all_data.extend(data)

Hole CryptoCurrency für 2021-08-01 01:47:37
Hole CryptoCurrency für 2021-08-01 03:45:51
Hole CryptoCurrency für 2021-08-01 06:17:07
Hole CryptoCurrency für 2021-08-01 08:00:17
Hole CryptoCurrency für 2021-08-01 09:53:11
Hole CryptoCurrency für 2021-08-01 12:06:54
Hole CryptoCurrency für 2021-08-01 14:02:29
Hole CryptoCurrency für 2021-08-01 15:31:25
Hole CryptoCurrency für 2021-08-01 16:57:03
Hole CryptoCurrency für 2021-08-01 18:17:41
Hole CryptoCurrency für 2021-08-01 19:42:38
Hole CryptoCurrency für 2021-08-01 21:00:05
Hole CryptoCurrency für 2021-08-01 22:14:21
Hole CryptoCurrency für 2021-08-01 23:54:25
Hole Bitcoin für 2021-08-01 01:47:37
Hole Bitcoin für 2021-08-01 12:01:24
Hole Bitcoin für 2021-08-01 18:27:28
Hole Bitcoin für 2021-08-01 23:47:46
Hole CryptoMarkets für 2021-08-01 01:47:37
Hole CryptoMarkets für 2021-08-01 21:39:10
Hole Crypto für 2021-08-01 01:47:37
Hole CryptoCurrencyTrading für 2021-08-01 01:47:37
Hole CryptoCurrencyTrading für 2021-08-01 06:40:48
Hole CryptoC

<h3>Lege benötigte Daten in Datenframe ab</h3>

In [4]:
df = pd.DataFrame()  # initialize dataframe

for post in all_data:
    if 'selftext' in post:
        df = df.append({
            'subreddit': post['subreddit'],
            'title': post['title'],
            'selftext': post['selftext'],
            'upvoteRatio': post['upvote_ratio'],
            'created': datetime.fromtimestamp(post['created_utc'])
        }, ignore_index=True)

df.head()

Unnamed: 0,created,selftext,subreddit,title,upvoteRatio
0,2021-08-01 00:00:58,,CryptoCurrency,Bitcoin Mining Difficulty Increases For First ...,1.0
1,2021-08-01 00:01:09,[removed],CryptoCurrency,Sunsetcrypto Finance,1.0
2,2021-08-01 00:01:10,[removed],CryptoCurrency,Can anyone give info about bnb diamond? Is thi...,1.0
3,2021-08-01 00:01:27,We’ve all been there seeing a massive pump of ...,CryptoCurrency,Don’t Go Chasing Waterfalls Because You Will E...,1.0
4,2021-08-01 00:01:50,Almost everyones goal would be to achieve bein...,CryptoCurrency,"Financial independence is the goal, but what t...",1.0


<h2> Preprocessing </h2>

<h4> Filtern auf Subreddit </h4>

In [5]:
df_filtered = df[df['subreddit'].apply(lambda x: x in SUBREDDITS)]

<h4> Entfernen Posts ohne Text </h4>

In [6]:
df_filtered = df_filtered[df_filtered['selftext']!='[removed]']
df_filtered = df_filtered[df_filtered['selftext']!='']

<h4> Entfernen der Umbrüche </h4>

In [7]:
df_filtered['selftext'] = df_filtered['selftext'].apply(lambda x: x.replace("\n"," "))

<h4> Schreiben in csv Datei </h4>

In [8]:
df_filtered.to_csv('redditdaten.csv')

In [9]:
df.to_csv('unfiltered.csv')