<h1>Reddit API</h1>

<h3>Imports</h3>

In [1]:
import pandas as pd
from time import sleep
from datetime import datetime
import requests

# Start and end of the query window as integer Unix timestamps.
# The datetimes are naive, so .timestamp() converts them using the
# local timezone of the machine running the notebook.
BEGINN_DATUM = int(datetime(2021, 8, 1).timestamp())
ENDE_DATUM = int(datetime(2021, 8, 2).timestamp())

# Subreddits of interest; used for fetching and for filtering the results.
SUBREDDITS = ['CryptoCurrency', 'Bitcoin']

<h3>Funktion zum Laden der Posts</h3>

In [2]:
def get_posts_for_time_period(sub, beginning, end=None, max_retries=5):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Parameters
    ----------
    sub : str
        Value sent as the ``q`` query parameter.
        NOTE(review): ``q`` is a search term, so the result can contain posts
        from other subreddits (the notebook filters on subreddit afterwards).
        Confirm whether restricting the query to a subreddit was intended.
    beginning : int
        Unix timestamp sent as ``after``.
    end : int, optional
        Unix timestamp sent as ``before``. Defaults to the current time,
        evaluated on every call rather than once at definition time.
    max_retries : int, optional
        Maximum number of additional attempts after an HTTP 429 response.

    Returns
    -------
    list
        The ``data`` entry of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the final response has an error status (including a 429 that
        persists after all retries).
    """
    if end is None:
        # A default in the signature would be frozen at definition time and
        # go stale in a long-running kernel.
        end = int(datetime.now().timestamp())

    print("Querying pushshift")
    url = "https://api.pushshift.io/reddit/submission/search/"
    # Let requests build and URL-encode the query string.
    params = {"q": sub, "limit": 500, "after": beginning, "before": end}

    response = requests.get(url, params=params)
    attempt = 0
    while response.status_code == 429 and attempt < max_retries:
        print("Pause für HTTP-TooManyRequests")
        # Back off a little longer after every rejected attempt.
        sleep(0.5 * 2 ** attempt)
        attempt += 1
        response = requests.get(url, params=params)

    # Fail loudly on HTTP errors instead of with a confusing JSON/KeyError.
    response.raise_for_status()

    resp_json = response.json()

    return resp_json['data']

<h3>Hole Daten zu Posts</h3>

In [3]:
# Collect the posts of every configured subreddit in one flat list.
all_data = []
for subreddit in SUBREDDITS:
    beginning_timestamp = BEGINN_DATUM
    end_timestamp = ENDE_DATUM
    # Query the subreddit of the current iteration (previously the name
    # "CryptoCurrency" was hard-coded, so the other entries were never fetched
    # and the same posts were collected once per loop iteration).
    data = get_posts_for_time_period(subreddit, beginning_timestamp, end_timestamp)
    all_data.extend(data)
    # NOTE(review): a page with 50 or more posts is taken as a sign that more
    # data may follow; confirm this threshold against the API's page size.
    while len(data) >= 50:
        # Continue one second after the last post of the previous page.
        last_one = data[-1]
        beginning_timestamp = last_one['created_utc'] + 1
        print('Hole Daten für '+str(datetime.fromtimestamp(beginning_timestamp)))
        data = get_posts_for_time_period(sub=subreddit, beginning=beginning_timestamp, end=end_timestamp)
        all_data.extend(data)

Querying pushshift
Hole Daten für 2021-08-01 00:58:01
Querying pushshift
Hole Daten für 2021-08-01 02:03:47
Querying pushshift
Hole Daten für 2021-08-01 03:00:17
Querying pushshift
Hole Daten für 2021-08-01 04:11:34
Querying pushshift
Hole Daten für 2021-08-01 05:31:51
Querying pushshift
Hole Daten für 2021-08-01 06:32:06
Querying pushshift
Hole Daten für 2021-08-01 07:40:57
Querying pushshift
Hole Daten für 2021-08-01 08:12:36
Querying pushshift
Hole Daten für 2021-08-01 09:13:47
Querying pushshift
Hole Daten für 2021-08-01 10:19:38
Querying pushshift
Hole Daten für 2021-08-01 11:17:45
Querying pushshift
Hole Daten für 2021-08-01 12:11:21
Querying pushshift
Hole Daten für 2021-08-01 12:54:09
Querying pushshift
Hole Daten für 2021-08-01 13:43:57
Querying pushshift
Hole Daten für 2021-08-01 15:14:28
Querying pushshift
Hole Daten für 2021-08-01 16:15:10
Querying pushshift
Hole Daten für 2021-08-01 17:11:04
Querying pushshift
Hole Daten für 2021-08-01 18:15:40
Querying pushshift
Hole Date

<h3>Lege benötigte Daten in einem DataFrame ab</h3>

In [4]:
# Keep only the fields needed downstream, one record per post.
# Building the records first and constructing the frame once avoids the
# quadratic row-by-row DataFrame.append (removed in pandas 2.0).
records = [
    {
        'subreddit': post['subreddit'],
        'title': post['title'],
        'selftext': post['selftext'].strip(),
        'upvoteRatio': post['upvote_ratio'],
        # Converted to a naive datetime in the machine's local timezone.
        'created': datetime.fromtimestamp(post['created_utc']),
    }
    for post in all_data
]

# Explicit columns keep the alphabetical column order that the row-wise
# append produced and guarantee the columns exist even when no posts
# were fetched.
df = pd.DataFrame(
    records,
    columns=['created', 'selftext', 'subreddit', 'title', 'upvoteRatio'],
)

df.head()

Unnamed: 0,created,selftext,subreddit,title,upvoteRatio
0,2021-08-01 00:00:40,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On P...,CryptocurrencyICO,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On Pa...,1.0
1,2021-08-01 00:00:59,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On P...,MarsWallStreet,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On Pa...,1.0
2,2021-08-01 00:01:24,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On P...,CryptoMarsShots,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On Pa...,1.0
3,2021-08-01 00:01:47,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On P...,CryptoMoon,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On Pa...,1.0
4,2021-08-01 00:02:13,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On P...,CryptoMars,🍕HOTCAKE🍕 ⏰ 1 SECOND OLD ON BSC 🍕 Listed On Pa...,1.0


<h2> Preprocessing </h2>

<h4> Entfernen von Posts ohne Text </h4>

<h4> Filtern auf Subreddit </h4>

In [9]:
# Keep only the posts that were published in one of the configured subreddits.
df_filtered = df[df['subreddit'].isin(SUBREDDITS)]

<h4> Entfernen von Posts ohne Text </h4>

In [11]:
# Drop posts that carry no usable body text: empty bodies and the
# placeholders left behind for removed or deleted posts.
df_filtered = df_filtered[
    ~df_filtered['selftext'].isin(['', '[removed]', '[deleted]'])
]

<h4> Entfernen der Umbrüche </h4>

In [12]:
# Replace line breaks with spaces so every post body sits on a single line.
# assign() builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning; regex=False keeps the replacement a
# literal one.
df_filtered = df_filtered.assign(
    selftext=df_filtered['selftext'].str.replace("\n", " ", regex=False)
)

<h4> Schreiben in csv Datei </h4>

In [13]:
# Persist the cleaned posts; the row index is written as the first column.
df_filtered.to_csv(path_or_buf='redditdaten.csv')
