In [1]:
from datetime import datetime, timedelta
from pushshift_py import PushshiftAPI
import numpy as np
import copy
import yaml
import time
import os

In [2]:
class Collector():
    """Collect hourly Pushshift submission search results for one month of 2020.

    For every one-hour window of the requested month the collector queries
    Pushshift for submissions matching ``query``, aggregates the hits per
    subreddit and, once a calendar day is complete, appends that day to a YAML
    file named ``<Mon>_<query>_2020.yaml`` in the current working directory.

    Attributes:
        store_dict: finished days waiting to be written, keyed by day of month.
        start: beginning of the next one-hour query window.
        end: end of the next one-hour query window.
        query: search term passed to Pushshift as ``q``.
        api: the ``PushshiftAPI`` client used for the searches.
        month_dict: month number -> abbreviation used in the file name.
        filename: YAML file the currently selected month is written to.
    """

    def __init__(self, start = datetime(2020,1,1,0), end = datetime(2020,1,1,1), query = "AI"):
        """Set up the API client and the initial one-hour query window.

        Args:
            start: beginning of the first query window.
            end: end of the first query window.
            query: search term to look for.
        """
        self.store_dict = {}
        self.start = start
        self.end = end
        self.query = query
        self.api = PushshiftAPI()
        # NOTE: "Dez" (not "Dec") ends up in the file name; it is kept as is
        # so that files written by earlier runs are still found on resume.
        self.month_dict = {1: "Jan",
                           2: "Feb",
                           3: "Mar",
                           4: "Apr",
                           5: "May",
                           6: "Jun",
                           7: "Jul",
                           8: "Aug",
                           9: "Sep",
                           10: "Oct",
                           11: "Nov",
                           12: "Dez"}
        self.filename = "{month}_{query}_2020.yaml".format(month=self.month_dict[self.start.month], query=query)

    def collect(self, month = 1):
        """Query every hour of ``month`` (in 2020) and store the results day by day.

        If a YAML file for this month/query already exists, collection resumes
        on the day after the highest day recorded in it. Each finished day is
        written immediately through ``store_data`` so an interrupted run loses
        at most the day in progress.

        Args:
            month: month number (1-12) to collect.
        """
        # The collector can be pointed at any month, not just the one it was
        # constructed with.
        if self.start.month != month:
            self.start = datetime(2020, month, 1, 0)
            self.end = datetime(2020, month, 1, 1)
            self.filename = "{month}_{query}_2020.yaml".format(month=self.month_dict[month], query=self.query)

        if os.path.isfile(self.filename):
            with open(self.filename, 'r') as file:
                # NOTE(review): FullLoader is kept because this file is written
                # by store_data itself; do not point this at untrusted YAML.
                recorded = yaml.load(file, Loader=yaml.FullLoader)
            # An empty file loads as None; in that case there is nothing to resume from.
            if recorded:
                last_recorded_day = max(recorded)
                # Adding a timedelta (instead of day + 1) rolls over correctly
                # when the last recorded day is the last day of the month.
                self.start = datetime(2020, month, last_recorded_day) + timedelta(days=1)
                self.end = self.start + timedelta(hours=1)
                print("Assest New Start at {0}".format(self.start))
            del recorded

        # Aggregated results of the day currently being collected.
        day_dict = {}

        # Iterate over every one-hour window that starts inside the month, so
        # the final 23:00-00:00 window of the last day is included as well.
        while self.start.month == month:

            # Convert the window bounds to epoch seconds. The datetimes are
            # naive, so timestamp() interprets them in the machine's local time zone.
            s = int(self.start.timestamp())
            e = int(self.end.timestamp())
            # Random pauses of 1-4 seconds before and after each request.
            rand1 = np.random.randint(1,5)
            rand2 = np.random.randint(1,5)
            time.sleep(rand1)
            # Run the query and measure how long it takes.
            start_time = time.perf_counter()
            data = list(self.api.search_submissions(after=s,
                                   before=e,
                                   filter=['subreddit', 'selftext', 'title'],
                                   q=self.query))
            end_time = time.perf_counter()
            time.sleep(rand2)
            # Elapsed query time in minutes.
            diff_time = (end_time - start_time) / 60
            print("Found {entry} submissions with {query} in hour {hour} of day {day}. It took me {time} minutes".format(entry=len(data), query=self.query, hour=self.start.hour, day = self.start.day, time = round(diff_time, 2)))

            # Fold this hour's results into the running per-day aggregate.
            self.process(data, day_dict)

            # Remember which day the processed hour belongs to *before* moving
            # the window, so hour 23 is stored with its own day.
            finished_day = self.start.day
            self.start = self.start + timedelta(hours=1)
            self.end = self.end + timedelta(hours=1)

            # The window now starts on a different day: the previous day is
            # complete, so write it to the YAML file.
            if self.start.day != finished_day:
                print("Collected {day}".format(day=finished_day))
                self.store_dict[finished_day] = day_dict
                self.store_data()
                # Start a fresh aggregate; the stored one is left untouched.
                day_dict = {}


    def process(self, data, tmp_dict):
        """Aggregate search results per subreddit into ``tmp_dict`` (in place).

        For each subreddit ``tmp_dict`` holds ``{'count', 'title', 'text'}``:
        ``count`` is incremented for every entry, while a title (and its text)
        is only recorded the first time that title is seen for the subreddit.

        Args:
            data: iterable of search results.
            tmp_dict: aggregate to update.
        """
        for entry in data:
            # assumes the second-to-last field of each result is the raw dict
            # of requested fields — TODO confirm against pushshift_py
            entry_dict = entry[-2]
            subreddit = entry_dict['subreddit']
            title = entry_dict['title']
            # Some submissions come back without 'selftext'.
            text = entry_dict.get('selftext', "Missing")

            bucket = tmp_dict.get(subreddit)
            if bucket is None:
                tmp_dict[subreddit] = {'count': 1, 'title': [title], 'text': [text]}
                continue

            # Every occurrence is counted, even if the same title shows up again ...
            bucket['count'] += 1
            # ... but each title and its text are stored only once.
            if title not in bucket['title']:
                bucket['title'].append(title)
                bucket['text'].append(text)


    def store_data(self):
        """Merge ``store_dict`` into the YAML file and clear ``store_dict``.

        Days already present in the file are kept; days with the same key are
        overwritten by the newly collected data.
        """
        data = {}
        if os.path.isfile(self.filename):
            with open(self.filename, 'r') as file:
                # An empty file loads as None; treat it like "no data yet".
                data = yaml.load(file, Loader=yaml.FullLoader) or {}
        data.update(self.store_dict)
        with open(self.filename, 'w') as file:
            yaml.dump(data, file)
        self.store_dict.clear()
        print("stored data in {doc}".format(doc=self.filename))

In [3]:
def print_hi():
    """Build a Collector with its default arguments and collect month 11."""
    Collector().collect(month=11)

# Entry point: start the collection when this code runs as the main module.
if __name__ == "__main__":
    print_hi()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/


Found 1953 submissions with AI in hour 0 of day 1. It took me 3.91 minutes
Found 1893 submissions with AI in hour 1 of day 1. It took me 4.71 minutes
Found 1330 submissions with AI in hour 2 of day 1. It took me 3.5 minutes
Found 1425 submissions with AI in hour 3 of day 1. It took me 2.6 minutes
Found 1731 submissions with AI in hour 4 of day 1. It took me 4.53 minutes
Found 1153 submissions with AI in hour 5 of day 1. It took me 2.77 minutes
Found 780 submissions with AI in hour 6 of day 1. It took me 1.91 minutes
Found 1941 submissions with AI in hour 7 of day 1. It took me 3.81 minutes
Found 920 submissions with AI in hour 8 of day 1. It took me 2.41 minutes
Found 1055 submissions with AI in hour 9 of day 1. It took me 2.68 minutes
Found 1272 submissions with AI in hour 10 of day 1. It took me 2.55 minutes
Found 1426 submissions with AI in hour 11 of day 1. It took me 3.91 minutes
Found 1216 submissions with AI in hour 12 of day 1. It took me 3.32 minutes
Found 1612 submissions wit