## Pre-requirements

In [45]:
# # obtain required packages
# !pip install python-dotenv
# !wget -P . https://files.pythonhosted.org/packages/cc/7a/240ac474f69a58a5dda0513557fe9c2ada279f7d5d836b93d5f6e05a9aa7/Mastodon.py-1.8.1.tar.gz 

In [46]:
# # install Mastodon library for conda env
# conda install Mastodon.py-1.8.1.tar.gz

In [47]:
# # (OR try th following) install for conda env
# conda install -c conda-forge python-dotenv
# conda install -c conda-forge mastodon.py

## File to record the data

In [48]:
DATA_FILE = 'Mastodon_social_3.json'

## Check the pre-requirements are satisfied

In [49]:
# Import required packages
from dotenv import load_dotenv
from mastodon import Mastodon
import os

from bs4 import BeautifulSoup as bs4
import requests

# Load variables from .env file
load_dotenv()

True

## Prepare for the Mastodon Harvester

In [50]:
# obtain access token from Mastodon
ACCESS_TOKEN = os.environ.get("MASTODON_ACCESS_TOKEN")

# define base URL
BASE_URL = "https://mastodon.social"

In [51]:
# create Mastodon client object
mastodon = Mastodon(api_base_url=BASE_URL, access_token=ACCESS_TOKEN)
mastodon.retrieve_mastodon_version()

'4.1.2'

In [52]:
# sample scraper: use mastodon api to obtain content from a post, then parse it with BeautifulSoup
bs4(mastodon.status("110284900191177710")["content"], 'html.parser').text

'A history of the top marginal tax rates on the wealthiest Americans: 1940: 81%1950: 84% 1960: 91% 1970: 72% 1980: 70% 1990: 28% 2000: 40% 2010: 35% For 50 years, corporate backed politicians in Congress have slashed taxes to line the pockets of their wealthy donors.'

In [53]:
# Import required packages
from mastodon import Mastodon, MastodonNotFoundError, MastodonRatelimitError, StreamListener
import csv, os, time, json

# defien Mastodon client object
m = Mastodon(
        api_base_url=BASE_URL,
        access_token=ACCESS_TOKEN
    )

# Define a Mastodon Listener class
# This listener class will print out the JSON of any updates that come through the stream
# Ti will be an infinite loop, so you will need to stop the program or kernel to stop it
class MyListener(StreamListener):
    def on_update(self, status):
        # This function will be called whenever a new public message is received
        # status is a dictionary containing the message data
        with open(DATA_FILE, 'a', encoding='utf-8') as f:
            # Append the new message to the JSON file
            json.dump(status, f, indent=2, sort_keys=True, default=str)
            f.write(',\n')



## Start (or Continue) harvesting

In [54]:
if not os.path.exists(DATA_FILE):
    # start the list if NO harvesting
    with open(DATA_FILE, 'a', encoding='utf-8') as f:
        f.write('[\n')  
else:
  # replace the last "\n]" with ",\n" to keep going
  with open(DATA_FILE, 'rb+') as f:
    f.seek(-3, os.SEEK_END)
    f.truncate()

  with open(DATA_FILE, 'a') as f:
    f.write(',\n')

listener = MyListener()
m.stream_public(listener)

KeyboardInterrupt: 

## End harvesting

In [55]:
# replace the last ",\n" with "\n]" to end the list
with open(DATA_FILE, 'rb+') as f:
  f.seek(-3, os.SEEK_END)
  f.truncate()

with open(DATA_FILE, 'a') as f:
  f.write('\n]')



## Check the recorded file is in correct format: i.e., the file contains a list of json objects

In [56]:
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    file = json.load(f)
    print(len(file))

572
