In [1]:
from mastodon import Mastodon
import os
from dotenv import load_dotenv
from datetime import datetime
import pytz
from bs4 import BeautifulSoup

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('punkt')

load_dotenv()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/huangsunchuangyu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/huangsunchuangyu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/huangsunchuangyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
class MastodonInfo:
    def __init__(self, server):
        self.server_name = server
        self.client_id=os.environ.get(f"MASTODON_{server}_CLIENT_KEY")
        self.client_secret=os.environ.get(f"MASTODON_{server}_CLIENT_SECRET")
        self.access_token=os.environ.get(f"MASTODON_{server}_ACCESS_TOKEN")
        self.api_base_url=os.environ.get(f"MASTODON_{server}_URL")
        
    def __repr__(self):
        return f"Mastodon server: {self.server_name}"
    

In [42]:
MASTODON_SERVERS = ["SOCIAL", "AU", "TICTOC_SOCIAL", 
                    "NEURODIVERSITY", "THEBLOWER", "AUS_SOCIAL"]
MASTODON_SERVERS = [MastodonInfo(server) for server in MASTODON_SERVERS]

In [43]:
MASTODON_SERVERS[-1]

Mastodon server: AUS_SOCIAL

In [115]:
mastodon = Mastodon(
    client_id=MASTODON_SERVERS[-2].client_id,
    client_secret=MASTODON_SERVERS[-2].client_secret,
    access_token=MASTODON_SERVERS[-2].access_token,
    api_base_url=MASTODON_SERVERS[-2].api_base_url
)

In [122]:
def binary_search_date(since_date, until_date, lower_id, upper_id, mastodon):
    melbourne_tz = pytz.timezone('Australia/Melbourne')
    
    since_date = since_date.replace(tzinfo=pytz.UTC).astimezone(melbourne_tz)
    until_date = until_date.replace(tzinfo=pytz.UTC).astimezone(melbourne_tz)

    id_changes = []

    while True:
        lower_toot = mastodon.timeline_public(min_id=lower_id, limit=1)[0]
        upper_toot = mastodon.timeline_public(max_id=upper_id, limit=1)[0]

        lower_created_at = datetime.fromisoformat(str(lower_toot['created_at'])).replace(tzinfo=pytz.UTC)
        upper_created_at = datetime.fromisoformat(str(upper_toot['created_at'])).replace(tzinfo=pytz.UTC)
        lower_created_at = lower_created_at.astimezone(melbourne_tz)
        upper_created_at = upper_created_at.astimezone(melbourne_tz)

        if lower_created_at.date() == since_date.date() and upper_created_at.date() == until_date.date():
            break

        mid_id = (upper_id + lower_id) // 2
        toot = mastodon.timeline_public(max_id=mid_id, limit=1)[0]
        created_at = datetime.fromisoformat(str(toot['created_at'])).replace(tzinfo=pytz.UTC)
        created_at = created_at.astimezone(melbourne_tz)

        if created_at.date() == since_date.date():
            break

        id_changes.append((lower_id, upper_id))

        if len(id_changes) >= 6:
            last_five_id_changes = id_changes[-5:]
            if all(x == last_five_id_changes[0] for x in last_five_id_changes):
                break

        if created_at < since_date:
            lower_id = toot['id']
        else:
            upper_id = toot['id']

    print()  # To move to the next line in the terminal after the search is completed
    return lower_id


In [123]:
since_date = datetime(2021, 12, 30)  # Fetch data since January 1, 2020
until_date = datetime(2022, 1, 1)  # Fetch data until Januray 1, 2020

In [124]:
lower_id = 107476658235995721   
upper_id = 107564295751148351 

In [125]:
response = mastodon.timeline_public(min_id='107476658235995721', limit=1)
response

[{'id': 107533255740844529,
  'created_at': datetime.datetime(2021, 12, 30, 1, 21, 6, tzinfo=tzutc()),
  'in_reply_to_id': None,
  'in_reply_to_account_id': None,
  'sensitive': False,
  'spoiler_text': '',
  'visibility': 'public',
  'language': 'fr',
  'uri': 'https://mastodon.social/users/Callystor/statuses/107533255733238174',
  'url': 'https://mastodon.social/@Callystor/107533255733238174',
  'replies_count': 0,
  'reblogs_count': 0,
  'favourites_count': 0,
  'edited_at': None,
  'favourited': False,
  'reblogged': False,
  'muted': False,
  'bookmarked': False,
  'content': "<p>Juste pour le plaisir, ce démontage en règle de l'escroquerie néolibérale est à voir et revoir, spécialement en période électorale...<br>🙏 @caissesdegreve@twitter.com</p>",
  'filtered': [],
  'reblog': None,
  'account': {'id': 109425539629584732,
   'username': 'Callystor',
   'acct': 'Callystor@mastodon.social',
   'display_name': '🌿🐝 Реми φ🍒🐢',
   'locked': False,
   'bot': False,
   'discoverable': F

In [126]:
binary_search_date(since_date, until_date, lower_id, upper_id, mastodon)

107476658235995721 107564295751148351


107476658235995721

In [None]:
MASTODON_SERVERS_INFO[0].api_base_url

In [None]:
mastodon = Mastodon(
    client_id=MASTODON_SERVERS_INFO[0].client_id,
    client_secret=MASTODON_SERVERS_INFO[0].client_secret,
    access_token=MASTODON_SERVERS_INFO[0].access_token,
    api_base_url=MASTODON_SERVERS_INFO[0].api_base_url
)

In [84]:
mastodon.retrieve_mastodon_version()

'4.1.0'

In [14]:
response = mastodon.timeline_public(min_id=103388556664584957, limit=40)

In [15]:
def normalize_string(input_string):
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(input_string)
    normalized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    normalized_string = ' '.join(normalized_tokens)
    return normalized_string

def sentiment_analysis(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    compound_score = sentiment['compound']
    
    if compound_score <= -0.8:
        return 1
    elif -0.8 < compound_score <= -0.6:
        return 2
    elif -0.6 < compound_score <= -0.4:
        return 3
    elif -0.4 < compound_score <= -0.2:
        return 4
    elif -0.2 < compound_score < 0.2:
        return 5
    elif 0.2 <= compound_score < 0.4:
        return 6
    elif 0.4 <= compound_score < 0.6:
        return 7
    elif 0.6 <= compound_score < 0.8:
        return 8
    else:
        return 9

In [17]:
class Toot:
    def __init__(self, tid=None, date=None, author=None, 
                 lang=None, content=None, score=None, tags=None):
        self.tid = tid
        self.date = date
        self.author = author
        self.lang = lang
        self.content = content
        self.score = score
        self.tags = tags
        
    def __repr__(self):
        return f"Toot {self.tid}"
        
    def to_dict(self):
        data = dict(tid=self.tid, date=self.date,
                    author=self.author, lang=self.lang,
                    content=content.self.content, score=self.score)
        return data

def extract_mastodon_info(res):
    toot_id = res.id
    date = res.created_at.strftime("%Y-%m-%dT%H:%M:%S.000Z")
    lang = res.language
    content = normalize_string(BeautifulSoup(res.content, 'html.parser').text)
    score = sentiment_analysis(content)
    tags = "|".join(res.tags)
    
    return Toot(tid=toot_id, date=date, lang=lang, content=content, score=score, tags=tags)

extract_mastodon_info(response[-1]).tags

''

In [78]:
response[-1]

{'id': 103388556698881147,
 'created_at': datetime.datetime(2019, 12, 29, 1, 49, 36, tzinfo=tzutc()),
 'in_reply_to_id': None,
 'in_reply_to_account_id': None,
 'sensitive': False,
 'spoiler_text': '',
 'visibility': 'public',
 'language': 'ja',
 'uri': 'https://vocalodon.net/users/hanapo/statuses/103388556591121713',
 'url': 'https://vocalodon.net/@hanapo/103388556591121713',
 'replies_count': 0,
 'reblogs_count': 0,
 'favourites_count': 0,
 'edited_at': None,
 'favourited': False,
 'reblogged': False,
 'muted': False,
 'bookmarked': False,
 'content': '<p>紙のしかなかったのでそれ買ってきた</p>',
 'filtered': [],
 'reblog': None,
 'account': {'id': 103157,
  'username': 'hanapo',
  'acct': 'hanapo@vocalodon.net',
  'display_name': 'はなぽ🍊わんわんPの皮',
  'locked': False,
  'bot': False,
  'discoverable': True,
  'group': False,
  'created_at': datetime.datetime(2017, 4, 19, 0, 0, tzinfo=tzutc()),
  'note': '<p>ニコニコ投稿曲：<a href="https://nico.ms/mylist/31873462" rel="nofollow noopener noreferrer" target="_blank

In [72]:
len(mastodon.timeline_public(min_id=103388556698881147, limit=40))

40

In [55]:
binary_search_date(since_date, until_date, 
                   lower_id, upper_id, mastodon)

IndexError: list index out of range