In [9]:
import requests

import altair as alt
import pandas as pd

from sqlalchemy import func
from sqlmodel import Session, select
from tqdm import tqdm

from store_messages import Message, Link, activate_db


In [10]:
engine = activate_db('messages.db')
with Session(engine) as session:
    query = select(func.count(Message.id))
    number_of_messages = session.exec(query)
    print(next(number_of_messages))

    query = select(Message.year, Message.month, func.count(Message.id)).group_by(Message.year, Message.month)
    messages_grouped_by_year_month = session.exec(query)
    grouped_data = pd.DataFrame(messages_grouped_by_year_month.all(), columns=("year", "month","count"))
    grouped_data.insert(0, 'year_month', grouped_data.apply(lambda x: f'{x.year}{x.month:02}', axis=1))
    freq_plot = alt.Chart(grouped_data).mark_bar().encode(x='year_month', y='count')
freq_plot

657577


In [11]:
link_shorteners = ('aka.ms', 'bit.ly', 'goo.gl', 'j.mp', 't.co', 't.me', 't.ly')
with Session(engine) as session:
    links = session.exec(select(Link.link))
    link_count = 0
    link_set = set()
    link_prefixes = set()
    for link in links:
        link_set.add(link)
        link = link.replace('https://','').replace('http://', '').lower()
        link_parts = link.split('/')
        if len(link_parts[0]) < 7:
            link_prefixes.add(link_parts[0])
        link_count += 1
    print(link_count, len(link_set))
    print(sorted(link_prefixes))

586760 45367
['45.ooo', 'abr.to', 'acc.no', 'aka.ms', 'au.int', 'bbc.in', 'bit.do', 'bit.ly', 'boe.es', 'ch7.io', 'chd.tv', 'clc.am', 'cnb.cx', 'cnn.it', 'co.za', 'ctt.ac', 'ctt.ec', 'dld.bz', 'dtv.to', 'e.tv', 'ept.ms', 'etc.as', 'ew.com', 'exc.to', 'fb.me', 'g.co', 'goo.gl', 'gov.uk', 'gov.za', 'https:', 'i.do', 'i.live', 'ibb.co', 'ift.tt', 'ifw.io', 'is.gd', 'it.is', 'ivm.to', 'j.mp', 'jmp.sh', 'kw.be', 'lc.org', 'm.me', 'mol.im', 'mr.pm', 'nj.com', 'nos.nl', 'now.be', 'opr.as', 'osf.io', 'ow.ly', 'qz.com', 'rb.gy', 'rki.de', 'rt.com', 'rtv.be', 'sc.mp', 'str.sg', 't.co', 't.com', 't.ly', 't.me', 'tkp.at', 'to.pin', 'uk.gov', 'un.org', 'up.as', 'us.gov', 'vac.uk', 'vb.me', 'vk.com', 'w-j.co', 'w.app', 'wa.me', 'we.tl', 'wef.ch', 'wp.me', 'x.com', 'xy2.eu', 'yle.fi', '👉t.me']


In [12]:
resolved_links: dict[str, str] = {}
unresolved_links = set()
for link in tqdm(link_set):
    original_link = link
    if not link.startswith('http'):
        link = 'http://' + link
    try:
        r = requests.head(link, timeout=5)
        if r.status_code == 200:
            resolved_links[original_link] = r.url
        else:
            unresolved_links.add(original_link)
    except IOError:
        unresolved_links.add(original_link)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 45367/45367 [27:38:26<00:00,  2.19s/it]


In [13]:
len(unresolved_links)

24752

In [17]:
count = 0

for (original, resolved) in resolved_links.items():
    if original == resolved:
        continue
    print(original, resolved)
    count = count + 1
    if count > 100:
        break

https://phmpt.org https://phmpt.org/
https://www.naturalnews.com/2021-06-02-horrifying-study-reveals-mrna-vaccine-nanoparticles-are-circulated-throughout-the-entire-body-brain-heart-liver-ovaries-testes.html# https://www.naturalnews.com/2021-06-02-horrifying-study-reveals-mrna-vaccine-nanoparticles-are-circulated-throughout-the-entire-body-brain-heart-liver-ovaries-testes.html
www.openforall.co.uk http://www.openforall.co.uk/
www.miric.co.za http://www.miric.co.za/
m.vegas7games.com http://m.vegas7games.com/
https://www.unitedaustraliaparty.org.au https://www.unitedaustraliaparty.org.au/
corona.tuply.co.za http://corona.tuply.co.za/
https://www.naturalnews.com/2022-02-12-one-way-covid-masks-fake-news-cnn.html# https://www.naturalnews.com/2022-02-12-one-way-covid-masks-fake-news-cnn.html
https://www.medicinenet.com https://www.medicinenet.com/
https://www.facebook.com/thomas.davies.54/posts/4134936786617020?__cft__[0]=AZVR6kUKwnT9CUBBRsMP9zPFhIxXoT25Rk02zS1G0XATmgj08J3tMxb2PTPtNCDYplOEW