In [9]:
from pyquery import PyQuery as pq
from tqdm import tqdm

import csv
import datetime
from glob import glob

In [10]:
# Build `masterlist`: one dict per <item> found in every downloaded RSS file.
# The insertion order of the keys in each dict matters -- it becomes the CSV
# column order when the rows are written out later -- so keep it stable:
# rssindex, authors, title, introtext, good-date, original-date,
# maybe-archive, original-link, categories.
rssdir = "rss/"
masterlist = []
for rssfile in tqdm(glob(rssdir + "*")):
    if ".aria2" in rssfile:   # Skip stuff still downloading
        continue
    # Strip the directory prefix, whether the path came back with "/" or "\".
    indexfile = rssfile.replace(rssdir, "").replace(rssdir.replace('/', '\\'), '')
    with open(rssfile, "rb") as infile:
        html = infile.read()
    # The file is fully read at this point, so parse with the handle closed.
    for item in pq(html)("item"):
        entry = pq(item)   # Wrap the element once instead of once per field
        link = entry("link").text()
        # Element names are matched in lowercase here, so re-parse the item
        # markup with the tag renamed before selecting it.
        # NOTE(review): presumably a workaround for selector case handling -- confirm.
        original_date = pq(entry.html().replace("pubDate", "pubdate"))("pubdate").text()
        # Tue, 25 Aug 2020 11:38:02 GMT
        # Raises ValueError if the item has no date in exactly this format.
        published = datetime.datetime.strptime(original_date, "%a, %d %b %Y %H:%M:%S GMT")

        line = {}
        line["rssindex"] = indexfile
        # Raw string: "\:" is not a valid Python escape sequence; the backslash
        # is there to escape the namespace colon in the CSS selector.
        line["authors"] = "|".join(
            pq(au).text().strip() for au in entry(r"dc\:creator")
        )
        line['title'] = entry("title").text().strip()
        # Inner markup of the first <description>; IndexError if there is none.
        line['introtext'] = pq(entry("description")[0]).html()
        # Sortable timestamp whose first four characters are the year.
        line["good-date"] = published.strftime("%Y-%m-%d %H%M%S")
        line['original-date'] = original_date
        # Wayback Machine wildcard lookup for the link minus its cms./www. prefix.
        line["maybe-archive"] = (
            "https://web.archive.org/web/*/"
            + link.replace("https://cms.", "").replace("https://www.", "")
        )
        line['original-link'] = link
        line['categories'] = '|'.join(pq(cat).text() for cat in entry("category"))
        masterlist.append(line)

100%|████████████████████████████████████████████████████████████████████████████| 31148/31148 [15:44<00:00, 32.99it/s]


In [21]:
# The combined index needs to be split across several files. Order the rows
# newest-first, then bucket them on the four-character year prefix of
# 'good-date' so that yearholder maps year -> list of rows.
sortedlist = list(masterlist)
sortedlist.sort(key=lambda entry: entry['good-date'], reverse=True)
yearholder = {}
for row in sortedlist:
    year = row['good-date'][:4]
    yearholder.setdefault(year, []).append(row)

In [33]:
# Splitting by era does not work: the late 2010s are twice the size of
# everything else combined. Write one CSV per year instead.

for year, rows in yearholder.items():
    annual_path = f"vice-index-annual-{year}.csv"
    with open(annual_path, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        # Write the header, using the column names of the first parsed row
        writer.writerow(list(masterlist[0].keys()))
        # Values go out positionally, in each row's own key order
        writer.writerows(list(row.values()) for row in rows)

In [37]:
# Tally stories per author per year: authorindex[author][year] -> count.
# An empty 'authors' field splits to [""], so rows without a byline are
# counted under the empty-string author.
authorindex = {}
for year, rows in yearholder.items():
    for row in rows:
        for author in row['authors'].split("|"):
            per_year = authorindex.setdefault(author, {})
            per_year[year] = per_year.get(year, 0) + 1

In [38]:
# Flatten authorindex into one CSV row per (author, year) pair, ordered by
# author and then by year.
with open("author-index.csv", "w", newline="", encoding="utf-8") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["author", "year", "storycount"])
    for author, by_year in sorted(authorindex.items()):
        for year, storycount in sorted(by_year.items()):
            writer.writerow([author, year, storycount])