In [7]:
import json
from tinydb import TinyDB, Query, where
import time
import html2text
import re
from tqdm import tqdm
from collections import Counter
import string

In [8]:
# coursemap maps every accepted spelling of a course to its canonical id
# (e.g. for 'CS189' the keys 'CS 189', 'CS189' and '189' all map to 'CS189').
with open('courses.json') as courses_file:
    courses = json.load(courses_file)

coursemap = {}
for course in courses:
    # Split at the department/number boundary instead of assuming a two-letter
    # department, so 'EECS126' yields 'EECS 126' / '126' rather than
    # 'EE CS126' / 'CS126'. Two-letter prefixes produce the same keys as before.
    boundary = re.match(r'([A-Za-z]+)(\d.*)', course)
    if boundary:
        dept, number = boundary.groups()
    else:
        # No letters-then-digit shape: fall back to the original fixed split.
        dept, number = course[0:2], course[2:]
    coursemap[dept + ' ' + number] = course
    coursemap[course] = course
    # NOTE: bare numbers are shared across departments; the last course in
    # courses.json with a given number wins.
    coursemap[number] = course

In [9]:
# Extract professor last names (the last whitespace-separated token of each
# entry in profs.json) into profmap, keyed and valued by that last name.
with open('profs.json') as profs_file:
    proflist = json.load(profs_file)

profs = set()  # placeholder; rebound to a length-sorted list of profmap keys later
profmap = {}
for prof in proflist:
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so 'Jane Doe ' gives 'Doe' instead of an empty last name.
    name_parts = prof.split()
    if not name_parts:
        continue  # blank entry: nothing to register
    last_name = name_parts[-1]
    profmap[last_name] = last_name

In [10]:
"""
To extract tags, we find without replacement (start with biggest-length string, then go smaller.)
"""
courses = list(reversed(sorted(coursemap.keys(), key=len)))
profs = list(reversed(sorted(profmap.keys(), key=len)))
# Translation table turning every ASCII punctuation character into a space,
# so that 'CS189,' or '(Smith)' tokenize like 'CS189' and 'Smith'.
replace_punc = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def parse_endorse(data):
    """Return the 'name' of every entry in ``data['tag_good']``.

    A missing or ``None`` 'tag_good' yields an empty list. Callers use the
    length of the result as an upvote count.
    """
    return [row.get('name') for row in data.get('tag_good') or []]


def _normalize(text):
    """Lowercase ``text``, turn punctuation into spaces and collapse whitespace."""
    return ' '.join(text.lower().translate(replace_punc).split())


def extract_tags(content, categories, keymap):
    """Return the set of canonical tags whose keys occur in ``content``.

    Matching is case-insensitive, ignores punctuation and only matches whole
    whitespace-delimited tokens (or token sequences for multi-word keys).

    Args:
        content: free text to scan.
        categories: iterable of keys to look for, in priority order; callers
            pass them longest first so a long key is consumed before any
            shorter key it contains.
        keymap: dict mapping each key in ``categories`` to its canonical tag.

    Returns:
        A set of ``keymap`` values for every key found.
    """
    tags = set()
    # Pad BOTH ends so a key at the very start or end of the text still has a
    # space on each side and can match.
    content = ' ' + _normalize(content) + ' '

    for t in categories:
        token = _normalize(t)
        if not token:
            continue  # an empty key would otherwise match any gap between words
        needle = ' ' + token + ' '
        if needle in content:
            tags.add(keymap[t])
            # Consume every occurrence of the matched key (using the same
            # lowercased form that was matched) so shorter keys contained in
            # it cannot fire as well. Loop because adjacent occurrences share
            # their separating space and one replace() pass can miss them.
            while needle in content:
                content = content.replace(needle, ' ')
    return tags

def parse_faculty(content):
    """Return the professor tags found in ``content``."""
    return extract_tags(content, profs, profmap)


def parse_courses(content):
    """Return the course tags found in ``content``."""
    return extract_tags(content, courses, coursemap)

In [11]:
# Dump the metadata - we use it in various places.
# The file is written inside a `with` block so it is flushed and closed as
# soon as the dump finishes, rather than whenever the handle is collected.
with open('metadata.json', 'w') as metadata_file:
    json.dump({
        'coursemap': coursemap,
        'profmap': profmap,
        'courses': sorted(set(coursemap.values())),
        'profs': sorted(set(profmap.values()))
    }, metadata_file)

In [12]:
class Child:
    """One node in the reply tree under a post, with its own nested replies."""

    def __init__(self):
        # Give every attribute a default so an instance is always safe to
        # serialize, even when the source row lacks some fields.
        self.type = None
        self.upvotes = 0
        self.content = ''
        self.created = None
        self.date = None
        self.author = 'Student'
        self.faculty = set()
        self.courses = set()
        self.children = []
        self.searchContent = ''

    @staticmethod
    def parse_children(children):
        """Recursively build ``Child`` objects from a list of raw row dicts.

        Args:
            children: list of row dicts (``None`` is treated as empty).

        Returns:
            A list of ``Child`` instances, one per row, in input order. Each
            child's ``faculty``/``courses`` sets also include the tags found in
            its descendants, and ``searchContent`` is the space-joined text of
            the child and all of its descendants.
        """
        results = []
        for row in children or []:
            model = Child()
            model.type = row.get('type')
            model.upvotes = len(parse_endorse(row))
            if 'subject' in row:
                model.content = row.get('subject') or ''
            else:
                # No 'subject': take the text from the first history entry.
                history = row.get('history') or [{}]
                first_entry = history[0]
                model.content = first_entry.get('content') or ''
                model.created = first_entry.get('created')

            # Prefer the row's own timestamp; fall back to the history entry's
            # timestamp, which was previously read but never used.
            # NOTE(review): assumes that fallback is the intended date - confirm.
            model.date = row.get('created') or model.created
            model.faculty = parse_faculty(model.content)
            model.courses = parse_courses(model.content)
            model.author = 'Instructor' if row.get('type') == 'i_answer' else 'Student'

            nested = row.get('children')
            if nested is not None:
                model.children = Child.parse_children(nested)
                # Bubble descendants' tags up into this node.
                for f in model.children:
                    model.faculty.update(f.faculty)
                    model.courses.update(f.courses)

            # Join with spaces throughout so the last word of this node's text
            # is not fused with the first word of its first child.
            model.searchContent = ' '.join(
                [model.content] + [child.searchContent for child in model.children]
            )
            results.append(model)
        return results

    def to_json(self):
        """Return a JSON-serializable dict of this node and its descendants."""
        return {
            'content': self.content,
            'children': [child.to_json() for child in self.children],
            'upvotes': self.upvotes,
            'author': self.author,
            'date': self.date
        }

In [13]:
class Post:
    """A top-level post built from one raw export row."""

    def __init__(self, data):
        """Populate the post from ``data``, a raw row dict.

        Title and content come from the first entry of ``data['history']``;
        replies are parsed from ``data['children']``.
        """
        history = data.get('history') or [{}]
        first_entry = history[0]

        self.tags = data.get('tags')
        self.id = data.get('nr')
        self.author = 'Anonymous'
        self.date = data.get('created')
        # Fall back to '' so the string concatenations below cannot hit None.
        self.title = first_entry.get('subject') or ''
        self.content = first_entry.get('content') or ''
        self.children = Child.parse_children(data.get('children', []))
        # Separate content and title with a space; otherwise the last word of
        # the content fuses with the first word of the title and a name at
        # that boundary is never matched.
        self.faculty = parse_faculty(self.content + ' ' + self.title)
        self.courses = parse_courses(self.title)
        self.upvotes = len(parse_endorse(data))

    def to_json(self):
        """Return a JSON-serializable dict of the post and its reply tree."""
        # Space-join every piece so no two words are glued together at the
        # content/first-child boundary.
        search_parts = [self.title, self.content]
        search_parts.extend(child.searchContent for child in self.children)
        return {
            'id': self.id,
            'tags': list(self.tags or []),
            'date': self.date,
            'title': self.title,
            'content': self.content,
            'children': [child.to_json() for child in self.children],
            # Sorted so the dump is deterministic (set order is arbitrary).
            'professors': sorted(self.faculty),
            'courses': sorted(self.courses),
            'upvotes': self.upvotes,
            'searchContent': ' '.join(search_parts)
        }

In [14]:
# Identifier of the network whose export is processed; it selects the
# exports/<network_id>.json file opened with TinyDB further down.
# (Presumably a Piazza network id - TODO confirm.)
network_id = "hyq0br1u3kx7dg"

In [15]:
# Open the TinyDB database stored at exports/<network_id>.json.
db = TinyDB(f'exports/{network_id}.json')

In [16]:
# Wrap every stored row in a Post, showing progress with tqdm.
entries = [Post(row) for row in tqdm(db.all())]

100%|██████████| 5504/5504 [00:15<00:00, 345.31it/s]


In [44]:
# Serialize every parsed post and write the result to dump.json.
export = [entry.to_json() for entry in entries]
# `with` guarantees the file is flushed and closed once the dump completes.
with open('dump.json', 'w') as dump_file:
    json.dump(export, dump_file)