## Imports

In [1]:
import mwapi
import mwparserfromhell as parser
import re
import requests
import urllib.parse
import json

## Helper functions and constants

In [46]:
# == urls ==
wikidata_api_url = "https://www.wikidata.org/w/api.php"
wikidata_rest_url = "https://www.wikidata.org/w/rest.php"
wdqs_url = "query.wikidata.org/"
link_shortener_url = "w.wiki/"

# == regexes ==
# pattern to identify the end of a comment
# assumption is that most comments will end one of four ways, and that those ends are likely to be unique:
# - "[[User:xxxx|yyyy]]<other text>hh:mm, DD month YYYY (UTC)"
# - "[[User talk:xxxx|yyyy]]<other text>hh:mm, DD month YYYY (UTC)"
# - "{{Unsigned}}"
# - "{{Unsigned|other text}}"
#
# Other patterns may be added as time goes on
comment_end_regex = re.compile(
    # Get strings that contain [[User:xxxx]] or [[User talk:xxxx]] + WP-formatted datetime string
    '\[\[User(?: talk)?:([^|\]]+)\|.+?\]\]\s*.*?(\d{2}:\d{2}i?, \d{1,2} \w+ \d{4} \(UTC\))\s*'
    #    ^^^^^^^^^^^^    ^^^^^^^^^^^^^    ^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    #   [User( talk):]   [User string][other text] [datetime like hh:mm, DD month YYYY (UTC)] OR

    # Get unsigned comment templates
    '|(\{\{Unsigned[^\}]*\}\})\s*',
    # ^^^^^^^^^^^^ ^^^^^
    # OR [template][other text]
    re.IGNORECASE
)

# patterns to identify QIDs/PIDs that appear in a comment
# these appear in one of three ways:
# - "P/Q####"
# - "{{P/Q|####}}" (template format 1)
# - "{{P/Q|P/Q####}}" (template format 2)
#
# in the case of the templates, we want to extract the entire template and the number
# in the case of the raw QIDs/PIDs, we want to extract the whole ID
qid_regex = re.compile('(\{\{Q\|Q?(\d+)\}\})|(Q\d+)', re.IGNORECASE)
pid_regex = re.compile('(\{\{P\|P?(\d+)\}\})|(P\d+)', re.IGNORECASE)

# pattern to identify the number of leading colons or bullet points
# assumption is that this indicates the *depth* of a comment in a conversation
depth_regex = re.compile('^:+|^\*+')

# pattern to identify sparql codeblocks added in the parsing process
# codeblocks are of the form ```sparql<query>```
code_block_regex = re.compile('```sparql.*?```', re.DOTALL)

# regex for parsing queries
# for splitting apart sparql templates
query_regex = re.compile('query\s*=\s*')

# request a query archive page title regex
# pulls out year and month
title_regex = re.compile('Wikidata:Request a query/Archive/(\d+)/(\d+)')

# regex for extracting the text from wikitext-formatted titles
heading_regex = re.compile('=+\s*([^=]+)\s*=+')

# optimization regexes (based on https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization)
inverse_property_path_regex = re.compile('^(\?\w+)\s+wdt:P31\/wdt:P279\*\s+([^\s]+)\s*.$', re.IGNORECASE)
count_regex = re.compile('COUNT\(\?\w+\)', re.IGNORECASE)

# mwapi regex
mwapi_regex = re.compile('SERVICE\s+wikibase:mwapi\s+\{\s+[^\}]+\}', re.IGNORECASE | re.DOTALL)

# == string patterns ==
# patterns to identify strings of the form "{{!}}" or "{{!!}}", still sometimes used in wikitext
bang_1_pattern = '{{!}}'
bang_2_pattern = '{{!!}}'

# patterns for skipping links
link_skip = [
    f'[https://{wdqs_url} ',
    f'[http://{wdqs_url} ',
    f' https://{wdqs_url}]',
    f' http://{wdqs_url}]',
    'bigdata/ldf'
]

In [47]:
# get all the pages that have a certain prefix in a given namespace
def get_pages_with_prefix(prefix, namespace=None):
    # set params
    params = {
        "action": "query",
        "format": "json",
        "list": "allpages",
        "apnamespace": namespace,
        "apprefix": prefix,
        "aplimit": "max"
    }
    
    # try to get pages with that prefix
    try:
        response = requests.get(wikidata_api_url, params=params)
        response.raise_for_status()  # Check for HTTP errors
        data = response.json()
    except Exception as e:
        print("Error:", e)
        print("Response content:", response.text)
        return []

    # if data is correctly formatted, return
    if 'query' in data and 'allpages' in data['query']:
        pages = data['query']['allpages']
        return pages
    else:
        print("Unexpected response format:", data)
        return []

# helper function for parsing external wikilinks
# external wikilinks are usually formatted like: [https://example.com some extra text]
def parse_external_wikilinks(link):
    # split on first space
    split = link.split(' ', 1)
    
    # if split successful (there's extra text)
    if len(split) == 2:
        # drop leading '[' and trailing ']' and set url and text
        url = split[0][1:]
        text = split[1][:-1]
    
    # otherwise no extra text
    else:
        # drop brackets and set text to ''
        url = split[0][1:-1]
        text = ''

    # if this is a wdqs url, get the portion after the '#' or the url param 'query='
    # also make sure to html decode it using urllib.parse.unquote()
    if wdqs_url in url:
        try:
            query = urllib.parse.unquote(url.split(wdqs_url)[1].split('#', 1)[1])
        except:
            query = urllib.parse.unquote(url.split(wdqs_url)[1].split('query=', 1)[1])
        
    # if this is a link shortener url, get the unshortened link and try to parse it as a wdqs url
    elif link_shortener_url in url:
        query = ''
        response = requests.get(url, allow_redirects=True)
        if wdqs_url in response.url:
            query = urllib.parse.unquote(response.url.split(wdqs_url)[1].split('#', 1)[1])

    return text, query

# helper functions for labeling QIDs with english labels
def get_qid_titles(qid_set):
    qids = {}
    for qid in qid_set:
        response = requests.get(f'{wikidata_rest_url}/wikibase/v0/entities/items/{qid}?_fields=labels')
        data = response.json()
        try:
            qids[qid] = data['labels']['en']
        except:
            qids[qid] = ''
    return qids

# helper functions for labeling PIDs with english labels
def get_pid_titles(pid_set):
    pids = {}
    for pid in pid_set:
        response = requests.get(f'{wikidata_rest_url}/wikibase/v0/entities/properties/{pid}?_fields=labels')
        data = response.json()
        try:
            pids[pid] = data['labels']['en']
        except:
            pids[pid] = ''
    return pids

# get the split points of a conversation based on comment end regex
def get_split_points(convo):
    # find all matches to the comment end regex
    match = re.findall(comment_end_regex, convo['full_text'])

    split_points = []
    for m in match:
        # case signed edit
        if m[0] != '':
            split_points.append(convo['full_text'].find(m[1])+len(m[1]))
        # case unsigned edit
        else:
            split_points.append(convo['full_text'].find(m[2])+len(m[2]))
    return split_points

# find and cleanly format all sparql queries in a comment
# then replace the templates/wikilinks with those cleanly formatted queries
def get_sparql(comment):
    queries = []
    
    # parse comment and get templates + links
    wikitext = parser.parse(comment)
    templates = wikitext.filter_templates()
    links = wikitext.filter_external_links()
    
    # for all templates
    for t in templates:
        # if it's a sparql template with the 'query' parameter
        if 'sparql' in t.name.lower() and t.has_param('query'):
            
            # get the query, replace bang patterns, and append it to the query list
            query = re.split(query_regex, str(t['query']))[1]
            query = query.replace(bang_1_pattern, "|").replace(bang_2_pattern, "||")
            queries.append(query)
            
            # format into an md codeblock and replace template with codeblock
            clean_sparql = f"\n```sparql\n{query}\n```\n"
            comment = comment.replace(str(t), clean_sparql)

    # for all links
    for l in links:
        # if it's a wdqs or link shortened link
        if wdqs_url in l or link_shortener_url in l:
            
            # skip if it isn't wikilink-formatted
            if l[0] != '[' or l[-1] != ']':
                continue
            
            # skip if it matches any link skip string
            if any(skip_str in l for skip_str in link_skip):
                continue
                        
            # otherwise try to:
            try:
                # parse the link + text, replace bang patterns, and append to query list
                text, query = parse_external_wikilinks(l)
                query = query.replace(bang_1_pattern, "|").replace(bang_2_pattern, "||")
                queries.append(query)
                
                # format it into an md codeblock and replace template with codeblock
                clean_sparql = f"{text}\n```sparql\n{query}\n```\n"
                comment = comment.replace(str(l), clean_sparql)
            except:
                print(comment, l)
                continue
    
    # return the newly-formatted comment + the list of queries
    return comment, queries

def try_to_optimize_query(query):
    # inverse property path
    matches = re.findall(inverse_property_path_regex, query)
    if len(matches) > 0:
        for m in matches:
            subst = f'{m[1]} ^wdt:P279*/^wdt:P31 {m[0]} .'
            query = re.sub(inverse_property_path_regex, subst, query, count=1)

    # count
    matches = re.findall(count_regex, query)
    if len(matches) > 0:
        for m in matches:
            subst = 'COUNT(*)'
            query = re.sub(count_regex, subst, query, count=1)
    
    return query

## Data structure-related things
This section is for the creation of conversation tree structure based on linked comments

In [63]:
# class for an individual comment
# note: title for a discussion (enclosed in '='s) is considered to be the root comment
# only root comments get the argument last_sparql_query, which is a heuristic for the "answer" query
class Comment:
    def __init__(
        self,
        comment_id,
        text,
        depth,
        parent_id=None,
        preceding_id=None,
        sparql_queries=None,
        sparql_queries_clean=None,
        qids=None,
        pids=None,
        last_sparql_query=None,
        last_sparql_query_clean=None,
        last_sparql_query_optimized=None,
        url=None
    ):
        self.comment_id = comment_id
        self.text = text
        self.depth = depth
        self.parent_id = parent_id
        self.preceding_id = preceding_id
        self.sparql_queries = sparql_queries
        self.sparql_queries_clean = sparql_queries_clean
        self.qids = qids
        self.pids = pids
        self.children = []
        self.last_sparql_query = last_sparql_query
        self.last_sparql_query_clean = last_sparql_query_clean
        self.last_sparql_query_optimized = last_sparql_query_optimized
        self.url=url

    def add_child(self, child):
        self.children.append(child)
    
    def __str__(self):
        return f"""
        Comment ID: {self.comment_id}
        Parent ID: {self.parent_id}
        Preceding ID: {self.preceding_id}
        Depth: {self.depth}
        Text: {self.text}
        SparQL queries: {self.sparql_queries}
        QIDs: {self.qids}
        PIDS: {self.pids}
        URL: {self.url}
        """

# this function takes in a piece of unstructured text, does a bunch of
#  processing on it, and returns it as a new comment
def create_new_comment(comment_id, url, text, pid_meta_set, root=False):
    # get clean text + all sparql queries from this comment
    comment, sparql_queries = get_sparql(text)

    # create sets to get unique PIDs and QIDs
    qid_set = set()
    pid_set = set()

    # find all QIDs/PIDs from this comment using the associated regexes
    raw_qids = re.findall(qid_regex, comment)
    raw_pids = re.findall(pid_regex, comment)

    # Find code blocks and replace their content with placeholders to exclude them from replacement
    code_blocks = re.findall(code_block_regex, comment)
    code_block_placeholders = []
    for block in code_blocks:
        placeholder = f'__CODE_BLOCK_{len(code_block_placeholders)}__'
        comment = comment.replace(block, placeholder)
        code_block_placeholders.append(placeholder)

    # for each of the QIDs found
    for rq in raw_qids:
        # if it's a template add it to the set
        if rq[0] != '':
            qid_set.add(f'Q{rq[1]}')
        # if it's not a template add it to the set
        elif rq[2] != '':
            qid_set.add(rq[2].upper())

    # using the set, get all the QID titles
    qids = get_qid_titles(qid_set)

    sparql_queries_clean = sparql_queries

    # for each template/QID that appears in the text, replace it with `<title> (<qid>)`
    for rq in raw_qids:
        if rq[0] != '':
            qid = f"Q{rq[1]}"
            comment = comment.replace(rq[0], f"`{qids[qid]} ({qid})`")
        elif rq[2] != '':
            qid = rq[2].upper()
            comment = comment.replace(qid, f"`{qids[qid]} ({qid})`")
        
        # create cleaned sparql queries by replacing qid with [title]
        sparql_queries_clean = [q.replace(qid, f'[{qids[qid]}]') for q in sparql_queries_clean]

    # do the exact same thing for pids
    for rp in raw_pids:
        if rp[0] != '':
            pid_set.add(f'P{rp[1]}')
        elif rp[2] != '':
            pid_set.add(rp[2].upper())
    pid_meta_set.update(pid_set)
    pids = get_pid_titles(pid_set)
    for rp in raw_pids:
        if rp[0] != '':
            pid = f"P{rp[1]}"
            comment = comment.replace(rp[0], f"`{pids[pid]} ({pid})`")
        elif rp[2] != '':
            pid = rp[2].upper()
            comment = comment.replace(pid, f"`{pids[pid]} ({pid})`")

        # create cleaned sparql queries by replacing pid with [title]
        sparql_queries_clean = [q.replace(pid, f'[{pids[pid]}]') for q in sparql_queries_clean]
            
    # Restore code blocks
    for placeholder, block in zip(code_block_placeholders, code_blocks):
        comment = comment.replace(placeholder, block)

    # get depth (based on the number of leading colons or stars)
    # initial depth is 1 (bc root comment has a depth of 0)
    depth = 1
    depth_match = re.match(depth_regex, text)
    if depth_match:
        depth += len(depth_match.group(0))
        
    # if this is the root, set the depth to 0
    if root:
        depth = 0

    # return a new comment, filling in all the details we've gotten so far
    return Comment(
        comment_id=comment_id, 
        text=comment, 
        depth=depth,
        sparql_queries=sparql_queries,
        sparql_queries_clean=sparql_queries_clean,
        qids=qids,
        pids=pids,
        url=url
    )
    
# given a conversation and a set of split points, this function builds a linked comment tree
# it also updates the meta-set of PIDs (to calculate PID coverage)
def build_comment_tree(convo, split_points, url, pid_meta_set):
    # Create a root comment with default depth 0
    root = create_new_comment(
        comment_id=0,
        url=url,
        text=convo['heading'],
        pid_meta_set=pid_meta_set,
        root=True
    )

    comment_id = 1  # start with ID 1 for the first comment
    current_comment = root  # start with the root comment
    parent_stack = [root] # initialize stack to have the root node
    last_sparql_query = None # initialize last_sparql_query
    last_sparql_query_clean = None
    
    i = 0 # initial string start point is 0
    for j in split_points:
        text = convo['full_text'][i:j].strip() # extract text and user from split point
        # create new comment from that text
        new_comment = create_new_comment(
            comment_id=comment_id,
            url=url,
            text=text,
            pid_meta_set=pid_meta_set
        ) 
        
        # if the comment has sparql queries, update last_sparql_query
        if len(new_comment.sparql_queries) > 0:
            last_sparql_query = new_comment.sparql_queries[-1]
            last_sparql_query_clean = new_comment.sparql_queries_clean[-1]

        # if new comment depth > current comment depth, it's a child
        if new_comment.depth > current_comment.depth:
            # set new parent_id to current comment_id
            new_comment.parent_id = current_comment.comment_id
            # add it as a child
            current_comment.add_child(new_comment)
            # no preceding comment on this level
            new_comment.preceding_id = None
            # push current comment onto the stack as a parent
            parent_stack.append(current_comment)
            
        # elif new comment depth = current comment depth, it's a following comment
        elif new_comment.depth == current_comment.depth:
            
            # set new parent_id to be the same as the end of the parent stack
            new_comment.parent_id = parent_stack[-1].comment_id
            # add new comment as child to end of parent stack
            parent_stack[-1].add_child(new_comment)
            # current comment is the preceding comment on this level
            new_comment.preceding_id = current_comment.comment_id
            # no changes to parent stack
        
        # else new comment depth < current comment depth, continuation of a higher-level convo
        else:
            # pop off of the stack until the new comment depth > parent stack depth
            while new_comment.depth <= parent_stack[-1].depth:
                _ = parent_stack.pop()
                
            # new comment parent is the last comment on the stack
            new_comment.parent_id = parent_stack[-1].comment_id
            # new comment preceding is the last child of the parent 
            new_comment.preceding_id = parent_stack[-1].children[-1].comment_id
            # add new comment as a child of the parent stack
            parent_stack[-1].add_child(new_comment)
            
        # preceding_comment = new_comment # update preceding comment for next iteration
        current_comment = new_comment # move to new comment for next iteration
        comment_id += 1 # increment the comment ID for the next comment
        i = j # update split point for next comment

    if last_sparql_query is not None:
        last_sparql_query = re.sub(mwapi_regex, '', last_sparql_query) # replace mwapi calls in last_sparql_query
        root.last_sparql_query = last_sparql_query
        root.last_sparql_query_clean = re.sub(mwapi_regex, '', last_sparql_query_clean) # replace mwapi calls in last_sparql_query_clean
        last_sparql_query_optimized = try_to_optimize_query(last_sparql_query)
        if last_sparql_query_optimized != last_sparql_query:
            root.last_sparql_query_optimized = last_sparql_query_optimized
    return root

# Function to print the comment tree recursively
def print_comment_tree(comment, depth=0):
    print("  " * depth + str(comment))  # Print the current comment
    for child in comment.children:
        print_comment_tree(child, depth + 1)  # Print children recursively

# Function to recursively serialize the comment tree to JSON
def serialize_comment_tree(comment):

    def serialize_comment(comment):
        serialized_comment = {
            "comment_id": comment.comment_id,
            "parent_id": comment.parent_id,
            "preceding_id": comment.preceding_id,
            "text": comment.text,
            "depth": comment.depth,
            "sparql_queries": comment.sparql_queries,
            "sparql_queries_clean": comment.sparql_queries_clean,
            "qids": comment.qids,
            "pids": comment.pids,
            "url": comment.url,
            "last_sparql_query": comment.last_sparql_query,
            "last_sparql_query_clean": comment.last_sparql_query_clean,
            "last_sparql_query_optimized": comment.last_sparql_query_optimized,
            "children": [serialize_comment(child) for child in comment.children]
        }
        return serialized_comment
    
    return serialize_comment(comment)

## Parse a list of conversations from a request a query archive page
Archive of all human query requests made during the ongoing "Request a query" initiative.

Found at https://wikidata.org/wiki/Wikidata:Request_a_query/Archive

In [64]:
# given a page title, parse all of the conversations on that page
def parse_roq_convo(session, title, pid_meta_set):    
    # set source url
    source_url = f'https://www.wikidata.org/wiki/{title}'.replace(' ', '_')

    # get year and month strings for easy start/stop
    [year, month] = re.findall(title_regex, title)[0]
    month_str = f'{year}_{month}'
    print(month_str)
    
    # get page content, then parse it and get headings
    resp = session.get(
        formatversion=2,
        action='query',
        prop='revisions',
        rvslots='*',
        rvprop='content',
        titles=title
    )
    content = resp['query']['pages'][0]['revisions'][0]['slots']['main']['content']
    wikitext = parser.parse(content)
    headings = wikitext.filter_headings()

    convos = []

    # for each heading
    for i in range(len(headings)):
        # print(headings)
        # get heading text
        heading_text = re.findall(heading_regex, str(headings[i]))[0].strip()

        # get full text
        if i == len(headings) - 1:
            full_text = wikitext.split(str(headings[i]))[1]
        else:
            full_text = wikitext.split(str(headings[i]))[1].split(str(headings[i+1]))[0]
        
        # append to list of convos
        convos.append({
            'heading': heading_text,
            'full_text': full_text
        })
    
    
    parsed_convos = []
    
    # for each convo
    for convo in convos:
        # get the split points
        split_points = get_split_points(convo)

        # build the comment tree
        comment_tree = build_comment_tree(
            convo=convo,
            split_points=split_points,
            url=source_url,
            pid_meta_set=pid_meta_set
        )
        
        # if the root has no children or there are no sparql queries, skip
        if len(comment_tree.children) == 0 or comment_tree.last_sparql_query is None:
            continue
        
        # otherwise append to list of parsed convos
        parsed_convos.append(comment_tree)
        
    return parsed_convos

In [77]:
# set-up
pid_meta_set = set()
convo_meta_set = []
archive_prefix = "Request a query"

# get all pages in the "Request a query" namespace
archive_pages = get_pages_with_prefix(archive_prefix, namespace=4)

In [78]:
len(convo_meta_set)

0

In [79]:
session = mwapi.Session('https://www.wikidata.org', user_agent='raq')

# for each of those pages
for page in archive_pages:
    # if it's properly formatted
    match = re.findall(title_regex, page['title'])
    if len(match) > 0:
        # parse it and add it to the conversation meta set
        # if f'{match[0][0]}_{match[0][1]}' in ('2017_04', '2017_05', '2017_07', '2019_03', '2020_05', '2020_06', '2020_07', '2020_10', '2020_11'):
        convo_meta_set += parse_roq_convo(session, page['title'], pid_meta_set)

2016_07
2016_08
2016_09
2016_10
2016_11
2016_12
2017_01
2017_02
2017_03
2017_04
2017_05
2017_06
2017_07
2017_08
2017_09
2017_10
2017_11
2017_12
2018_01
2018_02
2018_03
2018_04
2018_05
2018_06
2018_07
2018_08
2018_09
2018_10
2018_11
2018_12
2019_01
2019_02
2019_03
2019_04
2019_05
2019_06
2019_07
2019_08
2019_09
2019_10
2019_11
2019_12
2020_01
2020_02
2020_03
2020_04
2020_05
2020_06
2020_07
2020_08
2020_09
2020_10
2020_11
2020_12
2021_01
2021_02
2021_03
2021_04
2021_05
2021_06
2021_07
:
:The usual suspect been beaten [//w.wiki/3dTX by about 3 times] right ? [[User:Simon Villeneuve|Simon Villeneuve]] ([[User talk:Simon Villeneuve|<span class="signature-talk">{{int:Talkpagelinktext}}</span>]]) 14:32, 13 July 2021 (UTC) [//w.wiki/3dTX by about 3 times]
2021_08
2021_09
2021_10
2021_11
2021_12
2022_01
2022_02
2022_03
2022_04
2022_05
2022_06
2022_07
2022_08
2022_09
2022_10
2022_11
2022_12
I made a query for [https://w.wiki/66fc mountains in iceland] and I'm getting 3500 results. Is it possible

In [86]:
serialized_convos = [serialize_comment_tree(convo) for convo in convo_meta_set]

In [87]:
serialized_convos

[{'comment_id': 0,
  'parent_id': None,
  'preceding_id': None,
  'text': 'Items with specific string in image filename',
  'depth': 0,
  'sparql_queries': [],
  'sparql_queries_clean': [],
  'qids': {},
  'pids': {},
  'url': 'https://www.wikidata.org/wiki/Wikidata:Request_a_query/Archive/2016/07',
  'last_sparql_query': 'SELECT ?item ?itemLabel ?image WHERE {\n  ?item wdt:P31 wd:Q5 .\n  ?item wdt:P18 ?image .\n  values ?item {wd:Q2104}\n  FILTER(CONTAINS(?image, "1958"))\n  SERVICE wikibase:label {\n    bd:serviceParam wikibase:language "en,de,fr".\n  }\n}\nLIMIT 1000',
  'last_sparql_query_clean': 'SELECT ?item ?itemLabel ?image WHERE {\n  ?item wdt:[instance of] wd:[human] .\n  ?item wdt:[image] ?image .\n  values ?item {wd:[Mike Hawthorn]}\n  FILTER(CONTAINS(?image, "1958"))\n  SERVICE wikibase:label {\n    bd:serviceParam wikibase:language "en,de,fr".\n  }\n}\nLIMIT 1000',
  'last_sparql_query_optimized': None,
  'children': [{'comment_id': 1,
    'parent_id': 0,
    'preceding_i

In [88]:
len(serialized_convos)

2780

In [89]:
len(pid_meta_set)

1160

In [90]:
with open('data/conversations.json', "w") as f:
    json.dump(serialized_convos, f, indent=2, ensure_ascii=False)

In [91]:
with open('data/conversation_pids.json', 'w') as f:
    json.dump(list(pid_meta_set), f, indent=2, ensure_ascii=False)

## Check for duplicate headings in other pages

In [92]:
from collections import defaultdict

# given a page title, parse all of the conversations on that page
def check_headings(session, title):
    heading_dict = defaultdict(int)
    # set source url
    source_url = f'https://www.wikidata.org/wiki/{title}'.replace(' ', '_')

    # get year and month strings for easy start/stop
    [year, month] = re.findall(title_regex, title)[0]
    month_str = f'{year}_{month}'
    print(month_str)
    
    # get page content, then parse it and get headings
    resp = session.get(
        formatversion=2,
        action='query',
        prop='revisions',
        rvslots='*',
        rvprop='content',
        titles=title
    )
    content = resp['query']['pages'][0]['revisions'][0]['slots']['main']['content']
    wikitext = parser.parse(content)
    headings = wikitext.filter_headings()
    for h in headings:
        heading_dict[str(h)] += 1
    return heading_dict

In [93]:
meta_heading_dict = {}

# for each of those pages
for page in archive_pages:
    # if it's properly formatted
    match = re.findall(title_regex, page['title'])
    if len(match) > 0:
        meta_heading_dict[f'{match[0][0]}_{match[0][1]}'] = check_headings(session, page['title'])

2016_07
2016_08
2016_09
2016_10
2016_11
2016_12
2017_01
2017_02
2017_03
2017_04
2017_05
2017_06
2017_07
2017_08
2017_09
2017_10
2017_11
2017_12
2018_01
2018_02
2018_03
2018_04
2018_05
2018_06
2018_07
2018_08
2018_09
2018_10
2018_11
2018_12
2019_01
2019_02
2019_03
2019_04
2019_05
2019_06
2019_07
2019_08
2019_09
2019_10
2019_11
2019_12
2020_01
2020_02
2020_03
2020_04
2020_05
2020_06
2020_07
2020_08
2020_09
2020_10
2020_11
2020_12
2021_01
2021_02
2021_03
2021_04
2021_05
2021_06
2021_07
2021_08
2021_09
2021_10
2021_11
2021_12
2022_01
2022_02
2022_03
2022_04
2022_05
2022_06
2022_07
2022_08
2022_09
2022_10
2022_11
2022_12
2023_01
2023_02
2023_03
2023_04
2023_05
2023_06
2023_07
2023_08
2023_09
2023_10
2023_11
2023_12
2024_01
2024_02
2024_03
2024_04
2024_05


In [94]:
for m, headings in meta_heading_dict.items():
    for h, ct in headings.items():
        if ct > 1:
            print(m, h)

### todo:
- [x] fix pop-back behavior
- exclude conversations with:
  - [x] 0 comments
  - [x] 0 sparql queries
- [x] keep track of total number of properties seen
- [x] write to json file
- [x] add source URL for a comment
- [x] replace text-based PIDs and QIDs (and P/Q templates) with `` `<title> (<ID>)` ``
- [x] fix PIDs and QIDs to always be properly capitalized in text and in the pid list
- [x] replace all sparql templates and query urls with markdown-formatted sparql
- [x] run internally on stat machine