### Little notebook for poking around at the data, getting a handle on how to clean things up, etc.

In [1]:
import os
import pickle
import random
import re
from itertools import groupby

import mwparserfromhell
from nltk import sent_tokenize

In [2]:
# Directory holding the NPOV edit dump. Set the WIKI_NPOV_DIR environment
# variable to point somewhere else; the default is the original location.
DATA_DIR = os.environ.get('WIKI_NPOV_DIR', '/Users/rpryzant/Desktop/Wiki_NPOV')

# TSV keyed by revision id (read by load_comments) and the pickled revision text.
keyfile_tsv_path = os.path.join(DATA_DIR, 'en_npov_edits_2008.tsv')
revision_pkl_path = os.path.join(DATA_DIR, 'en_npov_edits_2008.revision_text.pkl')

In [3]:
def load_comments(tsv_path):
    """Read the revision key file into a dict keyed by its first column.

    Each non-blank line is split on tabs; column 0 becomes the key and
    columns 1-5 are stored under 'rev_comment', 'rev_user', 'rev_user_text',
    'rev_timestamp' and 'rev_minor_edit'.

    Args:
        tsv_path: path to the tab-separated key file.

    Returns:
        dict mapping the first column (as a string) to a dict of the five
        metadata fields above. Later rows overwrite earlier rows with the
        same key.

    Raises:
        IndexError: if a non-blank line has fewer than six columns.
    """
    out = {}
    # Context manager so the handle is closed even if a row is malformed.
    with open(tsv_path) as tsv_file:
        for line in tsv_file:
            # Only strip the line terminator: a plain strip() would also eat
            # trailing tabs and silently drop empty trailing fields.
            line = line.rstrip('\r\n')
            if not line:
                continue
            parts = line.split('\t')
            out[parts[0]] = {
                'rev_comment': parts[1],
                'rev_user': parts[2],
                'rev_user_text': parts[3],
                'rev_timestamp': parts[4],
                'rev_minor_edit': parts[5]
            }
    return out

# Revision metadata keyed by revision id.
comments = load_comments(keyfile_tsv_path)

# NOTE: pickle.load can execute arbitrary code -- only use it on files you
# produced yourself. The context manager makes sure the handle is closed.
with open(revision_pkl_path, 'rb') as revision_file:
    revisions = pickle.load(revision_file)

In [38]:
def prep_tokenized_wikitext(token_list):
    """Turn a list of wikitext tokens into roughly plain text.

    The tokens are joined with spaces, the spaces that tokenization put
    inside tag delimiters are removed, and the result is run through
    mwparserfromhell's strip_code(). Link and bracket markup that survives
    that pass is then cleaned up with regexes.

    Args:
        token_list: sequence of string tokens. A literal 0 entry marks a
            revision with multiple edits (per the original comment); such
            input is rejected.

    Returns:
        The cleaned text as a str, or None if token_list contains 0.
    """
    if 0 in token_list:
        # multiple edits
        return None

    wikitext = ' '.join(token_list)
    # Fix tags: undo the spaces tokenization inserted around '<', '</', '>'
    # and '/>'. The order of the replacements is kept from the original.
    for spaced, tight in (('< ', '<'), ('</ ', '</'), (' >', '>'), (' />', '/>')):
        wikitext = wikitext.replace(spaced, tight)

    plaintext = mwparserfromhell.parse(wikitext).strip_code()

    # Fix pre-tokenization errors: replace every leftover piped link
    # [[target|label]] with its own label. The character classes keep a
    # match inside a single link, and \1 substitutes each link's own label
    # (using the text after the last pipe) wherever the link occurs.
    plaintext = re.sub(r'\[\[[^\[\]]*\|([^\[\]|]*)\]\]', r'\1', plaintext)

    # Otherwise get rid of the link brackets (links with no label).
    plaintext = plaintext.replace('[[', '')
    plaintext = plaintext.replace(']]', '')

    # rm [urls] and urls
    plaintext = re.sub(r'\[.*?\]', '', plaintext)
    # TODO -- tokenized urls
    return plaintext


def diff(prev_str, next_str):
    """Return the whitespace-separated tokens found in exactly one of the two strings.

    Both strings are split on whitespace and treated as sets, so token
    order and repetition are ignored.
    """
    return set(prev_str.split()) ^ set(next_str.split())


def get_sents(prev_edit_str, next_edit_str):
    """Yield aligned sentence pairs that differ between two versions of a text.

    Both texts are split with sent_tokenize. Nothing is yielded unless the
    two versions have the same number of sentences. For every position whose
    sentences differ (according to diff), yields a tuple
    (prev_sentence, next_sentence, context), where context is the previous
    and following sentences of the *old* text joined by ' || ' (an empty
    string stands in for a missing neighbour).
    """
    before = sent_tokenize(prev_edit_str)
    after = sent_tokenize(next_edit_str)

    if len(before) == len(after):
        last_idx = len(before) - 1
        for idx in range(len(before)):
            old_sent, new_sent = before[idx], after[idx]
            if not diff(old_sent, new_sent):
                # identical token sets -- sentence unchanged
                continue
            left_ctx = before[idx - 1] if idx > 0 else ''
            right_ctx = before[idx + 1] if idx < last_idx else ''
            yield old_sent, new_sent, left_ctx + ' || ' + right_ctx


def examples_from_revision(rev_id):
    """Generate before/after sentence examples for one revision.

    Prints rev_id, looks up its metadata in the module-level `comments` and
    its (prev_tokens, next_tokens) pair in `revisions`, cleans both token
    lists with prep_tokenized_wikitext and pairs up changed sentences with
    get_sents.

    Yields:
        Either a single status string, after which the generator stops:
          - 'ERROR: id mismatch' if rev_id is missing from `revisions`
          - 'MULTIPLE EDITS'     if either side could not be cleaned (None)
          - 'LACKING TEXT'       if either side cleaned down to ''
        or, for every changed sentence, a tuple
        (rev_id, rev_comment, prev_sentence, next_sentence, context).

    Raises:
        KeyError: if rev_id is not in `comments`.
    """
    print(rev_id)
    metadata = comments[rev_id]

    # Each status yield is followed by a return: otherwise a caller that
    # keeps iterating would fall through into code that cannot succeed.
    if rev_id not in revisions:
        yield 'ERROR: id mismatch'
        return

    prevs, nexts = revisions[rev_id]

    prev_text = prep_tokenized_wikitext(prevs)
    next_text = prep_tokenized_wikitext(nexts)

    if prev_text is None or next_text is None:
        yield 'MULTIPLE EDITS'
        return

    if not prev_text or not next_text:
        yield 'LACKING TEXT'
        return

    for prev_sent, next_sent, context in get_sents(prev_text, next_text):
        yield (
            rev_id,
            metadata['rev_comment'],
            prev_sent,
            next_sent,
            context
        )
        
def sample_revision(rev_id):
    """Print the first example (or status string) produced for a revision.

    Takes only the first item from examples_from_revision(rev_id). Prints a
    separator line, then either the status string or the formatted example.
    'NO EXAMPLES' is printed when the generator yields nothing; any other
    failure is printed as 'ERROR: <ExceptionType>: <message>' instead of
    being silently relabelled.
    """
    try:
        # The default value covers an exhausted generator (no changed sentences).
        ex = next(examples_from_revision(rev_id), 'NO EXAMPLES')
    except Exception as err:
        # Deliberately not a bare except: KeyboardInterrupt / SystemExit must
        # still be able to stop a long sampling loop.
        ex = 'ERROR: %s: %s' % (type(err).__name__, err)
    print('=' * 80)
    if isinstance(ex, str):
        print(ex)
    else:
        print('EDIT: %s \n COMMENT: %s \n\t%s\n\t%s\n\t%s' % ex)
    
#sample_revision('259892922')

In [39]:
# Eyeball 100 randomly chosen revisions (drawn with replacement).
# The id list is built once instead of on every iteration.
rev_ids = list(comments)
for _ in range(100):
    sample_revision(random.choice(rev_ids))

230733470
MULTIPLE EDITS
260203068
MULTIPLE EDITS
221579447
EDIT: 221579447 
 COMMENT: pov 
	These Pokmon can be  illegitimately  obtained by using a  GameShark  or similar cheating device like Action Replay .
	These Pokmon can be obtained by using a  GameShark  or similar cheating device like Action Replay .
	They are obtained by using the Mystery Gift function ; in some cases , exploiting a glitch in the game itself ; or directly from Nintendo at promotional events . || Players are not required to own these Pokmon , among others , in order to have a complete  Pokdex .
225636756
MULTIPLE EDITS
227336794
LACKING TEXT
226993854
EDIT: 226993854 
 COMMENT: /* Modern excavation */ I think "non-independent" here is redundant and pov in itself. just relaying the teams findings will let the user make up his/her own mind. 
	Between 1939 and 1949 the  non - independent  Vatican - led archaeological team had uncovered a complex of undoubtedly  pagan  mausoleums under the foundations of  St .
	Be

In [228]:
# Spot-check one specific revision by its id.
sample_revision('226054404')

EDIT: 226054404  COMMENT: On second thought: "Northern Epirus" is not a neutral geographical term, but I guess as a marker of the minority it does make sense here
	 The population  is indigenous ethnic  Greek   http :// mondediplo .
	 It  is  home to an  indigenous ethnic  Greek   population , part of the  Northern Epirote  Greek minority   http :// mondediplo .
	 It is located less than 1 km north - west of the  Greek  border . || com / maps / albanianmdv1999 The Albanians , a scattered people by Philippe Rekacewicz , Le Monde diplomatique , January 1999  http :// www .



In [261]:
# Count how many real examples (tuples, not status strings) the whole key
# file produces. The first key is skipped -- presumably the TSV header row;
# TODO confirm.
i = 0
for rev_id in list(comments)[1:]:
    for ex in examples_from_revision(rev_id):
        i += 0 if isinstance(ex, str) else 1
print(i)

237443965
237443965
243920179
243920179


TypeError: expected string or bytes-like object