In [190]:
import pickle
from itertools import groupby
import random
import mwparserfromhell
import re
from nltk import sent_tokenize

In [101]:
# Input files: a tab-separated file of revision metadata (read by load_comments)
# and a pickle of per-revision text (loaded into `revisions`).
# NOTE(review): these are absolute, machine-specific paths -- adjust them before
# running the notebook anywhere else.
keyfile_tsv_path = '/Users/rpryzant/Desktop/Wiki_NPOV/en_npov_edits_2010.tsv'
revision_pkl_path = '/Users/rpryzant/Desktop/Wiki_NPOV/en_npov_edits_2010.revision_text.pkl'


In [102]:
def load_comments(tsv_path):
    """Load revision metadata from a tab-separated file.

    Each non-blank line must hold at least six tab-separated columns; the
    first column is used as the key and the next five are stored under
    'rev_comment', 'rev_user', 'rev_user_text', 'rev_timestamp' and
    'rev_minor_edit'.

    Args:
        tsv_path: path of the TSV file to read.

    Returns:
        dict mapping the first column of each line to a dict of the five
        metadata fields (all values are strings).

    Raises:
        IndexError: if a non-blank line has fewer than six columns.
    """
    out = {}
    # `with` closes the handle even if a malformed line raises.
    with open(tsv_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            # Strip only the line terminator: a bare strip() would also eat
            # trailing tabs and so drop empty trailing columns.
            parts = line.rstrip('\r\n').split('\t')
            out[parts[0]] = {
                'rev_comment': parts[1],
                'rev_user': parts[2],
                'rev_user_text': parts[3],
                'rev_timestamp': parts[4],
                'rev_minor_edit': parts[5]
            }
    return out

# Revision metadata keyed by revision id.
comments = load_comments(keyfile_tsv_path)

# Revision text keyed by revision id.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(revision_pkl_path, 'rb') as revision_file:
    revisions = pickle.load(revision_file)

In [199]:
def prep_tokenized_wikitext(token_list):
    """Convert a list of wikitext tokens into approximate plain text.

    The tokens are joined with spaces, the spacing inside tags is repaired,
    the markup is stripped with mwparserfromhell, and any link markup that
    survives is cleaned up with regular expressions.

    Args:
        token_list: sequence of wikitext token strings. A ``0`` entry is
            treated as the marker for a revision with multiple edits.

    Returns:
        The cleaned text, or ``None`` if ``token_list`` contains ``0``.
    """
    if 0 in token_list:
        # multiple edits
        return None

    x = ' '.join(token_list)
    # fix tags: tokenization left spaces inside the angle brackets
    x = x.replace('< ', '<')
    x = x.replace('</ ', '</')
    x = x.replace(' >', '>')
    x = x.replace(' />', '/>')

    parse = mwparserfromhell.parse(x)
    plaintext = parse.strip_code()

    # fix pre-tokenization errors
    # Replace every piped link [[target|label]] with its own label. The
    # pattern cannot cross bracket characters, so each link is handled
    # separately wherever it occurs, and the backreference keeps the label
    # literal (it is never interpreted as a replacement template).
    plaintext = re.sub(r'\[\[[^\[\]]*\|([^\[\]|]*)\]\]', r'\1', plaintext)

    # Otherwise get rid of the link brackets (no label)
    plaintext = plaintext.replace('[[', '')
    plaintext = plaintext.replace(']]', '')

    # remove urls (anything still inside single brackets)
    plaintext = re.sub(r'\[.*?\]', '', plaintext)

    return plaintext


def diff(prev_str, next_str):
    """Return the whitespace-separated tokens found in exactly one of the strings.

    Both strings are split on whitespace and compared as sets, so word order
    and repeated words do not affect the result.
    """
    before_tokens, after_tokens = (
        set(text.split()) for text in (prev_str, next_str)
    )
    return before_tokens ^ after_tokens


def get_sents(prev_edit_str, next_edit_str):
    """Yield (previous, next) sentence pairs that differ between two texts.

    Both texts are split with ``sent_tokenize``. If the sentence counts
    differ, nothing is yielded. Otherwise the sentences are paired up by
    position and a pair is yielded when ``diff`` reports any token that is
    not shared by both sentences.
    """
    before = sent_tokenize(prev_edit_str)
    after = sent_tokenize(next_edit_str)

    if len(before) == len(after):
        for sentence_pair in zip(before, after):
            # a non-empty symmetric difference means the sentence changed
            if diff(*sentence_pair):
                yield sentence_pair


def sample_revision(rev_id):
    """Print the changed sentence pairs of one revision.

    Looks up the metadata in ``comments`` and the before/after token lists
    in ``revisions``, converts both sides to plain text, and prints each
    sentence pair returned by ``get_sents``. Prints a one-line notice and
    returns early when the id is missing from ``revisions``, when either
    side is flagged as multiple edits, or when either side has no text.

    Args:
        rev_id: revision id; must be a key of ``comments``.
    """
    metadata = comments[rev_id]

    if rev_id not in revisions:
        print('ERROR: id mismatch')
        return

    before_tokens, after_tokens = revisions[rev_id]
    texts = (
        prep_tokenized_wikitext(before_tokens),
        prep_tokenized_wikitext(after_tokens),
    )

    # None marks a side that prep_tokenized_wikitext rejected
    if any(text is None for text in texts):
        print('MULTIPLE EDITS')
        return

    if not all(texts):
        print('LACKING TEXT')
        return

    print('=' * 80)
    print('EDIT: %s  COMMENT: %s' % (rev_id, metadata['rev_comment']))
    for changed_pair in get_sents(*texts):
        for sentence in changed_pair:
            print('\t', sentence)
        print()

    
#sample_revision('259892922')

In [200]:
# Print 100 randomly chosen revisions. Each draw is independent, so the same
# revision can appear more than once.
revision_ids = list(comments)  # build the id list once, not on every iteration
for _ in range(100):
    sample_revision(random.choice(revision_ids))

MULTIPLE EDITS
MULTIPLE EDITS
MULTIPLE EDITS
MULTIPLE EDITS
MULTIPLE EDITS
EDIT: 371426825  COMMENT: rv. pov wording
	 These highly - publicized actions resulted in statewide acclaim  among liberal democrats  , and led to his election as governor in the next statewide election .
	 These highly - publicized actions resulted in statewide acclaim , and led to his election as governor in the next statewide election .

MULTIPLE EDITS
MULTIPLE EDITS
LACKING TEXT
EDIT: 344721889  COMMENT: /* Unnamed characters played by a notable actor/actress */ redundant - "actor" is gender neutral.
	  Unnamed characters played by a notable actor  / actress
	  Unnamed characters played by a notable actor

EDIT: 393577373  COMMENT: rv extreme pov pushing
	 Male  genital mutilation , or as it is more commonly refereed to , male  circumcision  ,  involves the removal of the  foreskin .
	 Male circumcision involves the removal of the  foreskin .

EDIT: 360678447  COMMENT: making neutral
	 Unfortunately  Nickelo

In [176]:
# Spot-check a single revision by id.
# NOTE(review): the saved output of this cell contains a line ("13") that the
# sample_revision defined above does not print, and this cell's execution
# count is lower than the definition's -- re-run to refresh the output.
sample_revision('341673195')

EDIT: 341673195  COMMENT: Made more neutral
13
	  Alphascript  publishing   is a trademark of VDM Publishing House and is used to sell Wikipedia articles as  expensive  books  . Its mercantil philosophy is clearly exposed on Alphascript ' s site  : " with the Wikipedia - texts at free disposal we create books on interesting topics ."   
	  Alphascript  Publishing   is a trademark of VDM Publishing House and is used to sell Wikipedia articles as  commercial print  books : " with the Wikipedia - texts at free disposal we create books on interesting topics ."
