In [None]:
import difflib

import nltk
import pandas as pd
# `display` and `HTML` belong to the public `IPython.display` API. Importing
# them from `IPython.core.display` has been deprecated since IPython 7.14 and
# stops working on newer releases.
from IPython.display import HTML, display

# Load the preprint dataset and report how many rows it contains.
dataset = pd.read_csv('data/preprints_full_20190901_20200430.csv')
print(len(dataset))

In [None]:
# Preview the first rows of the loaded dataset.
dataset.head()

In [None]:
def show_diff(text, n_text):
    """Render a character-level diff of two strings as an HTML fragment.

    Based upon http://stackoverflow.com/a/788780

    Args:
        text: the original string.
        n_text: the revised string.
    Returns:
        A tuple ``(ratio, html)``. ``ratio`` is ``1 - SequenceMatcher.ratio()``,
        so identical strings give 0.0. ``html`` is the marked-up diff:
        unchanged text is emitted as-is, deleted text is struck through in
        red, and inserted / replacing text is passed through
        ``find_rearrangements`` with "green" / "blue" as its base colour.
    Raises:
        RuntimeError: if difflib reports an opcode other than equal, insert,
            delete or replace.
    """
    seqm = difflib.SequenceMatcher(None, text, n_text)
    ratio = float(1.0 - seqm.ratio())
    opcodes = seqm.get_opcodes()

    # All text that disappears from `text` (deleted or replaced spans),
    # concatenated. It is collected from the opcodes already computed above
    # instead of diffing the two strings a second time.
    removals = ''.join(
        seqm.a[a0:a1]
        for opcode, a0, a1, b0, b1 in opcodes
        if opcode in ('delete', 'replace')
    )

    # NOTE(review): the input text is inserted into the HTML unescaped, so any
    # '<' or '&' it contains is interpreted as markup by the renderer.
    output = []
    for opcode, a0, a1, b0, b1 in opcodes:
        if opcode == 'equal':
            output.append(seqm.a[a0:a1])
        elif opcode == 'insert':
            output.append(find_rearrangements(removals, seqm.b[b0:b1], "green"))
        elif opcode == 'delete':
            output.append("<strike><font color=red>" + seqm.a[a0:a1] + "</font></strike>")
        elif opcode == 'replace':
            # seqm.a[a0:a1] -> seqm.b[b0:b1]
            check_rearr = find_rearrangements(removals, seqm.b[b0:b1], "blue")
            output.append("<strike><font color=red>" + seqm.a[a0:a1] + "</font></strike> " + check_rearr)
        else:
            # Raise a real exception instance; raising a tuple is itself a
            # TypeError in Python 3.
            raise RuntimeError("unexpected opcode")
    return ratio, ''.join(output)

In [None]:
def get_all_removals(text, n_text):
    """Concatenate every span of ``text`` that the diff drops or replaces.

    Args:
        text: the original string.
        n_text: the revised string.
    Returns:
        A single string made of all 'delete' and 'replace' spans of ``text``,
        in the order difflib reports them, joined without a separator.
    """
    matcher = difflib.SequenceMatcher(None, text, n_text)
    dropped = [
        matcher.a[start:end]
        for tag, start, end, _, _ in matcher.get_opcodes()
        if tag in ('delete', 'replace')
    ]
    return ''.join(dropped)

# Largest n-gram size (in tokens) used when looking for rearranged text.
ngram_up_to = 2


def get_all_ngrams(text, ngram_up_to):
    """Tokenise a text and build its n-grams of 2 up to ``ngram_up_to`` tokens.

    Args:
        text: a string; tokens are obtained by splitting on single spaces.
        ngram_up_to: an integer, the largest n-gram size to generate.
    Returns:
        A tuple ``(ngrams, match_ngrams_tokens, tokens)``:
            ngrams: list of space-joined n-grams, all bigrams first, then
                trigrams, and so on up to ``ngram_up_to``.
            match_ngrams_tokens: dict mapping every token position to the
                list of indices into ``ngrams`` of the n-grams that contain
                that token (an empty list when there are none).
            tokens: the list of tokens.
    """
    tokens = text.split(" ")
    ngrams = []
    match_ngrams_tokens = {position: [] for position in range(len(tokens))}
    for n in range(2, ngram_up_to + 1):
        for start in range(len(tokens) - n + 1):
            ngram_id = len(ngrams)
            ngrams.append(" ".join(tokens[start:start + n]))
            # Register the n-gram with each token it covers, so the map only
            # ever holds valid indices (never -1 for the first token) and
            # stays correct for n-gram sizes above 2.
            for position in range(start, start + n):
                match_ngrams_tokens[position].append(ngram_id)
    return ngrams, match_ngrams_tokens, tokens

def find_rearrangements(removals, snippet, colour):
    """Colour each token of a snippet, flagging text that was merely moved.

    A token is rendered in orange when at least one n-gram containing it also
    occurs as a substring of ``removals``; otherwise it is rendered in
    ``colour``.

    Args:
        removals: string of all text removed from the original document.
        snippet: the inserted / replacing text to mark up.
        colour: HTML colour name used for tokens that are not flagged.
    Returns:
        An HTML string with one ``<font>`` element per token of ``snippet``;
        each token is followed by a single space inside its element.
    """
    ngrams, match_ngrams_tokens, tokens = get_all_ngrams(snippet, ngram_up_to)

    output = []
    for position, token in enumerate(tokens):
        # Only consult ids that really index `ngrams`: a negative id would
        # silently wrap around to the end of the list, and an id past the end
        # has no n-gram. This replaces a blanket `except Exception`.
        moved = any(
            ngrams[ngram_id] in removals
            for ngram_id in match_ngrams_tokens[position]
            if 0 <= ngram_id < len(ngrams)
        )
        token_colour = "orange" if moved else colour
        output.append("<font color=" + token_colour + ">" + token + " </font>")

    return ''.join(output)

In [None]:
def display_changes(doi):
    """Display the diff between a preprint abstract and its published version.

    Looks ``doi`` up in the module-level ``dataset`` and, for every matching
    row, prints the change ratio and renders the HTML diff produced by
    ``show_diff``. The published abstract is read from "published_abstract",
    falling back to "published_pubmed_abstract" when that is not a string.

    Args:
        doi: value to match against the "doi" column of ``dataset``.
    Returns:
        None once the diff(s) have been displayed; otherwise a short message
        string: "DOI not found", "Missing Final Abstract" or
        "Missing Preprint Abstract".
    """
    # Select the matching rows directly instead of scanning the whole frame
    # row by row in Python.
    matches = dataset[dataset["doi"] == doi]
    if matches.empty:
        return "DOI not found"

    for _, row in matches.iterrows():
        abs1 = row["abstract"]
        abs2 = row["published_abstract"]
        if not isinstance(abs2, str):
            abs2 = row["published_pubmed_abstract"]
            if not isinstance(abs2, str):
                return "Missing Final Abstract"
        # A missing abstract is read as a non-string value, which would make
        # the diff crash; report it instead.
        if not isinstance(abs1, str):
            return "Missing Preprint Abstract"

        change_ratio, out = show_diff(abs1, abs2)
        print("Computed change ratio:", change_ratio)
        print()
        display(HTML(out))
                    


In [None]:
# Example: show the abstract changes for a single preprint DOI.
display_changes("10.1101/19000828")