In [2]:
import pandas as pd

# Load the spreadsheet holding every (preprint abstract, final abstract) pair.
df = pd.read_csv("all_pairs - old.tsv", sep='\t')


def _collapse_whitespace(text):
    """Return `text` with each run of whitespace (spaces, tabs, line breaks) reduced to one space."""
    return " ".join(text.split())


# Maps a 1-based page number to [preprint abstract length, final abstract length],
# both measured in characters after whitespace normalisation.
change_in_abst_length = {}

# Page counter; it only advances for rows that are kept, so page numbers are
# consecutive over the non-excluded rows.
page = 1

for _, pair in df.iterrows():
    # Rows flagged "exclude" are not part of the MS Word track-changes document
    # (some final abstracts are not available), so they get no page number.
    if pair["exclude"] == "exclude":
        continue

    preprint_text = _collapse_whitespace(pair["abstract"])
    final_text = _collapse_whitespace(pair["published_pubmed_abstract"])

    # Store both lengths under the current page number, then move to the next page.
    change_in_abst_length[page] = [len(preprint_text), len(final_text)]
    page += 1

In [3]:
# Read the .txt export of the MS Word track changes, one list entry per line.
# The context manager closes the file handle as soon as the text has been read.
with open("compared_abstracts_2020-07-14_v1_markup.txt", 'r') as markup_file:
    trackchange_doc = markup_file.read().strip().split("\n")

# Line indices (into trackchange_doc) at which a change entry starts.
beginning_of_change = []
# Maps each of those line indices to the page number (kept as a string) it refers to.
changes_abs_match = {}

# First we find the starting line of each change.
# The last line is not visited: a change header needs a following line, so a
# trailing "Page" can never start a change (and must not raise an IndexError).
for e in range(len(trackchange_doc) - 1):
    element = trackchange_doc[e].strip()
    # a change header is a line that is exactly "Page" ...
    if element != "Page":
        continue
    # ... followed by a line made of a number and a trailing ":"
    number = trackchange_doc[e+1].strip()
    # endswith() is safe on a blank line, where number[-1] would raise IndexError
    if number.endswith(":") and number[:-1].isnumeric():
        # record where the change starts and which page it belongs to,
        # dropping the ":" at the end of the line
        beginning_of_change.append(e)
        changes_abs_match[e] = number[:-1]

In [4]:
# Number of lines to skip from the start of a change entry before its text begins.
# Per the original author's note these lines hold metadata and whitespace (hour,
# username, etc.) that always have the same length -- NOTE(review): confirm
# against the markup file if the export format changes.
CHANGE_HEADER_LINES = 8

# Two per-page accumulators, keyed by the page number as a string, all starting at 0:
# the overall count of changed characters, and the count split by change type.
abstract_changecount = {page_id: 0 for page_id in changes_abs_match.values()}
abstract_changetypes = {
    page_id: {'Inserted': 0, 'Deleted': 0, 'Moved': 0}
    for page_id in changes_abs_match.values()
}

# for each change, we know to which abstract (i.e., page) it belongs
for c, change in enumerate(beginning_of_change):
    abstract_id = changes_abs_match[change]

    # A change ends where the next one begins, or at the end of the document for
    # the last change. The bound must be a line index into trackchange_doc:
    # using len(beginning_of_change) here dropped the text of the last change.
    if c+1 < len(beginning_of_change):
        nextchange = beginning_of_change[c+1]
    else:
        nextchange = len(trackchange_doc)

    # Join the non-empty text lines of the change into a single string.
    words = [line for line in trackchange_doc[change+CHANGE_HEADER_LINES:nextchange] if len(line) > 0]
    words = " ".join(words).strip()

    # The type of change is read two lines below the "Page" line; it must be one of
    # the keys initialised above ('Inserted', 'Deleted', 'Moved'), otherwise the
    # lookup raises a KeyError.
    typechange = trackchange_doc[change+2].strip()
    abstract_changetypes[abstract_id][typechange] += len(words)

    # we currently ignore moved text in the overall count
    if typechange != "Moved":
        abstract_changecount[abstract_id] += len(words)

# here an example
print (abstract_changecount["180"],abstract_changetypes["180"])

22 {'Inserted': 13, 'Deleted': 9, 'Moved': 0}


In [5]:
# Print one change ratio per abstract under study, in page order.
for page_number in range(1, len(change_in_abst_length)+1):
    # the change dictionaries are looked up with the page number as a string
    page_key = str(page_number)

    # No tracked change recorded for this page: the two abstracts are treated as
    # identical, so the change ratio is 0.
    if page_key not in abstract_changecount:
        print (0)
        continue

    total_changed = abstract_changecount[page_key]
    per_type = abstract_changetypes[page_key]
    # only `inserted` enters the computation below
    inserted, deleted, moved = per_type["Inserted"], per_type["Deleted"], per_type["Moved"]

    # lengths of the preprint and of the final version
    original, final = change_in_abst_length[page_number]
    # characters of the final version that were not inserted
    identical = final - inserted

    # Changes were detected but add up to zero counted characters: report 0.
    if total_changed == 0:
        print (0)
        continue

    # change ratio computed as 1 - ratio, with ratio as in difflib
    # (see: https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.ratio)
    changeratio = 1.0 - ((2*identical)/(original+final))
    print (changeratio)

0.04772727272727273
0.030637870416875934
0.10664479081214107
0.512753433616743
0
0.020266357845975635
0.04656862745098034
0.24841695080370185
0.12829629629629624
0.28935905413814567
0.22917251051893406
0.26876640419947506
0.2271944922547332
0.3305613305613305
0.10707803992740472
0.2980608091932009
0.007719298245614015
0.034782608695652195
0.5363321799307958
0.11652493679613884
0.07381889763779526
0.022031823745410017
0.06398104265402849
0.7041800643086817
0.02910052910052907
0.2672750977835724
0.00540175557056044
0.09421551557361196
0.07989821882951653
0.1148479427549195
0.015492253873063522
0.09292763157894735
0.05594405594405594
0.1082728592162554
0.04800000000000004
0.06171574903969268
0.009358914365933502
0.08616780045351469
0.611417450535193
0.28120145002589336
0.18175372252422595
0.03956834532374098
0.008409014463504838
0.01520467836257311
0.1953735542356987
0.0727824109173616
0.026622793897696728
0.17517328292375556
0.17018859350147697
0.028449880350970513
0.06621226874391428
0
