 # Human-annotated portion of dataset (~5k samples)

In [92]:
import json, random
import utils_diff

# Load the human-annotated training split.
# See also: data/swipe_val.json, data/swipe_test_id.json, data/swipe_test_ood.json
# for the validation, in-domain test, and out-of-domain test sets.
#
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere, so non-ASCII
# characters in the page text could fail to decode or be decoded incorrectly.
with open("data/swipe_train.json", "r", encoding="utf-8") as f:
    swipe_train = json.load(f)

# Pick one entry at random to inspect in the following cells. No seed is set,
# so a different sample is drawn on each run.
sample = random.choice(swipe_train)

In [93]:
# Header line: which English Wikipedia / Simple Wikipedia page revisions this sample pairs.
pairing_fields = tuple(sample[key] for key in ("r_page", "r_revid", "s_page", "s_revid"))
print("Page Pairing: [En Wiki: %s; Revision ID: %s] [Simple Wiki: %s; Revision ID: %s]" % pairing_fields)

# The raw page texts are stored on the sample itself:
#   sample["r_content"] -> raw text of the En page
#   sample["s_content"] -> raw text of the Simple page

# Edit operations shipped with the sample. Can be recreated through:
# `utils_diff.get_edit_operations(sample["r_content"], sample["s_content"], split_replace=True, split_sentences=True)`
edits = sample["edits"]

# Explanatory banner, emitted one entry per line.
banner_lines = (
    "---- Here is the edit sequence to go from the original page to the simplified page ----",
    "Legend: Green text is added in the simple page, red text is deleted from the original page",
    "---",
)
print(*banner_lines, sep="\n")

# Colored rendering of the original page against the simplified page.
colored_diff = utils_diff.make_colored_text(sample["r_content"], sample["s_content"])
print(colored_diff)

Page Pairing: [En Wiki: Ben Nsibandze; Revision ID: 1003827254] [Simple Wiki: Ben Nsibandze; Revision ID: 7351885]
---- Here is the edit sequence to go from the original page to the simplified page ----
Legend: Green text is added in the simple page, red text is deleted from the original page
---
Benjamin Mshamndane Nsibandze[1;31m(June 17,[0m[1;32m(17 June[0m 1931 -[1;32m13[0m January[1;31m13,[0m 2021) was a Swazi[1;31mregional adminstrator and deputy prime minister[0m[1;32mpolitician[0m. He[1;31mserved as[0m[1;32mwas Deputy Prime Minister. He was[0m acting Prime Minister of Swaziland from 25 October 1979 to 23 November 1979.[1;31mHe[0m[1;32mNsibandze[0m died[1;31min[0m[1;32mon 13[0m January 2021[1;31m, aged[0m[1;32mat the age of[0m 89.


In [94]:
# Each annotation describes one edit group: `opis` lists the operation indices (opi)
# of the edits that make up the group, and `category` is the label assigned to the group.
# NOTE(review): `gi` looks like the group index, and the operation indices presumably
# refer to positions in sample["edits"] -- confirm against the dataset documentation.
sample["annotations"]

[{'gi': 0, 'opis': [1, 2], 'category': 'nonsim_format'},
 {'gi': 1, 'opis': [4, 6], 'category': 'nonsim_format'},
 {'gi': 2, 'opis': [8, 9], 'category': 'lexical_generic'},
 {'gi': 3, 'opis': [11, 9], 'category': 'discourse_reordering'},
 {'gi': 4, 'opis': [12, 13], 'category': 'discourse_reordering'},
 {'gi': 5, 'opis': [12], 'category': 'syntactic_sentence_splitting'},
 {'gi': 6, 'opis': [15, 16], 'category': 'discourse_anaphora_resolution'},
 {'gi': 7, 'opis': [18, 19], 'category': 'nonsim_fact_correction'},
 {'opis': [21, 22], 'category': 'lexical_generic'}]

In [95]:
# To visualize what each group corresponds to, use the utils_vis module.
# Judging by the captured output, it reports the number of groups and then one line
# per group: the category label followed by the affected text with its edits colored.
from utils_vis import visualize_edit_groups

visualize_edit_groups(sample["r_content"], sample["s_content"], sample["annotations"])

There are a total of 9 identified groups.
[nonsim_format                 ] Benjamin Mshamndane Nsibandze[1;32m(17 June[0m[1;31m(June 17,[0m 1931 - [...]
[nonsim_format                 ] [...] 1931 -[1;32m13[0m January[1;31m13,[0m 2021) was a Swazi [...]
[lexical_generic               ] [...] 2021) was a Swazi[1;32mpolitician[0m[1;31mregional adminstrator and deputy prime minister[0m. He [...]
[discourse_reordering          ] [...] [1;31mregional adminstrator and deputy prime minister[0m. He[1;32mwas Deputy Prime Minister.[0m [...]
[discourse_reordering          ] [...] [1;32m He was[0m[1;31mserved as[0m acting Prime Minister of Swaziland from 25 October 1979 to 23 November 1979. [...]
[syntactic_sentence_splitting  ] [...] [1;32m He was[0m [...]
[discourse_anaphora_resolution ] [...] acting Prime Minister of Swaziland from 25 October 1979 to 23 November 1979.[1;32mNsibandze[0m[1;31mHe[0m died [...]
[nonsim_fact_correction        ] [...] died[1;32mon 13[0m[1

# Entire dataset (~140k samples)

In [63]:
from collections import Counter
import json, random

# Load the full dataset.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere, so non-ASCII
# characters in the page text could fail to decode or be decoded incorrectly.
with open("data/swipe_full.json", "r", encoding="utf-8") as f:
    swipe_full = json.load(f)

print("Size of dataset: %d" % (len(swipe_full)))

# NOTE: this rebinds `sample`. Records in this file are read through the
# "input" / "output" keys, unlike the annotated sample used earlier in the
# notebook ("r_content" / "s_content"), so re-running the earlier cells after
# this one will not work without reloading that sample first.
# No seed is set, so a different sample is drawn on each run.
sample = random.choice(swipe_full)
print("Example sample. Input (English Wikipedia) -> Output (Simple Wikipedia)")
print(sample)

Size of dataset: 143359
Example sample. Input (English Wikipedia) -> Output (Simple Wikipedia)
{'input': "Burnham-on-Sea is a seaside town in Somerset, England, at the mouth of the River Parrett, upon Bridgwater Bay. Burnham was a small fishing village until the late 18th century when it began to grow because of its popularity as a seaside resort.\nBurnham-on-Sea forms part of the parish of Burnham-on-Sea and Highbridge and shares a town council with its neighbouring small market town of Highbridge. According to the 2011 census the population of the parish (including Highbridge) was 19,576, of which the most populous wards 'Burnham Central' and 'Burnham North'; totalled 13,601.\nBurnham-on-Sea is most famous for its low lighthouse. The now-decommissioned lighthouse was built in 1832 and is a Grade-II listed building. The lighthouse is famous for its red and white striped facade.\nThe position of the town on the edge of the Somerset Levels and moors where they meet the Bristol Channel, 

In [64]:
import utils_diff

# Render the edits that turn the input into the output as colored text.
# NOTE(review): presumably the same color legend as for the annotated sample applies
# (green = added in the output, red = deleted from the input) -- confirm in utils_diff.
print(utils_diff.make_colored_text(sample["input"], sample["output"]))

Burnham-on-Sea is a[1;31mseaside[0m[1;32msmall[0m town in[1;32mthe county of[0m Somerset[1;31m,[0m[1;32min[0m England[1;31m, at[0m[1;32mon[0m the mouth of the River Parrett[1;31m, upon[0m[1;32mat[0m Bridgwater Bay. Burnham was a small fishing village until the[1;31mlate 18th[0m[1;32mlate-18th[0m century when it[1;31mbegan to grow because of its popularity[0m[1;32mbecame popular[0m as a seaside resort.[1;31mBurnham-on-Sea forms part of[0m[1;32mThis made[0m the[1;31mparish of Burnham-on-Sea and Highbridge and shares[0m[1;32mvillage grow larger into[0m a town[1;31mcouncil with its neighbouring small market town of Highbridge[0m.[1;31mAccording to the 2011 census the population of the parish(including Highbridge) was 19,576, of which the most populous wards'Burnham Central' and'Burnham North'; totalled 13,601. Burnham-on-Sea is most famous for its low lighthouse. The now-decommissioned lighthouse was built in 1832 and is a Grade-II listed building. The 