In [1]:
import pandas as pd
from pathlib import Path
import re
from typing import Dict
import numpy as np

In [2]:
# Root folder of the raw grammar datasets that the cells below read from.
# NOTE(review): absolute, machine-specific path — it must be edited to run this notebook elsewhere.
DATA_DIR = Path("/projectnb/cs505ws/projects/grammar_ninja_alavaee/data/grammar/raw")

In [3]:
# Read the CoEdit training split stored under DATA_DIR/coedit/train.parquet.
# NOTE(review): the head() output saved below lists task/prompts/input_text/output_text,
# while later cells read `coedit.src` and rename `tgt` — confirm which schema this file has.
coedit = pd.read_parquet(DATA_DIR / "coedit" / "train.parquet")

In [4]:
# Preview the first rows to see which columns are available.
coedit.head()

Unnamed: 0,task,prompts,input_text,output_text
0,gec,Remove all grammatical errors from this text,"For example, countries with a lot of deserts ...","For example, countries with a lot of deserts c..."
1,gec,Improve the grammaticality,"As the number of people grows, the need of ha...","As the number of people grows, the need for a ..."
2,gec,Improve the grammaticality of this sentence,Besides some technologically determinists tha...,Besides some technological determinists that a...
3,gec,Remove all grammatical errors from this text,Safety is one of the crucial problems that ma...,Safety is one of the crucial problems that man...
4,gec,Fix grammaticality in this sentence,On one hand more and more virus and hack can ...,"On the one hand, more and more viruses and hac..."


In [9]:
# Distinct values of the `task` column.
coedit.task.unique()

array(['gec', 'neutralize', 'simplification', 'paraphrase', 'coherence',
       'clarity'], dtype=object)

In [10]:
# Number of rows per task, most frequent first.
coedit.task.value_counts()

task
gec               19823
paraphrase        15370
simplification    11440
coherence         10616
neutralize        10570
clarity            1252
Name: count, dtype: int64

In [11]:
# Character length of the longest `src` string.
coedit["src"].map(len).max()

884

In [12]:
# Average character length of the `src` strings.
coedit["src"].map(len).mean()

127.42111740093527

In [13]:
# Pull the instruction prefix out of each `src` string: the lazy group captures
# everything from the start of the string up to (not including) the first colon.
prompts = coedit.src.str.extract(r"^(.*?):")

In [14]:
# Distinct instruction prefixes; `extract` returned a one-column frame, hence iloc[:, 0].
prompts.iloc[:, 0].unique()

array(['Remove all grammatical errors from this text',
       'Improve the grammaticality',
       'Improve the grammaticality of this sentence',
       'Fix grammaticality in this sentence',
       'Improve the grammar of this text', 'Fix grammar in this sentence',
       'Fix grammar in the sentence', 'Update to remove grammar errors',
       'Fix grammar errors in this sentence',
       'Remove grammatical mistakes', 'Fix disfluencies in the sentence',
       'Fix grammaticality', 'Fix grammaticality of the sentence',
       'Fix grammar', 'Make the sentence grammatical',
       'Improve the grammaticality of this text', 'Fix grammar errors',
       'Fix grammatical errors in this sentence',
       'Fix all grammatical errors', 'Fix the grammatical mistakes',
       'Fix errors in this text', 'Make the sentence fluent',
       'Fix grammatical errors', 'Grammar improvements',
       'Fix grammatical mistakes in this sentence',
       'Remove grammar mistakes', 'Fix the grammar mista

In [15]:
# Store the instruction prefix (text before the first colon of `src`) as its own column.
# This repeats the extraction used for `prompts` in the earlier cell.
coedit["prompts"] = coedit.src.str.extract(r"^(.*?):")

In [16]:
# Keep only the text to be edited: the rest of the line after the first colon of `src`.
# `\s*` swallows the separator whitespace that follows the colon, so `input_text` no
# longer starts with a stray space; expand=False returns a Series for the new column.
coedit["input_text"] = coedit.src.str.extract(r":\s*(.+)", expand=False)

In [17]:
# Display the frame with the `tgt` column relabelled as `output_text`.
# NOTE(review): the renamed frame is only displayed, never assigned, so `coedit`
# itself still carries `tgt` after this cell — confirm that is intended.
coedit.rename({"tgt": "output_text"}, axis=1)

Unnamed: 0,_id,task,src,output_text,prompts,input_text
0,1,gec,Remove all grammatical errors from this text: ...,"For example, countries with a lot of deserts c...",Remove all grammatical errors from this text,"For example, countries with a lot of deserts ..."
1,2,gec,Improve the grammaticality: As the number of p...,"As the number of people grows, the need for a ...",Improve the grammaticality,"As the number of people grows, the need of ha..."
2,3,gec,Improve the grammaticality of this sentence: B...,Besides some technological determinists that a...,Improve the grammaticality of this sentence,Besides some technologically determinists tha...
3,4,gec,Remove all grammatical errors from this text: ...,Safety is one of the crucial problems that man...,Remove all grammatical errors from this text,Safety is one of the crucial problems that ma...
4,5,gec,Fix grammaticality in this sentence: On one ha...,"On the one hand, more and more viruses and hac...",Fix grammaticality in this sentence,On one hand more and more virus and hack can ...
...,...,...,...,...,...,...
69066,69067,clarity,Rewrite this sentence for clarity: The Habsbur...,"During the Habsburg's period, Spain ushered in...",Rewrite this sentence for clarity,The Habsburgyears also ushered in the Spanish...
69067,69068,clarity,Rewrite the sentence more clearly: The Habsbur...,The Habsburgyears also ushered in the Spanish ...,Rewrite the sentence more clearly,The Habsburgyears also ushered in the Spanish...
69068,69069,clarity,"Make this sentence more readable: In 2019, he ...","In 2019, he was traded to the Astros in a bloc...",Make this sentence more readable,"In 2019, he was traded to the Astros in a blo..."
69069,69070,clarity,"Use clearer wording: In 2019, he was traded to...","In 2019, he was traded to the Astros in a bloc...",Use clearer wording,"In 2019, he was traded to the Astros in a blo..."


In [18]:
# Read the validation split stored under DATA_DIR/wi_locness/validation.parquet.
wi_locness = pd.read_parquet(DATA_DIR / "wi_locness" / "validation.parquet")

In [19]:
# Preview the first rows to see which columns are available.
wi_locness.head()

Unnamed: 0,id,cefr,text,edits
0,7-5819177,N,"Boxing is a common, well known and well loved ...","{'start': [24, 39, 52, 87, 242, 371, 400, 528,..."
1,7-5819215,N,In many of the minority works of modern litera...,"{'start': [525, 954, 1819, 2154, 2200], 'end':..."
2,7-5819167,N,How does Voltaire tackle the question of philo...,"{'start': [154, 155, 254, 414, 490, 576, 672, ..."
3,7-5819168,N,It would be tempting to think of the various E...,"{'start': [99, 193, 564, 1245, 1321, 1987, 204..."
4,7-5819169,N,"Naturally, the problems of a single Europe wou...","{'start': [145, 338, 552, 563, 571, 713, 841, ..."


In [20]:
# Inspect the edits of the first row: a dict of parallel `start` / `end` / `text` arrays.
wi_locness.loc[0, "edits"]

{'start': array([  24,   39,   52,   87,  242,  371,  400,  528,  589,  713,  869,
         992, 1058, 1169, 1209, 1219, 1255, 1308, 1386, 1412, 1513, 1569,
        1661, 1731, 1744, 1781, 1792, 1901, 1951, 2038, 2131, 2149, 2247,
        2286], dtype=int32),
 'end': array([  25,   40,   59,   95,  249,  374,  400,  538,  595,  713,  869,
        1001, 1063, 1169, 1209, 1219, 1255, 1315, 1390, 1418, 1517, 1570,
        1661, 1737, 1751, 1781, 1799, 1901, 1960, 2044, 2131, 2149, 2248,
        2289], dtype=int32),
 'text': array(['-', '-', 'in', '. However,', '. There', 'their', ',', 'among',
        "there's", ' and', ',', 'underground', '. The', ',', ',', ',', ',',
        '. There', 'for', 'Changing', 'from', ';', ',', 'later', '. These',
        "'", 'talent', ',', '. Diseases', '. Even', ',', "'s", ';', 'have'],
       dtype=object)}

In [21]:
# Raw (uncorrected) text of the first row.
wi_locness.loc[0, "text"]

'Boxing is a common, well known and well loved sport amongst most countries in the world however it is also punishing, dangerous and disliked to the extent that many people want it banned, possibly with good reason.\nBoxing is a dangerous sport, there are relatively common deaths, tragic injuries and even disease. All professional boxers are at risk from being killed in his next fight. If not killed then more likely paralysed. There have been a number of cases in the last ten years of the top few boxers having tragic losses throughout their ranks. This is just from the elite few, and theres more from those below them.\nMore deaths would occur through boxing if it were banned. The sport would go underground, there would be no safety measures like gloves, a doctor, paramedics or early stopping of the fight if someone looked unable to continue. With this going on the people taking part will be dangerous, and on the streets. Dangerous dogs who were trained to kill and maim in similar under

In [22]:
# Unpack the first row's edit arrays: span starts, span ends, and replacement strings.
start, end, replacement_text = (
    wi_locness.loc[0, "edits"][field] for field in ("start", "end", "text")
)

In [23]:
# Replacement strings of the first row's edits.
replacement_text

array(['-', '-', 'in', '. However,', '. There', 'their', ',', 'among',
       "there's", ' and', ',', 'underground', '. The', ',', ',', ',', ',',
       '. There', 'for', 'Changing', 'from', ';', ',', 'later', '. These',
       "'", 'talent', ',', '. Diseases', '. Even', ',', "'s", ';', 'have'],
      dtype=object)

In [24]:
# Slice of the first row's text covered by edit index 2, i.e. the span that edit replaces.
wi_locness.loc[0, "text"][start[2] : end[2]]

'amongst'

In [25]:
# Replacement string supplied by edit index 2 for the span shown in the previous cell.
replacement_text[2]

'in'

In [26]:
def apply_corrections(text: str, corrections: Dict[str, np.ndarray]) -> str:
    """Apply character-span edits to ``text`` and return the corrected string.

    Args:
        text: Original text; every edit offset indexes into this string.
        corrections: Mapping with parallel sequences under the keys
            ``"start"``, ``"end"`` and ``"text"``. Edit ``i`` replaces
            ``text[start[i]:end[i]]`` with ``text[i]`` of the mapping;
            ``start == end`` is a pure insertion, and an empty or ``None``
            replacement removes the span.

    Returns:
        The text with all edits applied and every run of consecutive newline
        characters collapsed to a single newline.
    """
    starts = np.asarray(corrections["start"], dtype=int)
    ends = np.asarray(corrections["end"], dtype=int)
    replacements = corrections["text"]

    # Offsets refer to the ORIGINAL text, so edits have to be consumed left to
    # right. The stable sort keeps the given order for edits that share a start
    # (e.g. two insertions at the same position) and is a no-op for sorted input.
    order = np.argsort(starts, kind="stable")

    pieces = []
    cursor = 0  # first character of the original text not yet emitted
    for idx in order:
        start, end = int(starts[idx]), int(ends[idx])
        # Untouched text between the previous edit and this one.
        pieces.append(text[cursor:start])
        # NOTE(review): a None replacement is treated as a deletion, exactly like
        # an empty string — confirm this matches the dataset's meaning of None.
        pieces.append(replacements[idx] or "")
        # max() keeps the cursor from moving backwards if spans ever overlap
        # (not expected; handled on a best-effort basis).
        cursor = max(cursor, end)
    pieces.append(text[cursor:])

    return re.sub("\n+", "\n", "".join(pieces))

In [27]:
# Build the corrected version of every row by applying its edits to its text.
wi_locness["corrected_text"] = [
    apply_corrections(raw_text, row_edits)
    for raw_text, row_edits in zip(wi_locness["text"], wi_locness["edits"])
]

In [28]:
# Preview the corrected texts.
wi_locness.corrected_text.head()

0    Boxing is a common, well-known and well-loved ...
1    In many of the minority works of modern litera...
2    How does Voltaire tackle the question of philo...
3    It would be tempting to think of the various E...
4    Naturally, the problems of a single Europe wou...
Name: corrected_text, dtype: object

In [29]:
# Character count of the first paragraph (text before the first newline) of the first corrected row.
print(len(wi_locness.corrected_text[0].split("\n")[0]))

211


In [30]:
# The first row's edits again (same dict as inspected earlier), via attribute access.
wi_locness.edits[0]

{'start': array([  24,   39,   52,   87,  242,  371,  400,  528,  589,  713,  869,
         992, 1058, 1169, 1209, 1219, 1255, 1308, 1386, 1412, 1513, 1569,
        1661, 1731, 1744, 1781, 1792, 1901, 1951, 2038, 2131, 2149, 2247,
        2286], dtype=int32),
 'end': array([  25,   40,   59,   95,  249,  374,  400,  538,  595,  713,  869,
        1001, 1063, 1169, 1209, 1219, 1255, 1315, 1390, 1418, 1517, 1570,
        1661, 1737, 1751, 1781, 1799, 1901, 1960, 2044, 2131, 2149, 2248,
        2289], dtype=int32),
 'text': array(['-', '-', 'in', '. However,', '. There', 'their', ',', 'among',
        "there's", ' and', ',', 'underground', '. The', ',', ',', ',', ',',
        '. There', 'for', 'Changing', 'from', ';', ',', 'later', '. These',
        "'", 'talent', ',', '. Diseases', '. Even', ',', "'s", ';', 'have'],
       dtype=object)}

In [31]:
# Character length of the longest raw text.
wi_locness["text"].map(len).max()

3335

In [32]:
# Average character length of the raw texts.
wi_locness["text"].map(len).mean()

2408.82

In [33]:
# Average character length of the corrected texts, for comparison with the raw mean.
wi_locness["corrected_text"].map(len).mean()

2411.62

In [34]:
# Split the first row on newlines, before and after correction, so that each
# paragraph becomes its own entry, matching the format of the CoEdit dataset.

print(wi_locness.text[0].split("\n"))

print(wi_locness.corrected_text[0].split("\n"))

['Boxing is a common, well known and well loved sport amongst most countries in the world however it is also punishing, dangerous and disliked to the extent that many people want it banned, possibly with good reason.', 'Boxing is a dangerous sport, there are relatively common deaths, tragic injuries and even disease. All professional boxers are at risk from being killed in his next fight. If not killed then more likely paralysed. There have been a number of cases in the last ten years of the top few boxers having tragic losses throughout their ranks. This is just from the elite few, and theres more from those below them.', 'More deaths would occur through boxing if it were banned. The sport would go underground, there would be no safety measures like gloves, a doctor, paramedics or early stopping of the fight if someone looked unable to continue. With this going on the people taking part will be dangerous, and on the streets. Dangerous dogs who were trained to kill and maim in similar 

In [35]:
# Paragraphs of the first corrected text, one list entry per newline-separated chunk.
wi_locness.corrected_text[0].split("\n")

['Boxing is a common, well-known and well-loved sport in most countries in the world. However, it is also punishing, dangerous and disliked to the extent that many people want it banned, possibly with good reason.',
 "Boxing is a dangerous sport. There are relatively common deaths, tragic injuries and even disease. All professional boxers are at risk from being killed in their next fight. If not killed, then more likely paralysed. There have been a number of cases in the last ten years of the top few boxers having tragic losses among their ranks. This is just from the elite few, and there's more from those below them.",
 'More deaths would occur through boxing if it were banned. The sport would go underground, and there would be no safety measures like gloves, a doctor, paramedics or early stopping of the fight if someone looked unable to continue. With this going on, the people taking part will be dangerous, and on the streets. Dangerous dogs who were trained to kill and maim in simil

In [36]:
# Pool of instruction prefixes that a later cell samples from at random.
# NOTE(review): every entry ends with ":" whereas the `prompts` extracted from
# coedit exclude the colon (the regex captures only the text before it) —
# confirm the colon is handled consistently when the two sources are combined.
alternative_prompts = [
    "Correct the grammatical errors in the following sentence:",
    "Revise this text for proper grammar:",
    "Improve the grammatical structure of this sentence:",
    "Transform the words into grammatically correct English:",
    "Edit this sentence to eliminate grammatical mistakes:"
]

In [40]:
# Flatten every corrected text into one Series holding one paragraph per entry.
# A plain comprehension keeps the strings as Python objects instead of routing them
# through a fixed-width NumPy unicode array (padded to the longest paragraph), and it
# iterates the column directly rather than relying on the index labels being 0..n-1.
corrected_text = pd.Series(
    [
        paragraph
        for document in wi_locness.corrected_text
        for paragraph in document.split("\n")
    ]
)

In [44]:
# Preview the paragraphs with every run of whitespace collapsed to a single space.
# NOTE(review): the result is only displayed; `corrected_text` itself is left
# unchanged by this cell — confirm that is intended.
corrected_text.map(lambda s: re.sub(r"\s+", " ", s))

0      Boxing is a common, well-known and well-loved ...
1      Boxing is a dangerous sport. There are relativ...
2      More deaths would occur through boxing if it w...
3      Once boxing is banned and no-one grows up know...
4      Changing the rules of boxing slightly would mu...
                             ...                        
269    The V-chip is an electrical device that blocks...
270    Some of the anti-violence proposals are as fol...
271    Some television shows show violence because it...
272    An exhaustive examination of the broadcast net...
273    The reason for television violence is to add t...
Length: 274, dtype: object

In [38]:
# Draw one instruction prefix (with replacement) per row of `wi_locness`.
# NOTE(review): the size is the number of `wi_locness` rows, while the paragraph-level
# `corrected_text` Series above has 274 entries — confirm which length is intended.
# NOTE(review): no random seed is set, so the draw differs between runs, and this
# cell's execution count (38) is lower than that of the cells shown before it.
random_prompts = np.random.choice(alternative_prompts, size=wi_locness.shape[0], replace=True)