In [52]:
from datasets import load_dataset, concatenate_datasets

In [53]:
def normalise(data):
    """Lowercase both text fields and flatten newlines in the source text.

    Args:
        data: Mapping holding string values under "source" and "target".
            It is modified in place.

    Returns:
        The same ``data`` object. Both fields are lowercased, and every
        newline in "source" is replaced by a single space; newlines in
        "target" are left as they are.
    """
    # Lowercase each field in turn (source first, then target).
    for field in ("source", "target"):
        data[field] = data[field].lower()

    # Splitting on the newline and re-joining with spaces swaps every
    # newline for exactly one space.
    data["source"] = " ".join(data["source"].split("\n"))

    return data


In [54]:
# Load NarrativeQA and keep only the two columns used below.
narrative_data = load_dataset("narrativeqa", trust_remote_code=True)
narrative_data = narrative_data.select_columns(["document", "question"])

# Pull the plain strings out of the nested records: the summary text of the
# document and the text of the question.
narrative_data = narrative_data.map(
    lambda row: dict(
        document=row["document"]["summary"]["text"],
        question=row["question"]["text"],
    )
)

# Rename to the column names normalise() reads and writes, then clean the text.
narrative_data = narrative_data.rename_columns(
    {"document": "source", "question": "target"}
)
narrative_data = narrative_data.map(normalise)

Downloading builder script:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/187M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32747 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10557 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3461 [00:00<?, ? examples/s]

Map:   0%|          | 0/32747 [00:00<?, ? examples/s]

Map:   0%|          | 0/10557 [00:00<?, ? examples/s]

Map:   0%|          | 0/3461 [00:00<?, ? examples/s]

Map:   0%|          | 0/32747 [00:00<?, ? examples/s]

Map:   0%|          | 0/10557 [00:00<?, ? examples/s]

Map:   0%|          | 0/3461 [00:00<?, ? examples/s]

## Observed character encoding issues
- â
- â â
- â
- â
- â
- â
- â˛
- ă
- âł
- â
- ĺ
- âź
- (num)â 

In total, there are 13 distinct kinds of encoding issue in the NarrativeQA summaries; each kind can affect many rows, as the filter counts below show.

In [55]:
# Issue 1: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters after the
# visible one - do not retype it by hand.
issue_1 = narrative_data.filter(lambda x: "â" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_1

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 3371
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 1085
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 276
    })
})

In [56]:
# Issue 2: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters or a
# non-breaking space - do not retype it by hand.
issue_2 = narrative_data.filter(lambda x: "â â" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_2

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 89
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 58
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [57]:
# Issue 3: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters after the
# visible one - do not retype it by hand.
issue_3 = narrative_data.filter(lambda x: "â" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_3

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1932
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 627
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 235
    })
})

In [58]:
# Issue 4: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters after the
# visible one - do not retype it by hand.
issue_4 = narrative_data.filter(lambda x: "â" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_4

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1088
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 233
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 59
    })
})

In [59]:
# Issue 5: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters after the
# visible one - do not retype it by hand.
issue_5 = narrative_data.filter(lambda x: "â" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_5

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 148
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 119
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [60]:
# Issue 6: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters after the
# visible one - do not retype it by hand.
issue_6 = narrative_data.filter(lambda x: "â" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_6

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 145
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 90
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [61]:
# Issue 7: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters between the
# visible ones - do not retype it by hand.
issue_7 = narrative_data.filter(lambda x: "â˛" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_7

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 30
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [62]:
# Issue 8: keep only the rows whose "source" text contains this garbled character.
issue_8 = narrative_data.filter(lambda x: "ă" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_8

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 60
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [63]:
# Issue 9: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters between the
# visible ones - do not retype it by hand.
issue_9 = narrative_data.filter(lambda x: "âł" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_9

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 491
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 89
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 60
    })
})

In [64]:
# Issue 10: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters after the
# visible one - do not retype it by hand.
issue_10 = narrative_data.filter(lambda x: "â" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_10

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 30
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [65]:
# Issue 11: keep only the rows whose "source" text contains this garbled character.
issue_11 = narrative_data.filter(lambda x: "ĺ" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_11

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 60
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 29
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [66]:
# Issue 12: keep only the rows whose "source" text contains this garbled sequence.
# NOTE(review): the literal may include non-printing characters between the
# visible ones - do not retype it by hand.
issue_12 = narrative_data.filter(lambda x: "âź" in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_12

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 30
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 29
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [67]:
# Issue 13: keep only the rows whose "source" text contains this garbled
# sequence (listed among the observed issues as following a number).
# NOTE(review): the character after the accented 'a' looks like a space but
# may be a non-breaking space - confirm, and do not retype it by hand.
issue_13 = narrative_data.filter(lambda x: "â " in x["source"])
# Bare expression so the notebook displays the per-split row counts.
issue_13

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 499
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 245
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 60
    })
})

In [69]:
# For every encoding issue, print one garbled sentence from each of (up to)
# three randomly sampled summaries, so the intended character can be inferred
# from its context.
for i, issue in enumerate(
    [
        issue_1,
        issue_2,
        issue_3,
        issue_4,
        issue_5,
        issue_6,
        issue_7,
        issue_8,
        issue_9,
        issue_10,
        issue_11,
        issue_12,
        issue_13,
    ]
):
    # Pool the three splits: an issue can be absent from some of them.
    concat_dataset = concatenate_datasets(
        [issue["train"], issue["test"], issue["validation"]]
    )
    # The fixed seed keeps the sample reproducible. Clamping the sample size
    # avoids an out-of-range selection when fewer than three rows matched.
    sample_size = min(3, len(concat_dataset))
    examples = concat_dataset.shuffle(seed=1).select(range(sample_size))

    print(f"Issue {i+1}:\n")
    for example in examples:
        # Take the first sentence holding one of the lead characters that all
        # observed garbled sequences start with. next() with a default stops
        # at the first hit and cannot raise the IndexError that indexing an
        # empty list with [0] would.
        garbled_sentence = next(
            (
                sentence + '.'
                for sentence in example["source"].split('.')
                if 'â' in sentence or 'ă' in sentence or 'ĺ' in sentence
            ),
            None,
        )
        if garbled_sentence is not None:
            print(garbled_sentence)
    print('\n')


Issue 1:

 he assembles six men from his companyât/sgt.
 mason contacts o'malley, who supplies him with weapons and tells him that his son is still aliveâo'malley adopted mason's son and sent him to a private school so that he would be out of danger.
 working for different companies, the "tin men" are prepared to do almost anythingâlegal or illegalâto close a sale.


Issue 2:

 ash eventually finds the real one and attempts to say the magic phrase that will allow him to remove the book safelyâ â "klaatu barada nikto".
 there is still left the consolation that a happy end would come for humanity as a wholeâ â though hundreds of years too late for avis and ernest as individuals; the cruel oligarchy would fall, and the two will be vindicated and respected by posterity as pioneers and martyrs.
 ash eventually finds the real one and attempts to say the magic phrase that will allow him to remove the book safelyâ â "klaatu barada nikto".


Issue 3:

 a group of college friends c

Based on the example sentences printed for each issue, each garbled sequence should be replaced as follows (the numbers match the issue numbers above):

1. ', '
2. ' -'
3. '-'
4. "'"
5. ''
6. ''
7. ''
8. 'é'
9. '$'
10. ''
11. 'ō'
12. '€'
13. '(num)'

In [8]:
import re

# Sample sentences containing the garbled sequences under investigation.
text = """
a 50-foot (asd15â m) containment wall surrounds the island and routes out of manhattan have been dismantled or mined, while armed helicopters patrol the rivers.
 buck is then sold to a pair of french-canadian dispatchers from the canadian government, frană§ois and perrault, who take him with them to the klondike region of canada.
 successful, gabriel offers stanley $10â million to program multi-headed worm, a "hydra", to siphon $9.
"""

# One or more digits (captured as group 1) immediately followed by an
# accented 'a'.
pattern = r'(\d+)â'

# Replace every match with just its captured digits, which drops the
# accented 'a' and leaves the rest of the text untouched.
digits_then_accent = re.compile(pattern)
new_text = digits_then_accent.sub(r'\1', text)

print(new_text)

error: invalid group reference 2 at position 1