In [32]:
from datasets import load_dataset, concatenate_datasets

In [10]:
def normalise(data):
    """Normalise the text fields of a single example.

    Both ``data["source"]`` and ``data["target"]`` are lowercased, and every
    newline character in ``data["source"]`` is then replaced by a space.

    Args:
        data: Mapping with string values under the keys ``"source"`` and
            ``"target"``. It is modified in place.

    Returns:
        The same ``data`` object that was passed in, after modification.
    """
    # Lowercase each text field in turn (source first, then target).
    for field in ("source", "target"):
        data[field] = data[field].lower()

    # Only the source text has its newlines flattened; the target is left alone.
    data["source"] = data["source"].replace("\n", " ")

    return data


In [11]:
def _extract_text(example):
    """Replace the nested document/question records with their plain text."""
    return {
        "document": example["document"]["summary"]["text"],
        "question": example["question"]["text"],
    }


# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally -- confirm it is still required.
_narrative_raw = load_dataset("narrativeqa", trust_remote_code=True)

# Keep only the two columns of interest and flatten each one to a string.
_narrative_text = _narrative_raw.select_columns(["document", "question"]).map(
    _extract_text
)

# Rename to the source/target convention expected by `normalise`, then clean.
narrative_data = _narrative_text.rename_columns(
    {"document": "source", "question": "target"}
).map(normalise)

Map:   0%|          | 0/32747 [00:00<?, ? examples/s]

Map:   0%|          | 0/10557 [00:00<?, ? examples/s]

Map:   0%|          | 0/3461 [00:00<?, ? examples/s]

## Observed character encoding issues
- â
- â â
- â
- â
- â
- â
- â˛
- ă
- âł
- â
- ĺ

In total, 11 distinct encoding-issue patterns were observed in the NarrativeQA summaries.

In [12]:
issue_1 = narrative_data.filter(lambda x: "â" in x["source"])
issue_1

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 3371
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 1085
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 276
    })
})

In [13]:
issue_2 = narrative_data.filter(lambda x: "â â" in x["source"])
issue_2

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 89
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 58
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [29]:
issue_3 = narrative_data.filter(lambda x: "â" in x["source"])
issue_3

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1932
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 627
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 235
    })
})

In [17]:
issue_4 = narrative_data.filter(lambda x: "â" in x["source"])
issue_4

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1088
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 233
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 59
    })
})

In [19]:
issue_5 = narrative_data.filter(lambda x: "â" in x["source"])
issue_5

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 148
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 119
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [20]:
issue_6 = narrative_data.filter(lambda x: "â" in x["source"])
issue_6

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 145
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 90
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [21]:
issue_7 = narrative_data.filter(lambda x: "â˛" in x["source"])
issue_7

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 30
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [22]:
issue_8 = narrative_data.filter(lambda x: "ă" in x["source"])
issue_8

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 60
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [23]:
issue_9 = narrative_data.filter(lambda x: "âł" in x["source"])
issue_9

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 491
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 89
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 60
    })
})

In [41]:
issue_10 = narrative_data.filter(lambda x: "â" in x["source"])
issue_10

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 30
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [43]:
issue_11 = narrative_data.filter(lambda x: "ĺ" in x["source"])
issue_11

Filter:   0%|          | 0/32747 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10557 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3461 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 60
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 29
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 0
    })
})

In [49]:
# Every per-pattern filter result, in issue order.
issues = [
    issue_1,
    issue_2,
    issue_3,
    issue_4,
    issue_5,
    issue_6,
    issue_7,
    issue_8,
    issue_9,
    issue_10,
    issue_11,
]

for i, issue in enumerate(issues):
    # Pool the three splits so examples are drawn from all matching rows.
    concat_dataset = concatenate_datasets(
        [issue[split] for split in ("train", "test", "validation")]
    )
    # Shuffle with a fixed seed and keep the first three rows.
    examples = concat_dataset.shuffle(seed=1).select(range(3))

    print(f"Issue {i+1}:\n")
    for example in examples:
        # Keep the sentences that contain any of the three marker characters
        # and print only the first such sentence.
        flagged = [
            sentence + '.'
            for sentence in example["source"].split('.')
            if any(marker in sentence for marker in ('â', 'ă', 'ĺ'))
        ]
        print(flagged[0])
    print('\n')


Issue 1:

 he assembles six men from his companyât/sgt.
 mason contacts o'malley, who supplies him with weapons and tells him that his son is still aliveâo'malley adopted mason's son and sent him to a private school so that he would be out of danger.
 working for different companies, the "tin men" are prepared to do almost anythingâlegal or illegalâto close a sale.


Issue 2:

 ash eventually finds the real one and attempts to say the magic phrase that will allow him to remove the book safelyâ â "klaatu barada nikto".
 there is still left the consolation that a happy end would come for humanity as a wholeâ â though hundreds of years too late for avis and ernest as individuals; the cruel oligarchy would fall, and the two will be vindicated and respected by posterity as pioneers and martyrs.
 ash eventually finds the real one and attempts to say the magic phrase that will allow him to remove the book safelyâ â "klaatu barada nikto".


Issue 3:

 a group of college friends c

Based on the example output for each issue, the garbled characters should be replaced with the following (numbered by issue; `''` means the characters are simply removed):

1. ', '
2. ' -'
3. '-'
4. "'"
5. ''
6. ''
7. ''
8. 'é'
9. '$'
10. ''
11. 'ō'