<a href="https://colab.research.google.com/github/snipaid-nlg/datasets/blob/main/evaluation/mlsum-dataset-exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLSUM Dataset Exploration

In [None]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset
from tqdm import tqdm

Load the German (`de`) configuration of the MLSUM dataset from the Hugging Face Hub.

In [None]:
# Load MLSUM with the "de" configuration name via the `datasets` library.
dataset = load_dataset("mlsum", name="de")
# Bare expression so the notebook displays the loaded object as the cell output.
dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'topic', 'url', 'title', 'date'],
        num_rows: 220887
    })
    validation: Dataset({
        features: ['text', 'summary', 'topic', 'url', 'title', 'date'],
        num_rows: 11394
    })
    test: Dataset({
        features: ['text', 'summary', 'topic', 'url', 'title', 'date'],
        num_rows: 10701
    })
})

## Quality of Samples
Despite the dataset's popularity, issues with the quality of its samples went unnoticed until early 2022, when Philip May was the first to report problems.

In [None]:
def is_summary_in_text(summary, text):
    """Tell whether ``summary`` occurs verbatim inside ``text``.

    Both arguments must be non-empty; an empty one trips an assertion.

    Args:
        summary: The candidate substring.
        text: The string to search in.

    Returns:
        True if ``summary`` is contained in ``text``, otherwise False.
    """
    # Check the summary first, then the text, so an empty summary is reported first.
    for value in (summary, text):
        assert len(value) > 0
    contained = summary in text
    return contained

In [None]:
def count_summary_in_text(dataset, split_name):
    """Count the samples of a split whose summary appears verbatim in its text.

    Args:
        dataset: Mapping from split name to an iterable of samples; each sample
            is read with ``item["text"]`` and ``item["summary"]``.
        split_name: Key of the split to scan, e.g. "train".

    Returns:
        Number of samples for which ``is_summary_in_text`` is true.
    """
    matches = 0
    # tqdm wraps the split so a progress bar is shown while scanning.
    for sample in tqdm(dataset[split_name]):
        article, abstract = sample["text"], sample["summary"]
        if not is_summary_in_text(abstract, article):
            continue
        matches += 1
    return matches

In [None]:
count_summary_in_text(dataset, "train")

100%|██████████| 220887/220887 [00:37<00:00, 5866.94it/s]


126270

In 126,270 of the 220,887 training samples (more than half), the summary is contained verbatim in the text.

In [None]:
count_summary_in_text(dataset, "validation")

100%|██████████| 11394/11394 [00:01<00:00, 6652.15it/s]


3285

In [None]:
count_summary_in_text(dataset, "test")

100%|██████████| 10701/10701 [00:01<00:00, 6406.87it/s]


3306

## Distribution of topic categories

In [None]:
def count_topic_distribution(dataset, split_name):
    """Count how many samples of a split carry each topic value.

    Args:
        dataset: Mapping from split name to a split whose "topic" column can be
            read with ``split["topic"]`` and iterated.
        split_name: Key of the split to inspect, e.g. "train".

    Returns:
        dict mapping every distinct topic value to its number of occurrences.
    """
    # Read the column once and tally in a single pass, instead of re-reading
    # the column and calling list.count() for every distinct topic.
    topics = dataset[split_name]["topic"]
    counts = {}
    for topic in topics:
        counts[topic] = counts.get(topic, 0) + 1
    return counts

In [None]:
count_topic_distribution(dataset, "train")

{'1.2527726?utm_source=FlexiTM': 1,
 'panorama': 21657,
 'sport': 48831,
 'bildung': 2516,
 '1.2699904?SRCmuc=FlexiTM': 1,
 'muenchen': 4698,
 'auto': 6342,
 'karriere': 6340,
 'app': 6,
 'mahjong': 1,
 'service': 819,
 'kultur': 1,
 'politik': 67031,
 'stil': 2705,
 'reise': 7088,
 'digital': 9322,
 'tiananmen': 1,
 'wirtschaft': 35298,
 'thema': 24,
 'reisefuehrer': 49,
 'geld': 8156}

In [None]:
count_topic_distribution(dataset, "test")

{'reise': 218,
 'panorama': 539,
 'digital': 235,
 'auto': 182,
 'sport': 2442,
 'bildung': 127,
 'muenchen': 557,
 'karriere': 124,
 'wirtschaft': 2352,
 'stil': 208,
 'geld': 103,
 'politik': 3614}

In [None]:
count_topic_distribution(dataset, "validation")

{'reise': 204,
 'panorama': 726,
 'digital': 254,
 'auto': 218,
 'sport': 2541,
 'bildung': 111,
 'muenchen': 547,
 'karriere': 149,
 'wirtschaft': 2491,
 'stil': 205,
 'thema': 1,
 'geld': 100,
 'politik': 3847}

## Distribution over publishers

In [None]:
import re

In [None]:
def count_publisher_distribution(dataset, split_name):
  """Collect the distinct publisher host names found in a split's URLs.

  Args:
    dataset: Mapping from split name to an iterable of samples; each sample's
      URL is read with ``item["url"]``.
    split_name: Key of the split to inspect, e.g. "train".

  Returns:
    set of host names, i.e. the run of letters, digits, ``_``, ``.`` and ``-``
    directly following ``http://`` or ``https://``. URLs that contain no such
    prefix are skipped. An empty split yields an empty set.
  """
  host_pattern = re.compile(r"https?://([A-Za-z_0-9.-]+)")
  publishers = set()
  for item in dataset[split_name]:
    match = host_pattern.search(item["url"])
    # re.search returns None when the URL has no http(s) host part; skip those
    # instead of failing on None.group().
    if match is not None:
      publishers.add(match.group(1))
  # Return only after every sample has been visited, not after the first one.
  return publishers

In [None]:
count_publisher_distribution(dataset, "train")

{'www.sueddeutsche.de'}

## Length Check

In [None]:
def get_string_length(dataset, split_name, col):
  """Summarise the lengths of the values in one column of a split.

  Args:
    dataset: Mapping from split name to a split whose columns are read with
      ``split[col]``.
    split_name: Key of the split to inspect, e.g. "train".
    col: Name of the column whose values are measured with ``len``.

  Returns:
    Tuple ``(shortest, longest, mean)`` where ``mean`` is the average length
    rounded with the built-in ``round``.
  """
  lengths = sorted(map(len, dataset[split_name][col]))
  # After sorting, the extremes sit at both ends of the list.
  shortest, longest = lengths[0], lengths[-1]
  mean_length = round(sum(lengths) / len(lengths))
  return shortest, longest, mean_length

In [None]:
get_string_length(dataset, "train", "summary")

(47, 1389, 192)

In [None]:
get_string_length(dataset, "train", "title")

(4, 133, 47)

## Title = Summary?

In [None]:
# TODO: test whether any titles are used as summaries

## Distribution over time



In [None]:
def get_time_distribution(dataset, split_name):
  """Return the distinct values of the "date" column of a split.

  Args:
    dataset: Mapping from split name to a split whose "date" column is read
      with ``split["date"]``.
    split_name: Key of the split to inspect, e.g. "validation".

  Returns:
    set containing every distinct date value found in the column.
  """
  dates = dataset[split_name]["date"]
  return {date for date in dates}

In [None]:
get_time_distribution(dataset, "validation")

{'00/01/2019', '00/02/2019', '00/03/2019', '00/04/2019', '00/05/2019'}