In [2]:
from datasets import load_dataset

# Open the 2022-03-01 English Wikipedia config in streaming mode
# (streaming=True), so samples are consumed by iterating over the split.
dataset = load_dataset('wikipedia', '20220301.en', streaming=True)

# Preview: show the title and the first 500 characters of the first 5 articles.
# zip() against range(5) stops the iteration after exactly five samples.
print("Streaming dataset samples:")
for i, sample in zip(range(5), dataset['train']):
    title = sample['title']
    snippet = sample['text'][:500]
    print(f"Title: {title}\nText snippet: {snippet}...\n")


Streaming dataset samples:
Title: Anarchism
Text snippet: Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong...

Title: Autism
Text snippet: Autism is a neurodevelopmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior. Parents often notice signs during the first three years of their child's life. These signs often develop gradually, though some autistic children experience regression in their communication and social skills after reaching developmental milestones at a

In [3]:
import time
from collections import Counter

from datasets import load_dataset

# Load the English Wikipedia config dated 2022-03-01 ('20220301.en') without
# streaming. This downloads and prepares the whole train split, so the cell is
# slow on a cold cache (the recorded run below took roughly 20 minutes).
print("Loading the Wikipedia dataset...")
start_time = time.perf_counter()  # monotonic clock; unaffected by wall-clock adjustments
dataset = load_dataset('wikipedia', '20220301.en')
print(f"Dataset loaded in {time.perf_counter() - start_time:.2f} seconds.")

# The Wikipedia dataset has several features, but we'll focus on the text
print("\nInitial sample from the dataset:")
print(dataset['train'][0]['text'][:500])  # Print the first 500 characters of the first article

# Keep only articles whose text contains the phrase "artificial intelligence"
# (case-insensitive substring match on the lower-cased text).
print("\nFiltering articles that mention 'artificial intelligence'...")
filtered_articles = dataset['train'].filter(
    lambda example: "artificial intelligence" in example['text'].lower()
)
num_filtered = len(filtered_articles)
print(f"Found {num_filtered} articles mentioning 'artificial intelligence'.")

# Gather both statistics in a single pass over the filtered articles so each
# row is only read once: total character count and word frequencies.
# Words are obtained by lower-casing and splitting on whitespace, so punctuation
# stays attached to tokens and common function words are not removed.
total_length = 0
word_counts = Counter()
for article in filtered_articles:
    text = article['text']
    total_length += len(text)
    word_counts.update(text.lower().split())

# Guard the average against an empty filter result to avoid ZeroDivisionError.
if num_filtered:
    average_length = total_length / num_filtered
    print(f"\nAverage length of articles mentioning 'artificial intelligence': {average_length:.2f} characters.")
else:
    average_length = 0.0
    print("\nNo articles matched; average length is undefined.")

print("\nMost common words in articles about 'artificial intelligence':")
for word, freq in word_counts.most_common(10):
    print(f"{word}: {freq}")

# Optional: persist the filtered subset so the filter pass does not have to be
# repeated in a later session.
# filtered_articles.save_to_disk('/path/to/save')  # Uncomment to save the filtered dataset


Loading the Wikipedia dataset...


Downloading data: 100%|██████████| 41/41 [17:26<00:00, 25.52s/files]    
Generating train split: 100%|██████████| 6458670/6458670 [02:23<00:00, 45141.63 examples/s]


Dataset loaded in 1204.52 seconds.

Initial sample from the dataset:
Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong

Filtering articles that mention 'artificial intelligence'...


Filter: 100%|██████████| 6458670/6458670 [04:24<00:00, 24395.46 examples/s]


Found 10800 articles mentioning 'artificial intelligence'.

Average length of articles mentioning 'artificial intelligence': 12753.35 characters.

Most common words in articles about 'artificial intelligence':
the: 1292561
of: 718071
and: 647243
in: 500756
to: 470172
a: 428683
is: 206518
for: 196077
as: 178468
that: 163385
