In [3]:
from datasets import Dataset, load_dataset
import random

In [4]:
# Load the train split of the top-200k-views dataset.
# The top-100k variant ("rasdani/cohere-wikipedia-2023-11-de-top100k-views")
# can be used instead by swapping the repo id below.
source_repo = "rasdani/cohere-wikipedia-2023-11-de-top200k-views"
ds = load_dataset(source_repo, split="train")
ds

Downloading readme:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 57.7M/57.7M [00:02<00:00, 22.8MB/s]


Generating train split:   0%|          | 0/200000 [00:00<?, ? examples/s]

Dataset({
    features: ['_id', 'url', 'title', 'text', 'views'],
    num_rows: 200000
})

In [5]:
# Materialise the dataset as a pandas DataFrame so it can be grouped
df = ds.to_pandas()

# Count the rows (paragraphs) of every title, keeping first-appearance order,
# then keep only the titles that have at least 5 of them
rows_per_title = df.groupby('title', sort=False).size()
grouped_df = rows_per_title.reset_index(name='count')
has_enough_rows = grouped_df['count'] >= 5
filtered_df = grouped_df.loc[has_enough_rows]

# Show the titles that passed the threshold
filtered_df


Unnamed: 0,title,count
0,Deutschland,245
1,Österreich,370
2,Schweiz,417
3,Berlin,282
4,XHamster,35
...,...,...
2308,Kommunikation,63
2309,Mangold,13
2310,Kosaken,67
2311,Eiger-Nordwand,99


In [6]:
# Keep only the rows whose title appears in filtered_df.
# The eligible titles are collected once into a set, so each example costs a
# single hash lookup instead of rebuilding and linearly scanning a list of
# titles for every one of the dataset's rows.
eligible_titles = set(filtered_df['title'])
ds_filtered = ds.filter(lambda x: x['title'] in eligible_titles)
ds_filtered

Filter:   0%|          | 0/200000 [00:00<?, ? examples/s]

Dataset({
    features: ['_id', 'url', 'title', 'text', 'views'],
    num_rows: 199986
})

In [7]:
# Number of distinct titles in the unfiltered dataset
distinct_titles = set(ds["title"])
len(distinct_titles)

2313

In [8]:
# Derive a reproducible 32-bit seed from the bytes of "ellamind".
# The byte order is spelled out because int.from_bytes() only gained a default
# ("big") in Python 3.11; on older interpreters omitting it raises TypeError.
seed = int.from_bytes(b"ellamind", byteorder="big") % (2**32)
random.seed(seed)

In [9]:
def select_random_window(group, window_size=5):
    """Pick a random run of consecutive rows from one group.

    Args:
        group: DataFrame holding the rows of a single group, in order.
        window_size: Number of consecutive rows to return. Defaults to 5.

    Returns:
        A slice of ``group`` with ``window_size`` consecutive rows starting at a
        randomly drawn position, or ``None`` when the group has fewer rows.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.

    The start position is drawn from the module-level ``random`` generator, so
    the result depends on the state set by ``random.seed``.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")
    if len(group) < window_size:
        return None
    # randint is inclusive on both ends, so the last valid start is len - window_size
    start_index = random.randint(0, len(group) - window_size)
    return group.iloc[start_index:start_index + window_size]

# Group by 'title' and draw one random window of consecutive paragraphs per
# title. Groups for which the function returns None are left out of the result.
# The columns are selected explicitly so that 'title' is still passed to the
# function: letting apply() operate on the grouping column implicitly is
# deprecated in pandas and emits a warning.
# dropna() additionally removes any remaining row that contains a missing value.
grouped_df = (
    df.groupby('title', sort=False)[list(df.columns)]
    .apply(select_random_window)
    .dropna()
)

# Convert the sampled windows back to a dataset
filtered_ds = Dataset.from_pandas(grouped_df.reset_index(drop=True))

# Display the sampled dataset
filtered_ds

  grouped_df = df.groupby('title', sort=False).apply(select_random_window).dropna()


Dataset({
    features: ['_id', 'url', 'title', 'text', 'views'],
    num_rows: 11540
})

In [10]:
# Convert the sampled dataset back to a pandas DataFrame.
# NOTE: this rebinds `df`, which until now held the frame built from `ds`.
df = filtered_ds.to_pandas()

def assign_scores(group):
    """Attach a 'score' column to a 5-row window of paragraphs.

    The middle row is scored 1.0 and the two rows on either side 0.5.

    Args:
        group: DataFrame holding the rows of a single group, in order.

    Returns:
        A new DataFrame equal to ``group`` plus a 'score' column, or ``None``
        when the group has fewer than 5 rows. ``group`` itself is not modified.

    Raises:
        ValueError: If the group has more than 5 rows, because the fixed score
            pattern cannot be aligned with it.
    """
    scores = [0.5, 0.5, 1.0, 0.5, 0.5]  # Scores for the paragraphs
    if len(group) < len(scores):
        return None
    if len(group) > len(scores):
        raise ValueError(
            f"expected a group of exactly {len(scores)} rows, got {len(group)}"
        )
    # assign() returns a copy, so the caller's frame is left untouched
    return group.assign(score=scores)

# Group by 'title' and score each group. Groups for which the function returns
# None are left out of the result.
# The columns are selected explicitly so that 'title' is still passed to the
# function: letting apply() operate on the grouping column implicitly is
# deprecated in pandas and emits a warning.
# dropna() additionally removes any remaining row that contains a missing value.
grouped_df = (
    df.groupby('title', sort=False)[list(df.columns)]
    .apply(assign_scores)
    .dropna()
    .reset_index(drop=True)
)

# Display the DataFrame with the new 'score' column
grouped_df

  grouped_df = df.groupby('title', sort=False).apply(assign_scores).dropna().reset_index(drop=True)


Unnamed: 0,_id,url,title,text,views,score
0,20231101.de_1497705_123,https://de.wikipedia.org/wiki/Deutschland,Deutschland,Der Bundestag machte 1991 Berlin zur Hauptstad...,36234.107805,0.5
1,20231101.de_1497705_124,https://de.wikipedia.org/wiki/Deutschland,Deutschland,Bei der Bundestagswahl 1998 verlor Kohls schwa...,36234.107805,0.5
2,20231101.de_1497705_125,https://de.wikipedia.org/wiki/Deutschland,Deutschland,Die zweite Amtszeit Schröders ab 2002 war von ...,36234.107805,1.0
3,20231101.de_1497705_126,https://de.wikipedia.org/wiki/Deutschland,Deutschland,Angela Merkel beendete die letzte ihrer vier A...,36234.107805,0.5
4,20231101.de_1497705_127,https://de.wikipedia.org/wiki/Deutschland,Deutschland,Der russische Überfall auf die Ukraine 2022 fü...,36234.107805,0.5
...,...,...,...,...,...,...
11535,20231101.de_33549_93,https://de.wikipedia.org/wiki/Villingen-Schwen...,Villingen-Schwenningen,"Mit den Gemeinden Brigachtal, Dauchingen, Mönc...",12652.823342,0.5
11536,20231101.de_33549_94,https://de.wikipedia.org/wiki/Villingen-Schwen...,Villingen-Schwenningen,Nach der Kommunalabfrage 2007 des Bundes der S...,12652.823342,0.5
11537,20231101.de_33549_95,https://de.wikipedia.org/wiki/Villingen-Schwen...,Villingen-Schwenningen,"2006 insgesamt 111,3 Millionen Euro Schulden (...",12652.823342,1.0
11538,20231101.de_33549_96,https://de.wikipedia.org/wiki/Villingen-Schwen...,Villingen-Schwenningen,"2007 insgesamt 122,3 Millionen Euro Schulden (...",12652.823342,0.5


In [11]:
# Wrap the scored DataFrame in a Dataset so it can be pushed to the Hub
ds_upload = Dataset.from_pandas(grouped_df)
ds_upload



Dataset({
    features: ['_id', 'url', 'title', 'text', 'views', 'score'],
    num_rows: 11540
})

In [12]:
# Spot-check a single record (index 4) of the scored dataset
ds_upload[4]

{'_id': '20231101.de_1497705_127',
 'url': 'https://de.wikipedia.org/wiki/Deutschland',
 'title': 'Deutschland',
 'text': 'Der russische Überfall auf die Ukraine 2022 führte zu umfassenden Wirtschaftssanktionen des Westens gegen Russland, an denen sich auch Deutschland beteiligte. Unter anderem stoppte Deutschland die Inbetriebnahme der Gaspipeline Nord Stream 2. Die deutsche Wirtschaft, die sich von russischem Gas abhängig gemacht hatte, musste eine starke Teuerung im Energiesektor hinnehmen. Deutschland unterstützte die Ukraine innerhalb des ersten halben Jahres mit Waffen im Wert von mehreren Millionen Euro sowie der Ausbildung ukrainischer Soldaten.',
 'views': 36234.107805126274,
 'score': 0.5}

In [13]:
# Publish the full scored dataset to the Hub
scores_repo = "rasdani/cohere-wikipedia-2023-11-de-scores"
ds_upload.push_to_hub(scores_repo)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/442 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/rasdani/cohere-wikipedia-2023-11-de-scores/commit/fb01311a1fb32839f660c3b75fee3a73c6fb8745', commit_message='Upload dataset', commit_description='', oid='fb01311a1fb32839f660c3b75fee3a73c6fb8745', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
def _has_top_score(example):
    """Return True for rows whose score is 1.0."""
    return example["score"] == 1.0


# Keep only the rows that carry the top score
ds_positives = ds_upload.filter(_has_top_score)
ds_positives

Filter:   0%|          | 0/11540 [00:00<?, ? examples/s]

Dataset({
    features: ['_id', 'url', 'title', 'text', 'views', 'score'],
    num_rows: 2308
})

In [15]:
# Publish the positives-only dataset to the Hub
positives_repo = "rasdani/cohere-wikipedia-2023-11-de-positives"
ds_positives.push_to_hub(positives_repo)



Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/rasdani/cohere-wikipedia-2023-11-de-positives/commit/02a3200af2aae82f4eb6302691fecfa2485f3b1c', commit_message='Upload dataset', commit_description='', oid='02a3200af2aae82f4eb6302691fecfa2485f3b1c', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
# Publish a small debug subset: shuffle with the notebook seed, keep the first 200 rows
debug_size = 200
shuffled_positives = ds_positives.shuffle(seed=seed)
debug_subset = shuffled_positives.select(range(debug_size))
debug_subset.push_to_hub("rasdani/cohere-wikipedia-2023-11-de-positives-debug")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/460 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/rasdani/cohere-wikipedia-2023-11-de-positives-debug/commit/77b536fc4f30725d85035472b5cdfd2fb4f1482b', commit_message='Upload dataset', commit_description='', oid='77b536fc4f30725d85035472b5cdfd2fb4f1482b', pr_url=None, pr_revision=None, pr_num=None)