In [1]:
!pwd

/Users/rasdani/git/mmteb-wiki/mmteb_wiki


In [1]:
import requests
import os
from datetime import datetime, timedelta
import random
import time
import json

In [None]:

def download_pageviews(year):
    base_url = "https://dumps.wikimedia.org/other/pageviews/"
    start_date = datetime(year, 1, 1)
    end_date = datetime(year + 1, 1, 1)

    current_date = start_date
    while current_date < end_date:
        year_str = current_date.strftime('%Y')
        month_str = current_date.strftime('%m')
        day_str = current_date.strftime('%d')
        
        # Select one random hour
        hour_str = f"{random.randint(0, 23):02}"
        url = f"{base_url}{year_str}/{year_str}-{month_str}/pageviews-{year_str}{month_str}{day_str}-{hour_str}0000.gz"
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            dir_path = f"../data/pageviews/{year_str}/{year_str}-{month_str}"
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            file_path = f"../data/pageviews/{year_str}/{year_str}-{month_str}/pageviews-{year_str}{month_str}{day_str}-{hour_str}0000.gz"
            with open(file_path, 'wb') as f:
                f.write(response.raw.read())
            print(f"Downloaded {file_path}")
        else:
            print(f"Failed to download data for {url}")

        current_date += timedelta(days=1)

# Example usage
download_pageviews(2024)

In [11]:
import gzip
import sys
from collections import Counter, defaultdict
import math
import tqdm
import json


title_views = {}

#Score: Harmonic mean  (View_Day_1 * View_Day_2 * View_day_3)
# Add log for better numerical stabilitiy
# Add +1 to avoid log(0)
# Compare the sum, so that days without view are counted as 0 views
file_paths = ["../data/pageviews/2024/2024-01/pageviews-20240101-090000.gz"]
for filepath in tqdm.tqdm(file_paths):
    with gzip.open(filepath, "rt") as fIn:
        
        with open(filepath.replace(".gz", ".json"), "wt") as fOut:
            fOut.write(json.dumps(title_views))

        for line in fIn:
            splits = line.strip().split()
            if len(splits) == 4:
                lang, title, views, _ = line.strip().split()
                if lang == '""':
                    continue
                lang = lang.lower()

                if lang.endswith(".m"): #Add mobile page scores to main score
                    lang = lang[0:-2]
                
                if lang.count(".") > 0:
                    continue

                if lang not in title_views:
                    title_views[lang] = {}
                if title not in title_views[lang]:
                    title_views[lang][title] = 0.0

                title_views[lang][title] += math.log(int(views)+1)



#Save results
for lang in title_views:
    dir_path = f"../data/pageviews_summary"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    with open(f"{dir_path}/{lang}.json", "w") as fOut:
        fOut.write(json.dumps(title_views[lang]))



100%|██████████| 1/1 [00:04<00:00,  4.97s/it]


In [3]:
# with open("../data/pageviews_summary/de.json") as fIn:
with open("../data/pageviews_summary/final_total_views.json") as fIn:
    pageviews = json.load(fIn)
    pageviews_de = pageviews["de"]

len(pageviews_de), pageviews_de["Deutschland"]



(12880675, 36234.107805126274)

In [4]:
from datasets import load_dataset

ds = load_dataset("rasdani/cohere-wikipedia-2023-11-de", split="train")
ds

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

Dataset({
    features: ['_id', 'url', 'title', 'text'],
    num_rows: 20772081
})

In [11]:
pageviews_de["Berlin"]

28742.860991421174

In [5]:
ds_views = ds.map(lambda x: {"views": pageviews_de.get(x["title"], 0)})
ds_views





Map:   0%|          | 0/20772081 [00:00<?, ? examples/s]

Dataset({
    features: ['_id', 'url', 'title', 'text', 'views'],
    num_rows: 20772081
})

In [6]:
ds_views = ds_views.filter(lambda x: x["views"] is not None)
ds_views = ds_views.filter(lambda x: x["views"] > 4)
ds_views

Filter:   0%|          | 0/20772081 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20772081 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [9]:
ds_views[100]

{'_id': '20231101.de_9661014_1',
 'url': 'https://de.wikipedia.org/wiki/Adrian%20Viveash',
 'title': 'Adrian Viveash',
 'text': 'Adrian Viveash wurde 2007 Trainer von Cirencester Town bis ihm im Jahr 2008 angeboten wurde, die Chelsea Akademie zu coachen. Er engagierte sich schnell in den Altersgruppen U15 und U16 und wurde zum Manager der U16 für die Saison 2009/10 befördert. Er arbeitete auch eng mit U18 Manager Dermot Drummy zusammen.',
 'views': 0.0}

In [12]:
ds_views_sorted = ds_views.sort("views", reverse=True)
ds_views_sorted[:5]


{'_id': ['20231101.de_15751_0',
  '20231101.de_15751_1',
  '20231101.de_15751_2',
  '20231101.de_15751_3',
  '20231101.de_15751_4'],
 'url': ['https://de.wikipedia.org/wiki/Neujahr',
  'https://de.wikipedia.org/wiki/Neujahr',
  'https://de.wikipedia.org/wiki/Neujahr',
  'https://de.wikipedia.org/wiki/Neujahr',
  'https://de.wikipedia.org/wiki/Neujahr'],
 'title': ['Neujahr', 'Neujahr', 'Neujahr', 'Neujahr', 'Neujahr'],
 'text': ['Neujahr (auch Neujahrstag) ist der erste Tag des Kalenderjahres. Wegen der teils in einzelnen Kulturen und Religionen unterschiedlichen Zeitrechnungen und damit auch Kalender ist der Jahresbeginn zu unterschiedlichen Zeitpunkten. In nahezu allen Kulturen ist mit ihm ein Neujahrsfest mit dazugehörigen Bräuchen verbunden, oft ist er ein Feiertag. Die Wahl, auf welchen Tag der 1.\xa0Tag des Kalendersystems fällt, wird Kalenderstil genannt.',
  'Auf astronomischer Basis sind Neujahrsdaten für sonnengebundene Kalendersysteme (Solarkalender und Lunisolarkalender), d

In [16]:
ds_views_top100k = ds_views_sorted.select(range(100000))
ds_views_top100k


Dataset({
    features: ['_id', 'url', 'title', 'text', 'views'],
    num_rows: 100000
})

In [17]:
ds_views_top100k.push_to_hub("rasdani/cohere-wikipedia-2023-11-de-top100k-views")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/rasdani/cohere-wikipedia-2023-11-de-top100k-views/commit/844184b8413d099a15de9d20b2f1ef47eef810c7', commit_message='Upload dataset', commit_description='', oid='844184b8413d099a15de9d20b2f1ef47eef810c7', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
import pandas as pd
df = ds_views_top100k.to_pandas()
title_counts = df['title'].value_counts()
title_counts


title
Augsburg              471
Russland              458
Schweiz               417
Düsseldorf            400
Pazifikkrieg          385
                     ... 
Bibiza                  2
Kolleginnen             2
XXX                     1
Ayliva/Diskografie      1
Repression              1
Name: count, Length: 1138, dtype: int64

In [30]:
df['title'][130:140]



130    Silvester
131    Silvester
132    Silvester
133    Silvester
134    Silvester
135    Silvester
136    Silvester
137    Silvester
138    Silvester
139    Silvester
Name: title, dtype: object

In [22]:
title_counts[-20:]

title
Büchelkühn             5
Tream                  5
Polestar               5
Leony                  5
Grossstadtgeflüster    4
Knallerbse             4
Oliebol                4
2024                   4
Etsy                   4
4images                3
Grappa                 3
UTC−12                 3
Xnxx.com               3
Holz-Drache            3
Sylvester              2
Bibiza                 2
Kolleginnen            2
XXX                    1
Ayliva/Diskografie     1
Repression             1
Name: count, dtype: int64

In [34]:
collapsed_df = df.groupby('title').size()
collapsed_df


title
1924                   102
1I/ʻOumuamua            29
2-Euro-Gedenkmünzen     40
2023                    86
2024                     4
                      ... 
Äthiopien              168
Österreich             370
Österreich-Ungarn      185
Ötzi                    96
Übersetzer              20
Length: 1138, dtype: int64

In [38]:
collapsed_df.loc['Schweiz']


417

In [8]:
dir_path = "../data/pageviews_summary/2015"

# Iterate through all files in the directory

filenames = sorted(os.listdir(dir_path))
filenames = [f for f in filenames if f.endswith('.json')]
first = filenames[0]
file_path = os.path.join(dir_path, first)
with open(file_path, 'r') as file:
    total_views = json.load(file)
    print(list(total_views["de"].items())[100])
for filename in filenames[1:]:
    file_path = os.path.join(dir_path, filename)
    print(file_path)
    with open(file_path, 'r') as file:
        data = json.load(file)
    
    for title, views in data["de"].items():
        total_views["de"][title] = total_views["de"].get(title, 0) + views
    print(list(total_views["de"].items())[100])





('00_Schneider_–_Jagd_auf_Nihil_Baxter', 25.7529347097556)
../data/pageviews_summary/2015/06.json
('00_Schneider_–_Jagd_auf_Nihil_Baxter', 47.30616434163127)
../data/pageviews_summary/2015/07.json
('00_Schneider_–_Jagd_auf_Nihil_Baxter', 70.3716681374874)
../data/pageviews_summary/2015/08.json
('00_Schneider_–_Jagd_auf_Nihil_Baxter', 97.37736581573837)
../data/pageviews_summary/2015/09.json
('00_Schneider_–_Jagd_auf_Nihil_Baxter', 122.5549363805904)
../data/pageviews_summary/2015/10.json
('00_Schneider_–_Jagd_auf_Nihil_Baxter', 150.5050956676822)
../data/pageviews_summary/2015/11.json
('00_Schneider_–_Jagd_auf_Nihil_Baxter', 183.11491332604618)
../data/pageviews_summary/2015/12.json
('00_Schneider_–_Jagd_auf_Nihil_Baxter', 215.62942080460584)
