In [9]:
import json

# Load the first batch of records. JSON is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform's locale default.
with open('qid_0-1000000.json', 'r', encoding='utf-8') as f:
    j0 = json.load(f)


In [10]:
# Spot-check one record near the end of the first batch (32nd from the last)
j0[-32]

{'item': 'http://www.wikidata.org/entity/Q1787900',
 'article': 'https://de.wikipedia.org/wiki/Ro%C3%9Fbach_(Pfalz)'}

In [11]:
# Load the second batch of records. JSON is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform's locale default.
with open('qid_1000000-2000000.json', 'r', encoding='utf-8') as f:
    j1 = json.load(f)

In [12]:
# Spot-check one record near the end of the second batch (100th from the last)
j1[-100]

{'item': 'http://www.wikidata.org/entity/Q1024796',
 'article': 'https://de.wikipedia.org/wiki/Municipio_Cabezas'}

In [13]:
# Concatenate both batches into one list of records and show how many there are
qids = [*j0, *j1]
len(qids)



3000000

In [14]:
from datasets import Dataset

# Pivot the list of record dicts into column lists and build a Hugging Face
# dataset with exactly two features: 'item' and 'article'
ds = Dataset.from_dict(
    {column: [record[column] for record in qids] for column in ("item", "article")}
)



In [15]:
# Show the first row to confirm the 'item' / 'article' columns look as expected
ds[0]

{'item': 'http://www.wikidata.org/entity/Q51',
 'article': 'https://de.wikipedia.org/wiki/Antarktika'}

In [16]:
def extract_qid(item_url: str) -> str:
    """Return the last path segment of an entity URL (the QID).

    Args:
        item_url: URL such as 'http://www.wikidata.org/entity/Q51'.

    Returns:
        The text after the final '/', e.g. 'Q51'. A string that contains no
        '/' is returned unchanged.
    """
    # Drop trailing slashes first so '.../entity/Q51/' yields 'Q51' instead of
    # an empty string, then take whatever follows the last remaining '/'.
    return item_url.rstrip("/").rsplit("/", 1)[-1]

# Overwrite every row's 'item' value (a full entity URL) with just its QID
ds = ds.map(
    lambda row: {"item": extract_qid(row["item"])}
)


Map: 100%|██████████| 3000000/3000000 [00:32<00:00, 90912.50 examples/s]


In [17]:
# Display the dataset summary (feature names and row count)
ds

Dataset({
    features: ['item', 'article'],
    num_rows: 3000000
})

In [18]:
# The 'article' column holds the Wikipedia page URL, so call it 'url'
ds = ds.rename_column(original_column_name="article", new_column_name="url")


In [19]:
# Rename the 'item' column to 'qid'
ds = ds.rename_column(original_column_name="item", new_column_name="qid")

In [20]:
# Show the first row to confirm the columns are now 'qid' and 'url'
ds[0]

{'qid': 'Q51', 'url': 'https://de.wikipedia.org/wiki/Antarktika'}

In [21]:
from collections import Counter

# Tally how many rows carry each 'qid'. The column is read directly rather
# than iterating the dataset row by row, which would build a full dict
# (every column) for each row just to read a single field.
qid_counts = Counter(ds['qid'])

# Filter out unique entries for demonstration
# duplicates = {qid: count for qid, count in qid_counts.items() if count > 1}

# print(f"Total duplicate entries: {len(duplicates)}")
# for qid, count in duplicates.items():
#     print(f"QID: {qid}, Count: {count}")


In [22]:
# Keep only rows whose 'qid' occurs exactly once in the dataset. Every row of
# a repeated qid is dropped, not just the extra copies.
# NOTE(review): confirm that discarding all copies (rather than keeping one) is intended.
ds = ds.filter(lambda row: qid_counts[row['qid']] == 1)
ds

Filter: 100%|██████████| 3000000/3000000 [00:03<00:00, 810270.98 examples/s]


Dataset({
    features: ['qid', 'url'],
    num_rows: 2998208
})

In [23]:
import pandas as pd
from datasets import Dataset

# Read the tab-separated rank file; it has no header row, so the two columns
# are named explicitly
df = pd.read_csv(
    "2024-02-06.allwiki.links.rank",
    sep="\t",
    header=None,
    names=["qid", "rank"],
)

# Wrap the DataFrame as a Hugging Face Dataset and display its summary
ranks = Dataset.from_pandas(df)
ranks

Dataset({
    features: ['qid', 'rank'],
    num_rows: 24600464
})

In [24]:
# Show the first row of the rank table
ranks[0]

{'qid': 'Q4048908', 'rank': 154115.4560145852}

In [25]:
# Move to pandas for the join
ds_df = ds.to_pandas()

# Left join on 'qid': every row of 'ds_df' is kept and gains a 'rank' column;
# qids with no match in 'ranks' get a missing rank
merged_df = pd.merge(left=ds_df, right=ranks.to_pandas(), on='qid', how='left')

# Back to a Hugging Face Dataset; display its summary
ds = Dataset.from_pandas(merged_df)
ds


Dataset({
    features: ['qid', 'url', 'rank'],
    num_rows: 2998208
})

In [26]:
# Show the first row to confirm the 'rank' column was attached
ds[0]

{'qid': 'Q51',
 'url': 'https://de.wikipedia.org/wiki/Antarktika',
 'rank': 2653.8168792123374}

In [27]:
# Order rows by 'rank' in descending order (highest rank first)
ds = ds.sort("rank", reverse=True)

In [28]:
# Keep the first 20,000 rows of the current ordering. The count is clamped to
# the dataset length so a smaller dataset is kept whole instead of raising on
# out-of-range indices.
ds = ds.select(range(min(20000, len(ds))))



In [29]:
import urllib.parse

# Function to normalize URLs
def normalize_url(url):
    """Return `url` with its path percent-encoding put into a canonical form.

    The path is fully percent-decoded and then re-encoded with
    `urllib.parse.quote` (default safe characters, so '/' stays literal while
    characters such as ':' or '(' become escapes). Scheme, host, params, query
    and fragment are passed through unchanged.
    """
    scheme, netloc, path, params, query, fragment = urllib.parse.urlparse(url)
    # Decode first so already-escaped input is not escaped a second time
    canonical_path = urllib.parse.quote(urllib.parse.unquote(path))
    return urllib.parse.urlunparse(
        (scheme, netloc, canonical_path, params, query, fragment)
    )

# Rewrite each row's 'url' with its normalized form
ds = ds.map(
    lambda row: {"url": normalize_url(row["url"])}
)


Map: 100%|██████████| 20000/20000 [00:00<00:00, 37574.13 examples/s]


In [30]:
# Show the first five rows (returned as a dict of column lists)
ds[:5]

{'qid': ['Q4048908', 'Q565', 'Q1860', 'Q30', 'Q22664'],
 'url': ['https://de.wikipedia.org/wiki/Kategorie%3AKategorie%3AVersteckt',
  'https://de.wikipedia.org/wiki/Wikimedia_Commons',
  'https://de.wikipedia.org/wiki/Englische_Sprache',
  'https://de.wikipedia.org/wiki/Vereinigte_Staaten',
  'https://de.wikipedia.org/wiki/Geographische_Koordinaten'],
 'rank': [154115.4560145852,
  113863.52443551894,
  48414.26794242895,
  47926.79854769398,
  39933.91216185596]}

In [31]:
# from datasets import load_dataset

# wiki = load_dataset("wikimedia/wikipedia", "20231101.de", split="train")


In [32]:
# Index the rows by URL: each URL maps to a small dict with its 'qid' and 'rank'.
# If two rows share a URL, the later row replaces the earlier one.
article_to_info = {
    entry['url']: {field: entry[field] for field in ('qid', 'rank')}
    for entry in ds
}

In [33]:
# Persist the URL -> {'qid', 'rank'} lookup as JSON
with open('url_to_rank.json', 'w') as out_file:
    json.dump(article_to_info, out_file)