<a href="https://colab.research.google.com/github/shandrayu/mining-massive-databases/blob/main/notebooks/wiki_data_fetching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wiki data fetching

In [1]:
!pip install sseclient


Collecting sseclient
  Downloading sseclient-0.0.27.tar.gz (7.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sseclient
  Building wheel for sseclient (setup.py) ... [?25l[?25hdone
  Created wheel for sseclient: filename=sseclient-0.0.27-py3-none-any.whl size=5563 sha256=c883bafcc17329108394c944d0111f0bd29dc9e08f8d40a21136815b4f5f966e
  Stored in directory: /root/.cache/pip/wheels/60/57/0e/09b1264923280e935a34cc543b7f147f5df12490bd7a992f42
Successfully built sseclient
Installing collected packages: sseclient
Successfully installed sseclient-0.0.27


In [2]:
from sseclient import SSEClient as EventSource
import time
import json


def hash_user(wiki, id):
    """Return a hash key identifying a user within a specific wiki.

    Args:
        wiki: Wiki identifier the user belongs to.
        id: User identifier within that wiki (the name shadows the ``id``
            builtin, but is kept so keyword callers continue to work).

    Returns:
        int: ``hash`` of the ``(wiki, id)`` pair. Python salts string hashes
        per process, so the value is only comparable within one session.
    """
    user_key = (wiki, id)
    return hash(user_key)

def fetch_users(event_source, fetching_time_sec, message_types, wikis, event_decimation=5, print_log=False):
    """Yield change events from a sampled subset of the users seen in a stream.

    Every previously unseen user (keyed by ``hash_user(wiki, user)``) is
    assigned once to either the observed or the skipped group: the 1st,
    (1 + event_decimation)-th, ... new users are observed, the rest are
    skipped. All matching events of observed users are yielded.

    Args:
        event_source: Iterable of events exposing a JSON string in ``.data``.
        fetching_time_sec: Time budget in seconds. The budget is checked each
            time an event arrives, so iteration stops on the first event
            received after the budget has elapsed.
        message_types: Container of accepted values for the event's "type".
        wikis: Container of accepted values for the event's "wiki".
        event_decimation: Keep one out of this many new users. Must be >= 1.
        print_log: When True, print the sampling decision for every event.

    Yields:
        dict: The decoded change event for each accepted event of an
        observed user.
    """
    observed_users = set()
    skipped_users = set()
    user_counter = 0
    # monotonic() is immune to wall-clock adjustments while measuring elapsed time.
    start_time = time.monotonic()
    for event in event_source:
        # Check the budget on every event, not only on matching ones; otherwise
        # a stream with no matching events would never terminate.
        if time.monotonic() - start_time > fetching_time_sec:
            break

        try:
            change = json.loads(event.data)
        except (TypeError, ValueError):
            # Empty or malformed payload (JSONDecodeError is a ValueError).
            continue

        if not isinstance(change, dict):
            # Valid JSON, but not an object we can inspect.
            continue

        wiki = change.get("wiki")
        user_name = change.get("user")
        if change.get("type") not in message_types or wiki not in wikis or user_name is None:
            # Event of no interest, or lacking the fields needed to sample it.
            continue

        user = hash_user(wiki, user_name)
        if user in observed_users:
            if print_log:
                print(f"Message from user {user} saved")
            save_user = True
        elif user in skipped_users:
            if print_log:
                print(f"Message from user {user} skipped")
            save_user = False
        else:
            # First event from this user: decide once which group they join.
            save_user = user_counter % event_decimation == 0
            if save_user:
                if print_log:
                    print(f"+ Add new user {user}")
                observed_users.add(user)
            else:
                if print_log:
                    print(f"- Add Skip user {user}")
                skipped_users.add(user)
            user_counter += 1

        if save_user:
            yield change

# Recent-changes event stream served by Wikimedia; consumed via the SSE client.
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
message_types = {"edit"}
wikis = {"enwiki", "wikidatawiki"}
# Sort before joining: iteration order of a set of strings can differ between
# interpreter runs, which would make the output filename non-reproducible.
wikis_str = "_".join(sorted(wikis))
fetching_time_sec = 200
# Drain the generator; this consumes the stream until the time budget runs out.
changes = list(fetch_users(EventSource(url), fetching_time_sec, message_types, wikis))

print(f"Collected {len(changes)} edits")

# Persist the sampled edits as a single JSON array.
with open(f"changes_{wikis_str}_{fetching_time_sec}_sec.json", "w", encoding="utf-8") as json_file:
    json.dump(changes, json_file, ensure_ascii=True, indent=4)

Collected 430 edits


## Description

There are several message types. We are interested only in type "edit".

Sample 20% of the stream by user: we record all edits from every 5th new user seen in the stream. Edits from all other users are skipped.

Wiki:
- enwiki
- wikidatawiki