* Denna Notebook [26_aikstats.ipynb](https://github.com/salgo60/ifkdb/blob/main/Notebook/26_aikstats.ipynb)
* [#26](https://github.com/salgo60/ifkdb/issues/26)

* AIK
   * sv:Wikipedia [Kategori:Fotbollsspelare_i_AIK](https://sv.wikipedia.org/wiki/Kategori:Fotbollsspelare_i_AIK)
   * https://www.aikstats.se/player.php
   * Wikidata SPARQL https://w.wiki/J3Rk

In [1]:
from datetime import datetime
# Record when this run began; the final report cell subtracts start_time
# from its own end time to print the total running time.
start_time  = datetime.now()
print("Last run: ", start_time)

Last run:  2026-02-28 17:14:46.597812


In [2]:
import requests
import urllib3

# Show the installed HTTP library versions so the captured output documents
# the environment this run used.
print("requests:", requests.__version__)
print("urllib3:", urllib3.__version__)

requests: 2.32.5
urllib3: 2.6.3


In [3]:
# If needed:
# !pip install requests beautifulsoup4 pandas lxml mwclient tqdm rapidfuzz

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from rapidfuzz import fuzz, process
import mwclient

In [4]:
import warnings
# Silence DeprecationWarning for the rest of the session to keep cell output
# readable. NOTE(review): this hides deprecations from every library, not
# just one — confirm that is intended.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import random
from tqdm import tqdm
import os
import json
import time
import requests
from bs4 import BeautifulSoup

# BASE_URL is the listing page that get_all_player_ids() scans for player
# links; PLAYER_URL is the per-player page template, filled in with an id.
BASE_URL = "https://www.aikstats.se/player.php?who=all"
PLAYER_URL = "https://www.aikstats.se/player.php?id={}"
# User-Agent sent with every request; includes a contact address.
HEADERS = {"User-Agent": "Mozilla/5.0 (research bot; contact: salgo60@msn.com)"}

# JSON file used as the on-disk cache of scraped players.
OUTPUT_FILE = "players.json"
FORCE_RESCRAPE = False      # <-- set to True to ignore the cache and re-scrape everything
# NOTE(review): drawn once when this cell runs and not referenced by any of
# the visible cells — the scrape loop draws its own per-request delay.
# Confirm whether this constant is still needed.
DELAY = random.uniform(3, 39)
# Shared session so every request carries HEADERS.
session = requests.Session()
session.headers.update(HEADERS)


def get_all_player_ids():
    """Return the sorted, de-duplicated player ids linked from the listing page.

    Downloads BASE_URL through the shared ``session`` and inspects every
    ``<a href=...>`` on the page. Links of the form ``player.php?id=<digits>``
    contribute one id each; links whose id is not purely numeric are ignored.

    Returns:
        list[int]: unique player ids in ascending order.

    Raises:
        requests.HTTPError: if the listing page responds with an error status
            (raised by ``raise_for_status``).
    """
    import re  # local import so this block does not depend on other cells' imports

    # The id must be followed by end-of-string, '&' or '#'. This accepts links
    # that carry extra query parameters or a fragment (previously dropped
    # silently because int() failed on the trailing text) while still
    # rejecting malformed ids such as "12abc".
    id_pattern = re.compile(r"player\.php\?id=(\d+)\s*(?:[&#]|$)")

    response = session.get(BASE_URL, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    ids = set()
    for link in soup.find_all("a", href=True):
        match = id_pattern.search(link["href"])
        if match:
            ids.add(int(match.group(1)))

    return sorted(ids)


def load_existing():
    """Return the cached player records from OUTPUT_FILE, or an empty dict.

    An empty dict is returned when the cache file does not exist or when
    FORCE_RESCRAPE is set; otherwise the file is parsed as UTF-8 JSON.
    """
    if not os.path.exists(OUTPUT_FILE) or FORCE_RESCRAPE:
        return {}

    with open(OUTPUT_FILE, "r", encoding="utf-8") as cache_file:
        cached_players = json.load(cache_file)
    return cached_players


def save_data(data):
    """Write ``data`` to OUTPUT_FILE as indented UTF-8 JSON, atomically.

    The JSON is first written to a sibling temporary file and then moved over
    OUTPUT_FILE with ``os.replace``. The scrape loop calls this after every
    player, so writing in place meant an interrupted run could leave a
    truncated file that ``json.load`` cannot parse on the next start.

    Args:
        data: JSON-serialisable mapping of player id (as string) to record.
    """
    tmp_path = OUTPUT_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # Swap the complete file into place in a single rename step.
    os.replace(tmp_path, OUTPUT_FILE)


def scrape_player(pid):
    """Fetch one player's page and return a minimal record for it.

    Args:
        pid: player id substituted into PLAYER_URL.

    Returns:
        dict with keys ``id`` (the given pid), ``name`` (stripped text of the
        first ``<h1>``, or None when the page has none) and ``url``.

    Raises:
        requests.HTTPError: if the page responds with an error status.
    """
    page_url = PLAYER_URL.format(pid)
    response = session.get(page_url, timeout=15)
    response.raise_for_status()

    # Minimal example – extend with further fields as needed.
    heading = BeautifulSoup(response.text, "lxml").find("h1")
    if heading:
        player_name = heading.get_text(strip=True)
    else:
        player_name = None

    return {"id": pid, "name": player_name, "url": page_url}

# ===============================
# Run
# ===============================

player_ids = get_all_player_ids()
print(f"Hittade {len(player_ids)} spelare")

data = load_existing()

# Ids still to fetch: all of them on a forced re-scrape, otherwise only the
# ids that have no entry in the cache (cache keys are strings).
if FORCE_RESCRAPE:
    remaining_ids = list(player_ids)
else:
    remaining_ids = [pid for pid in player_ids if str(pid) not in data]

print(f"Återstår att hämta: {len(remaining_ids)}")

for pid in tqdm(remaining_ids, desc="Scraping", unit="player"):
    try:
        player_data = scrape_player(pid)
        data[str(pid)] = player_data
        # Persist after every player so an interrupted run can resume.
        save_data(data)

        # Random delay drawn per request (not one global value).
        time.sleep(random.uniform(3, 6))

        # Longer pause whenever the stored count hits a multiple of 50.
        if not len(data) % 50:
            time.sleep(random.uniform(15, 30))
    except Exception as exc:
        # Best effort: report the failure, back off, continue with the next id.
        tqdm.write(f"Fel vid {pid}: {exc}")
        time.sleep(15)

print("Klart.")

Hittade 1307 spelare
Återstår att hämta: 1301


Scraping:   9%|██▏                     | 118/1301 [09:57<1:36:23,  4.89s/player]

In [None]:
def get_player_info(pid):
    """Fetch one player's page and return its id, name and URL.

    Args:
        pid: player id substituted into PLAYER_URL.

    Returns:
        dict with keys ``id``, ``name`` (stripped text of the first ``<h1>``,
        or None when the page has none) and ``url``.

    Raises:
        requests.HTTPError: if the page responds with an error status.
    """
    # Build the address from PLAYER_URL. BASE_URL already ends in "?who=all",
    # so appending "?id=<pid>" to it produced "player.php?who=all?id=<pid>",
    # which carries no separate id parameter.
    url = PLAYER_URL.format(pid)
    # Use the shared session (it already carries HEADERS) with a timeout so a
    # stalled connection cannot hang the loop indefinitely.
    r = session.get(url, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    # Adjust the selector if the page structure changes.
    h1 = soup.find("h1")
    name = h1.text.strip() if h1 else None

    return {
        "id": pid,
        "name": name,
        "url": url
    }

# Build the AIK player table. Records already scraped into `data` (keyed by
# the id as a string) are reused, so only ids missing from that cache trigger
# a new request instead of downloading every player page a second time.
players = []

for pid in tqdm(player_ids):
    cached = data.get(str(pid))
    if cached is not None:
        players.append(cached)
        continue
    try:
        info = get_player_info(pid)
        players.append(info)
        time.sleep(0.5)  # polite scraping: pause between real requests only
    except Exception as e:
        # tqdm.write keeps the message from breaking the progress bar.
        tqdm.write(f"Fel vid {pid}: {e}")

df_aik = pd.DataFrame(players)
df_aik.head()

In [None]:
# Collect the members of the Swedish Wikipedia category for AIK footballers.
site = mwclient.Site("sv.wikipedia.org")
category = site.categories["Fotbollsspelare i AIK"]

# Keep articles only (namespace 0).
wiki_players = [
    member.name
    for member in category.members()
    if member.namespace == 0
]

df_wiki = pd.DataFrame({"wiki_name": wiki_players})
print(f"Hittade {len(df_wiki)} Wikipedia-spelare")
df_wiki.head()

In [None]:
from datetime import datetime

# Run report: start time, end time and elapsed wall-clock time, measured
# from the start_time captured in the first cell.
end_time = datetime.now()
duration = end_time - start_time

timestamp_format = "%Y-%m-%d %H:%M:%S"

print("\n===== Körningsrapport =====")
print("Starttid :", start_time.strftime(timestamp_format))
print("Sluttid  :", end_time.strftime(timestamp_format))

# Break the whole-second duration into hours, minutes and seconds.
total_seconds = int(duration.total_seconds())
hours, leftover = divmod(total_seconds, 3600)
minutes, seconds = divmod(leftover, 60)

print(f"Körtid   : {hours}h {minutes}m {seconds}s")
print("===========================\n")