In [60]:
import datetime
import os
import shutil
import traceback
import urllib.request
from functools import partial
from pathlib import Path, PurePosixPath
from typing import Iterable, List, Optional, Tuple, Union
from urllib.parse import unquote, urlparse

import pyspark
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType

# Anything accepted as a filesystem path here: a plain string or an os.PathLike object.
PathLike = Union[str, os.PathLike]

In [61]:
# Obtain the Spark session for this notebook (application name 'EDA') and keep
# a handle on its SparkContext for the RDD-based download further down.
spark = (
    SparkSession.builder
    .appName('EDA')
    .getOrCreate()
)
sc = spark.sparkContext

In [62]:
def wikimedia_url(date: datetime.datetime) -> str:
    """Return the download URL of the hourly pageviews dump for ``date``.

    Only the year, month, day and hour of ``date`` are used; month, day and
    hour are zero-padded to two digits, and the minute/second part of the
    file name is always ``0000``.
    """
    yyyy = f"{date.year}"
    mm, dd, hh = (f"{part:02d}" for part in (date.month, date.day, date.hour))
    relative = f"{yyyy}/{yyyy}-{mm}/pageviews-{yyyy}{mm}{dd}-{hh}0000.gz"
    return "https://dumps.wikimedia.org/other/pageviews/" + relative

def wikimedia_local_file(date: datetime.datetime) -> Tuple[str, ...]:
    """Return the relative local location of the dump for ``date`` as path parts.

    The result is the last three components of the URL path produced by
    ``wikimedia_url(date)``, e.g.
    ``("2021", "2021-01", "pageviews-20210101-080000.gz")``.
    It is a tuple of strings, not a single string; callers join the parts
    onto their own download root.
    """
    url_path = urlparse(wikimedia_url(date)).path
    # Undo any percent-encoding before splitting the path into components.
    return PurePosixPath(unquote(url_path)).parts[-3:]

# Sanity checks: each case pairs a timestamp with the URL and the local path
# parts that the two helpers above are expected to produce for it.
for d, url, path in (
    (
        datetime.datetime(2021, 1, 1, 8),
        "https://dumps.wikimedia.org/other/pageviews/2021/2021-01/pageviews-20210101-080000.gz",
        ("2021", "2021-01", "pageviews-20210101-080000.gz"),
    ),
    (
        datetime.datetime(2021, 1, 16, 14),
        "https://dumps.wikimedia.org/other/pageviews/2021/2021-01/pageviews-20210116-140000.gz",
        ("2021", "2021-01", "pageviews-20210116-140000.gz"),
    ),
):
    assert url == wikimedia_url(d)
    assert path == wikimedia_local_file(d)

In [63]:
def datetime_range(
    start: datetime.datetime,
    end: datetime.datetime,
    interval: Optional[datetime.timedelta] = None,
) -> Iterable[datetime.datetime]:
    """Yield timestamps from ``start`` up to and including ``end``.

    Args:
        start: First timestamp to yield.
        end: Inclusive upper bound; no yielded value is ever later than this.
        interval: Step between consecutive timestamps. ``None`` (or a zero
            timedelta, which is falsy) falls back to one hour.

    Yields:
        ``start``, ``start + interval``, ... for as long as the value is
        ``<= end``. Nothing is yielded when ``start`` is already past ``end``.

    Raises:
        ValueError: If ``interval`` is negative, which could never reach
            ``end`` and would otherwise loop forever. Because this is a
            generator, the error surfaces on the first iteration.
    """
    iv = interval or datetime.timedelta(hours=1)
    if iv < datetime.timedelta(0):
        raise ValueError(f"interval must be positive, got {iv!r}")
    current = start
    # Check the bound before yielding so that a step which does not divide the
    # span evenly cannot overshoot `end`.
    while current <= end:
        yield current
        current += iv

def wikimedia_files(
    dates: Iterable[datetime.datetime]
) -> Iterable[Tuple[datetime.datetime, str]]:
    """Pair every timestamp in ``dates`` with its pageviews dump URL.

    Args:
        dates: Timestamps to look up; may be a list or a one-shot iterator.

    Returns:
        A lazy iterable of ``(date, url)`` tuples in input order.
    """
    # Walk `dates` exactly once. Zipping the iterable with a map over itself
    # would interleave reads from a one-shot iterator and mis-pair the items.
    return ((date, wikimedia_url(date)) for date in dates)

# Hourly timestamps on 2021-01-01 from 08:00 through 12:00, both ends included.
date_range = [
    *datetime_range(
        datetime.datetime(2021, 1, 1, hour=8),
        datetime.datetime(2021, 1, 1, hour=12),
    )
]
assert len(date_range) == 5  # 08, 09, 10, 11 and 12
# The first two generated URLs must be the 08:00 and 09:00 dumps, in that order.
assert [url for _, url in wikimedia_files(date_range)][:2] == [
    'https://dumps.wikimedia.org/other/pageviews/2021/2021-01/pageviews-20210101-080000.gz',
    'https://dumps.wikimedia.org/other/pageviews/2021/2021-01/pageviews-20210101-090000.gz',
]

In [64]:
def download_wikimedia_file(
    url: str, destination: PathLike, timeout: float = 60.0
) -> PathLike:
    """Download ``url`` to ``destination`` unless that file already exists.

    The download is best-effort: any failure is printed together with its
    traceback and is not re-raised. On failure nothing is left at
    ``destination``, so a later call retries instead of skipping a broken file.

    Args:
        url: Address of the file to fetch.
        destination: Local file path to write; missing parent directories are
            created.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        ``destination``, unchanged, whether or not the download succeeded.
    """
    target = Path(destination)
    if target.exists():
        # skip download
        return destination

    # make sure the directory exists
    target.parent.mkdir(parents=True, exist_ok=True)

    # Write into a sibling ".part" file first, so an interrupted or failed
    # transfer never leaves something at `destination` that the existence
    # check above would later mistake for a complete download.
    part_path = target.with_name(target.name + ".part")
    try:
        with requests.get(
            url, allow_redirects=True, stream=True, timeout=timeout
        ) as response:
            # Turn HTTP error statuses into exceptions instead of saving the
            # error page under the data file's name.
            response.raise_for_status()
            with open(part_path, "wb") as out_file:
                # Stream in chunks rather than holding the whole body in memory.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    out_file.write(chunk)
        os.replace(part_path, target)
    except Exception as e:
        print(f"failed to download {url}: {e}")
        print(traceback.format_exc())
        try:
            part_path.unlink()
        except FileNotFoundError:
            pass
    return destination

In [65]:
def download_handler(
    item: Tuple[datetime.datetime, str], path: PathLike
) -> Tuple[datetime.datetime, PathLike]:
    """Download one ``(date, url)`` pair into the directory tree under ``path``.

    Args:
        item: A ``(date, url)`` tuple as produced by ``wikimedia_files``.
        path: Root directory for downloads; a ``str`` or any path-like object.

    Returns:
        ``(date, destination)`` where ``destination`` is the local file path
        handed to ``download_wikimedia_file``.
    """
    date, url = item
    # Coerce to Path so plain-string roots work as well as Path objects.
    destination = Path(path).joinpath(*wikimedia_local_file(date))
    print(f"downloading {destination}")
    return date, download_wikimedia_file(url, destination=destination)

In [66]:
# test download locally (disabled; change the condition to True to run it)
if False:
    for item in wikimedia_files(date_range):
        # download_handler requires the download root as its second argument.
        download_handler(item, path=Path("./wikimedia_data"))

In [70]:
# test download using spark in parallel
# Every hourly timestamp from the first hour of 2019 through the first hour of
# 2020, both included.
date_range = [
    *datetime_range(
        datetime.datetime(2019, 1, 1, hour=0),
        datetime.datetime(2020, 1, 1, hour=0),
    )
]
# 365 days of 24 hours each, plus the inclusive end point.
assert len(date_range) == 365 * 24 + 1

8761 8760


In [None]:
# Alternative download root left by the author: Path("/dbfs/mnt/group29")
# Distribute the (date, url) pairs over Spark tasks; each task downloads one
# file below ./wikimedia_data, and the (date, local path) results are collected
# back on the driver.
downloaded = (
    sc.parallelize(wikimedia_files(date_range))
    .map(partial(download_handler, path=Path("./wikimedia_data")))
    .collect()
)

In [58]:
# stop any running spark processes
# NOTE(review): `sc` is the context behind `spark`, and cells further down still
# call `spark.read`. Run top to bottom, this stop would presumably happen before
# them — confirm whether this cell belongs at the very end of the notebook.
sc.stop()

downloading wikimedia_data/2019/2019-01/pageviews-20190102-110000.gz
21/10/02 06:30:20 ERROR Executor: Exception in task 2.0 in stage 33.0 (TID 104): Connection reset
21/10/02 06:30:20 ERROR Executor: Exception in task 3.0 in stage 33.0 (TID 105): Connection reset
21/10/02 06:30:20 ERROR Executor: Exception in task 1.0 in stage 33.0 (TID 103): Connection reset
21/10/02 06:30:20 ERROR Executor: Exception in task 0.0 in stage 33.0 (TID 102): Connection reset


AttributeError: 'SparkContext' object has no attribute 'start'

In [50]:
# Reader for the space-separated dump files. No header or schema options are
# set, so Spark assigns its default column names.
csv_loader = spark.read.format("csv").option("sep", ' ')

# Preview only the first downloaded file (if there is one): load it and show a
# single row.
for date, file in downloaded[:1]:
    df = csv_loader.load(str(file))
    df.show(1)

+---+---------+---+---+
|_c0|      _c1|_c2|_c3|
+---+---------+---+---+
| aa|Main_Page|  5|  0|
+---+---------+---+---+
only showing top 1 row

+---+---+---+---+
|_c0|_c1|_c2|_c3|
+---+---+---+---+
| aa|  -| 10|  0|
+---+---+---+---+
only showing top 1 row

+---+---------+---+---+
|_c0|      _c1|_c2|_c3|
+---+---------+---+---+
| aa|Main_Page|  4|  0|
+---+---------+---+---+
only showing top 1 row

+---+---+---+---+
|_c0|_c1|_c2|_c3|
+---+---+---+---+
| aa|  -|  3|  0|
+---+---+---+---+
only showing top 1 row

+---+---------+---+---+
|_c0|      _c1|_c2|_c3|
+---+---------+---+---+
| aa|Main_Page| 48|  0|
+---+---------+---+---+
only showing top 1 row



In [None]:
# Explicit column names for the space-separated pageviews file. Every column,
# including "count" and "responsebytes", is declared as a nullable string, so
# no numeric parsing happens at read time.
schema = StructType([
    StructField("domain", StringType(), True),
    StructField("pagename", StringType(), True),
    StructField("count", StringType(), True),
    StructField("responsebytes", StringType(), True),
])

# Read one gzip-compressed sample file with that schema and show its first row.
df = spark.read.option("sep", " ").csv("/mnt/group29/test.gz", schema=schema)
df.show(1)