In [3]:
import tqdm.notebook as tqdm
import numpy as np
import scipy
import sklearn

# Spark

In [4]:
! /home/jovyan/start-hadoop.sh

jovyan
 * Starting OpenBSD Secure Shell server sshd
start-stop-daemon: unable to set gid to 0 (Operation not permitted)
   ...fail!
 * sshd is running
Starting namenodes on [localhost]
localhost: namenode is running as process 157.  Stop it first and ensure /tmp/hadoop-jovyan-namenode.pid file is empty before retry.
Starting datanodes
localhost: datanode is running as process 282.  Stop it first and ensure /tmp/hadoop-jovyan-datanode.pid file is empty before retry.
Starting secondary namenodes [153d95fbdd73]
153d95fbdd73: secondarynamenode is running as process 506.  Stop it first and ensure /tmp/hadoop-jovyan-secondarynamenode.pid file is empty before retry.
Starting resourcemanager
resourcemanager is running as process 760.  Stop it first and ensure /tmp/hadoop-jovyan-resourcemanager.pid file is empty before retry.
Starting nodemanagers
localhost: nodemanager is running as process 878.  Stop it first and ensure /tmp/hadoop-jovyan-nodemanager.pid file is empty before retry.
8088 org.a

In [5]:
# connect, context, session

import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext(appName='jupyter')

from pyspark.sql import SparkSession, Row
se = SparkSession(sc)

import pyspark.sql.functions as F

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-04-18 19:28:06,665 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


## HDFS

In [6]:
! hdfs dfs -df -h

Filesystem                Size     Used  Available  Use%
hdfs://localhost:9000  196.8 G  363.7 M    146.8 G    0%


In [7]:
! hdfs dfs -ls /

Found 3 items
drwxrwx---   - root   supergroup          0 2023-04-16 13:45 /tmp
drwxr-xr-x   - jovyan supergroup          0 2023-04-16 13:45 /user
drwxr-xr-x   - jovyan supergroup          0 2023-04-16 13:34 /yandex_music


In [8]:
! mkdir -p ~/.kaggle

In [9]:
%%writefile ~/.kaggle/kaggle.json
{"username":"evgpat","key":"1fdbea1ff41ad35641ddfd8be5d711fc"}

Overwriting /home/jovyan/.kaggle/kaggle.json


In [10]:
! chmod 600 ~/.kaggle/kaggle.json

In [11]:
! pip install -U urllib3 kaggle==1.5.3
! kaggle competitions download -c outbrain-click-prediction -f page_views_sample.csv.zip
! kaggle competitions download -c outbrain-click-prediction -f documents_topics.csv.zip 

Collecting urllib3
  Downloading urllib3-1.26.15-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.9/140.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
401 - Unauthorized
401 - Unauthorized


In [12]:
! unzip '*.zip'
! rm -rf *.zip

unzip:  cannot find or open *.zip, *.zip.zip or *.zip.ZIP.

No zipfiles found.


In [13]:
! hdfs dfs -put page_views_sample.csv
! hdfs dfs -put documents_topics.csv

put: `page_views_sample.csv': File exists
put: `documents_topics.csv': File exists


## Evaluation Assignment

Data: outbrain click prediction

Tasks:
Using Spark RDD, DataFrame API and Python, calculate:

**1**. Top 10 most visited document_ids in the page_views_sample log

**2**. How many users have at least 2 different traffic_sources in the page_views_sample log (note the value is not a count, it's an encoded enum)

**3***. Top 10 most visited topic_ids in page_views_sample log (use documents_topics table)

The submission format is the result.json json file with top_10_documents, users and top_10_topics keys.
For TOP-10 results, the answer must be written in the form of a sheet ordered from TOP-1 to TOP-10 with an id.

result.json example:

    {
        "top_10_documents": [
            111,
            222,
            333,
            ...,
            1010
        ],
        "users": 10000,
        "top_10_topics": [
            11,
            22,
            33,
            ...,
            101
        ]
    }

In [14]:
page_views_sample = se.read.csv("page_views_sample.csv", header=True)
documents_topics = se.read.csv("documents_topics.csv", header=True)

                                                                                

## Top 10 most visited document_ids in the page_views_sample log

* we group page_views_sample by `document_id`
* apply count to get the number of views
* apply order by with option `ascending=False`
* limit the result
* wrap the result to rdd

In [15]:
docs = (
    page_views_sample.groupBy("document_id")
    .count()
    .orderBy("count", ascending=False)
    .limit(10)
    .rdd.map(lambda row: int(row.document_id))
    .collect()
)

                                                                                

## How many users have at least 2 different traffic_sources in the page_views_sample log

* we group page_views_sample by `uuid`
* apply aggregation by countDistinct function on `traffic_source`
* then apply filter
* finally, count number of users

In [16]:
users = (
    page_views_sample.groupBy("uuid")
    .agg(F.countDistinct("traffic_source").alias("num_ts"))
    .filter("num_ts > 1")
    .count()
)

                                                                                

## Top 10 most visited topic_ids in page_views_sample log

* we join `page_views_samples` and `documents_topics` on `document_id`
* then group by `topic_id`
* count number of visits per topic
* order by `count`
* limit the result
* wrap this to rdd

In [17]:
topics = (
    page_views_sample.join(documents_topics, on="document_id", how="inner")
    .groupBy("topic_id")
    .count()
    .orderBy("count", ascending=False)
    .limit(10)
    .rdd.map(lambda row: int(row.topic_id))
    .collect()
)

                                                                                

#### Save json

In [18]:
import json

result = {
    "top_10_documents": docs,
    "users": users,
    "top_10_topics": topics,
}

with open("result.json", "w") as f:
    json.dump(result, f)

In [23]:
!curl -F file=@result.json "51.250.54.133:80/MDS-LSML1/m_shishonkov/w2/1"

<!doctype html>
<html lang=en>
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>


In [14]:
! curl -F file=@result.json "51.250.54.133:80/MDS-LSML1/steve_ch_19/w2/1"

curl: (26) Failed to open/read local data from file/application
