In [1]:
import tqdm.notebook as tqdm
import numpy as np
import scipy
import sklearn

# Spark

In [2]:
# Run the Hadoop start script from the jovyan home directory; its log is printed as the cell output.
! /home/jovyan/start-hadoop.sh

jovyan
 * Starting OpenBSD Secure Shell server sshd
start-stop-daemon: unable to set gid to 0 (Operation not permitted)
   ...fail!
 * sshd is running
Starting namenodes on [localhost]
localhost: namenode is running as process 163.  Stop it first and ensure /tmp/hadoop-jovyan-namenode.pid file is empty before retry.
Starting datanodes
localhost: datanode is running as process 273.  Stop it first and ensure /tmp/hadoop-jovyan-datanode.pid file is empty before retry.
Starting secondary namenodes [bdfdecdf0194]
bdfdecdf0194: secondarynamenode is running as process 483.  Stop it first and ensure /tmp/hadoop-jovyan-secondarynamenode.pid file is empty before retry.
Starting resourcemanager
resourcemanager is running as process 724.  Stop it first and ensure /tmp/hadoop-jovyan-resourcemanager.pid file is empty before retry.
Starting nodemanagers
localhost: nodemanager is running as process 835.  Stop it first and ensure /tmp/hadoop-jovyan-nodemanager.pid file is empty before retry.
273 org.ap

In [3]:
# connect, context, session

import findspark
findspark.init()  # must run before `import pyspark` so the Spark installation is importable

import pyspark
from pyspark.sql import SparkSession, Row

# getOrCreate() returns the already-running context when this cell is executed
# again in the same kernel; calling the SparkContext constructor a second time
# raises because only one context may be active per process.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName('jupyter'))

# The builder attaches to the active context created above (or reuses the
# existing session), so `se` is safe to rebuild on re-run as well.
se = SparkSession.builder.getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-03-30 22:37:41,893 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


## HDFS

In [4]:
# Report HDFS size, used and available space; -h prints the figures in human-readable units.
! hdfs dfs -df -h

Filesystem                 Size     Used  Available  Use%
hdfs://localhost:9000  1006.9 G  291.6 M    939.9 G    0%


In [5]:
# List the entries at the HDFS root.
! hdfs dfs -ls /

Found 2 items
drwxrwx---   - root   supergroup          0 2023-03-30 22:38 /tmp
drwxr-xr-x   - jovyan supergroup          0 2023-03-30 22:37 /user


In [6]:
# Create the directory for the kaggle.json credentials file; -p keeps this a no-op when it already exists.
! mkdir -p ~/.kaggle

In [12]:
# Write ~/.kaggle/kaggle.json without storing the secret in the notebook.
# Credentials are taken from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables when set; otherwise they are requested interactively (the key is
# read with getpass so it is not echoed into the cell output).
# NOTE: an API key that was ever saved inside a notebook must be treated as
# leaked and revoked in the Kaggle account settings.
import getpass
import json
import os
from pathlib import Path

kaggle_username = os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: ")
kaggle_key = os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: ")

kaggle_config_path = Path.home() / ".kaggle" / "kaggle.json"
kaggle_config_path.parent.mkdir(parents=True, exist_ok=True)
kaggle_config_path.write_text(
    json.dumps({"username": kaggle_username, "key": kaggle_key})
)
# Owner read/write only, so other users on the machine cannot read the key.
kaggle_config_path.chmod(0o600)

Overwriting /home/jovyan/.kaggle/kaggle.json


In [13]:
# Restrict the credentials file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [14]:
! pip install -U urllib3 kaggle==1.5.3
! kaggle competitions download -c outbrain-click-prediction -f page_views_sample.csv.zip
! kaggle competitions download -c outbrain-click-prediction -f documents_topics.csv.zip 

Collecting urllib3
  Downloading urllib3-1.26.15-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.9/140.9 kB[0m [31m20.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading page_views_sample.csv.zip to /home/jovyan/work
100%|████████████████████████████████████████| 149M/149M [00:23<00:00, 7.72MB/s]
100%|████████████████████████████████████████| 149M/149M [00:23<00:00, 6.56MB/s]
Downloading documents_topics.csv.zip to /home/jovyan/work
100%|████████████████████████████████████████| 121M/121M [00:19<00:00, 6.29MB/s]
100%|████████████████████████████████████████| 121M/121M [00:19<00:00, 6.64MB/s]


In [15]:
! unzip '*.zip'
! rm -rf *.zip

Archive:  page_views_sample.csv.zip
  inflating: page_views_sample.csv   

Archive:  documents_topics.csv.zip
  inflating: documents_topics.csv    

2 archives were successfully processed.


In [16]:
! hdfs dfs -put page_views_sample.csv
! hdfs dfs -put documents_topics.csv

## Evaluation Assignment

Data: outbrain click prediction

Tasks:
Using Spark RDD, DataFrame API and Python, calculate:

**1**. Top 10 most visited document_ids in the page_views_sample log

**2**. How many users have at least 2 different traffic_sources in the page_views_sample log (note the value is not a count, it's an encoded enum)

**3***. Top 10 most visited topic_ids in page_views_sample log (use documents_topics table)

The submission format is the result.json json file with top_10_documents, users and top_10_topics keys.
For TOP-10 results, the answer must be a list of ids ordered from TOP-1 to TOP-10.

result.json example:

    {
        "top_10_documents": [
            111,
            222,
            333,
            ...,
            1010
        ],
        "users": 10000,
        "top_10_topics": [
            11,
            22,
            33,
            ...,
            101
        ]
    }

### DataFrame API solution

Let's try the DataFrame API first: it is the higher-level of the two, so it should be easier.

In [17]:
# Load both CSV files as DataFrames, taking column names from the first line of each file.
# No schema is supplied or inferred here, so downstream code converts ids with int().
page_views_sample = se.read.options(header=True).csv("page_views_sample.csv")
documents_topics = se.read.options(header=True).csv("documents_topics.csv")

                                                                                

### 1. Top 10 documents

To get the 10 most popular documents, we group the DataFrame by document id, count the rows in each group, sort by that count in descending order, and take the first 10 results.

In [19]:
# Number of page-view rows recorded for each document.
document_view_counts = page_views_sample.groupBy("document_id").count()

# Rank by view count, highest first, and keep the ten leaders.
most_viewed_documents = document_view_counts.sort("count", ascending=False).limit(10)

# Bring the ten rows to the driver and convert each id to a plain int.
top_10_documents = [int(row["document_id"]) for row in most_viewed_documents.collect()]

                                                                                

### 2. Users with at least 2 traffic sources

The SQL function `countDistinct` is one possible solution. We group the views by user id, compute the number of distinct traffic sources per user, keep the users with at least 2, and count them.

In [20]:
from pyspark.sql.functions import countDistinct, col

# One row per user, carrying how many different traffic_source values appear for that user.
traffic_sources_per_user = page_views_sample.groupBy("uuid").agg(
    countDistinct("traffic_source").alias("distinct_traffic_sources")
)

# Keep the users seen with two or more distinct values and count them.
users_with_multiple_traffic_sources = traffic_sources_per_user.where(
    col("distinct_traffic_sources") >= 2
).count()

                                                                                

### 3. Top 10 most visited topic ids

The idea is the same as in the first task; the only extra step is joining the views table with the topics table.

In [21]:
# Pair every page view with the topic rows of the viewed document; views of
# documents that have no topic row are dropped by the inner join.
joined_df = page_views_sample.join(documents_topics, "document_id", "inner")

# Number of joined (view, topic) rows for each topic.
topic_view_counts = joined_df.groupBy("topic_id").count()

# Rank by that count, highest first, and keep the ten leaders.
most_viewed_topics = topic_view_counts.sort("count", ascending=False).limit(10)

# Bring the ten rows to the driver and convert each id to a plain int.
top_10_topics = [int(row["topic_id"]) for row in most_viewed_topics.collect()]

                                                                                

#### Save json

In [22]:
import json

# Collect the three answers under the keys required by the submission format.
result = dict(
    top_10_documents=top_10_documents,
    users=users_with_multiple_traffic_sources,
    top_10_topics=top_10_topics,
)

# Serialize the answers to result.json in the current working directory.
with open("result.json", "w") as output_file:
    output_file.write(json.dumps(result))

See the results:

In [23]:
# Display the assembled answers for a visual check.
result

{'top_10_documents': [1811567,
  234,
  42744,
  1858440,
  1780813,
  60164,
  1790442,
  1877626,
  1821895,
  732651],
 'users': 98080,
 'top_10_topics': [20, 16, 216, 136, 140, 143, 36, 97, 8, 269]}

In [24]:
# Upload result.json as a multipart form field named "file" to the URL below;
# the server's reply is printed as the cell output.
!curl -F file=@result.json "51.250.54.133:80/MDS-LSML1/vadrad/w4/1"

1.0
Well done!
