In [1]:
# Bootstrap: findspark.init() is called before pyspark is imported.
# NOTE(review): presumably findspark locates the Spark installation and makes
# pyspark importable, which is why the order matters — confirm against the
# findspark documentation.
import findspark

findspark.init()

import pyspark

In [2]:
# Obtain the notebook's SparkSession from the default builder (no extra
# configuration is set here).
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Load the ranking CSV with no schema and no header option; the output of the
# next cell shows the resulting default column names _c0 and _c1.
alexa = spark.read.csv("/data/top-1m.csv")

In [4]:
# Preview the first rows of the untyped frame.
alexa.show()

+---+---------------+
|_c0|            _c1|
+---+---------------+
|  1|     google.com|
|  2|    youtube.com|
|  3|   facebook.com|
|  4|      baidu.com|
|  5|  wikipedia.org|
|  6|         qq.com|
|  7|     taobao.com|
|  8|      yahoo.com|
|  9|      tmall.com|
| 10|     amazon.com|
| 11|    twitter.com|
| 12|       sohu.com|
| 13|         jd.com|
| 14|       live.com|
| 15|    sina.com.cn|
| 16|  instagram.com|
| 17|      weibo.com|
| 18|         360.cn|
| 19|login.tmall.com|
| 20|     reddit.com|
+---+---------------+
only showing top 20 rows



In [5]:
# Sort on the untyped _c0 column. The output is ordered 1, 10, 100, 1000, ...
# (lexicographic), which demonstrates that _c0 was read as a string rather
# than as a number.
alexa.orderBy("_c0").show()

+-------+--------------------+
|    _c0|                 _c1|
+-------+--------------------+
|      1|          google.com|
|     10|          amazon.com|
|    100|           zhihu.com|
|   1000|        turnitin.com|
|  10000|             kit.edu|
| 100000|     educateiowa.gov|
|1000000|  onlythefitness.com|
| 100001|     wagnardsoft.com|
| 100002|         myhermes.at|
| 100003|         pvcloud.com|
| 100004|descargas-vagos.c...|
| 100005|       blogvporn.com|
| 100006|          duedil.com|
| 100007|      doctorpiter.ru|
| 100008|      sachbaitap.com|
| 100009|         iwmbuzz.com|
|  10001|       37signals.com|
| 100010|        torrentz2.cc|
| 100011|       technoxyz.com|
| 100012|      warkscol.ac.uk|
+-------+--------------------+
only showing top 20 rows



In [6]:
from pyspark.sql import types

# Explicit schema for the ranking file: an integer rank followed by the
# site name as a string. Both fields are nullable.
schema = types.StructType(
    [
        types.StructField("rank", types.IntegerType(), True),
        types.StructField("site", types.StringType(), True),
    ]
)

# Re-read the same CSV with the schema applied so that rank is typed.
real_alexa = spark.read.csv("/data/top-1m.csv", schema=schema)

In [7]:
# Preview the typed frame; columns are now named rank and site.
real_alexa.show()

+----+---------------+
|rank|           site|
+----+---------------+
|   1|     google.com|
|   2|    youtube.com|
|   3|   facebook.com|
|   4|      baidu.com|
|   5|  wikipedia.org|
|   6|         qq.com|
|   7|     taobao.com|
|   8|      yahoo.com|
|   9|      tmall.com|
|  10|     amazon.com|
|  11|    twitter.com|
|  12|       sohu.com|
|  13|         jd.com|
|  14|       live.com|
|  15|    sina.com.cn|
|  16|  instagram.com|
|  17|      weibo.com|
|  18|         360.cn|
|  19|login.tmall.com|
|  20|     reddit.com|
+----+---------------+
only showing top 20 rows



In [8]:
# Sort on the integer rank column; the output is in numeric order
# (1, 2, 3, ...), unlike the string sort on _c0 earlier in the notebook.
real_alexa.orderBy("rank").show()

+----+---------------+
|rank|           site|
+----+---------------+
|   1|     google.com|
|   2|    youtube.com|
|   3|   facebook.com|
|   4|      baidu.com|
|   5|  wikipedia.org|
|   6|         qq.com|
|   7|     taobao.com|
|   8|      yahoo.com|
|   9|      tmall.com|
|  10|     amazon.com|
|  11|    twitter.com|
|  12|       sohu.com|
|  13|         jd.com|
|  14|       live.com|
|  15|    sina.com.cn|
|  16|  instagram.com|
|  17|      weibo.com|
|  18|         360.cn|
|  19|login.tmall.com|
|  20|     reddit.com|
+----+---------------+
only showing top 20 rows



In [9]:
# Confirm the column types that the explicit schema produced.
real_alexa.printSchema()

root
 |-- rank: integer (nullable = true)
 |-- site: string (nullable = true)



In [10]:
from pyspark.sql import functions

# Count how often each dot-separated token occurs across all site names and
# display the most frequent tokens first.
(
    real_alexa
    .select(
        functions.explode(
            # split() takes a regular expression, so the dot must be escaped.
            # A raw string keeps the backslash literal and avoids the invalid
            # escape sequence warning that a plain "\." literal triggers.
            functions.split("site", r"\.")
        ).alias("token")
    )
    .groupBy("token")
    .count()
    .withColumnRenamed("count", "shmount")
    .orderBy(functions.desc("shmount"))
    .show()
)

+--------+-------+
|   token|shmount|
+--------+-------+
|     com| 529951|
|     org|  54801|
|     net|  42212|
|      ru|  41774|
|      co|  35863|
|      de|  30184|
|      br|  21409|
|blogspot|  16691|
|      uk|  16631|
|      pl|  15019|
|      in|  13978|
|      ir|  12242|
|      it|  11209|
|      au|  10855|
|      jp|  10575|
|      fr|   9801|
|     gov|   9751|
|     edu|   8904|
|    info|   8335|
|      cz|   7750|
+--------+-------+
only showing top 20 rows



In [11]:
from pyspark.sql import functions

# Build a reverse index: one row per dot-separated token, carrying the set of
# sites that contain the token and an aggregate of their ranks.
reverse_idx = (
    real_alexa
    .select(
        functions.explode(
            # split() takes a regular expression, so the dot must be escaped.
            # A raw string keeps the backslash literal and avoids the invalid
            # escape sequence warning that a plain "\." literal triggers.
            functions.split("site", r"\.")
        ).alias("token"),
        functions.col("site"),
        functions.col("rank")
    )
    .groupBy("token")
    .agg(
        {
            # Distinct sites in which the token appears.
            "site": "collect_set",
            # NOTE(review): rank 1 is the most popular site, so max(rank) is
            # the least popular site containing the token — confirm that
            # "min" (best rank) is not what was intended.
            "rank": "max",
        }
    )
)


# Mark the index for caching so that repeated queries can reuse it.
reverse_idx.cache()

DataFrame[token: string, collect_set(site): array<string>, max(rank): int]

In [12]:
# Look up the reverse-index row for the token "google".
reverse_idx.filter("token = 'google'").show()

+------+--------------------+---------+
| token|   collect_set(site)|max(rank)|
+------+--------------------+---------+
|google|[safety.google, d...|   953902|
+------+--------------------+---------+



In [13]:
# Look up the reverse-index row for the token "ya".
reverse_idx.filter("token = 'ya'").show()

+-----+--------------------+---------+
|token|   collect_set(site)|max(rank)|
+-----+--------------------+---------+
|   ya|[ya.no, ya.com, y...|   596444|
+-----+--------------------+---------+



In [14]:
# Load the zgrab scan results from JSON with no explicit schema.
# NOTE(review): the _corrupt_record column visible in the output presumably
# means some input lines failed to parse — verify before relying on the data.
zgrab = spark.read.json("/data/zgrab.json")

In [15]:
# Preview the first rows of the scan results; nested values are truncated
# in the tabular display.
zgrab.show()

+---------------+--------------------+--------------------+---------------+--------------------+
|_corrupt_record|                data|               error|             ip|           timestamp|
+---------------+--------------------+--------------------+---------------+--------------------+
|           null|               [[,]]|Get http://172.65...| 172.65.100.108|2018-05-13T02:30:...|
|           null|[[, [<html style=...|                null| 192.230.66.113|2018-05-13T02:30:...|
|           null|[[, [,,, [,,,,,,,...|                null|  159.65.233.69|2018-05-13T02:30:...|
|           null|               [[,]]|Get http://168.25...| 168.255.128.19|2018-05-13T02:30:...|
|           null|[[, [<!doctype ht...|                null| 185.162.171.15|2018-05-13T02:30:...|
|           null|[[, [<html style=...|                null| 31.201.254.208|2018-05-13T02:30:...|
|           null|[[, [<!DOCTYPE ht...|                null|  151.40.76.245|2018-05-13T02:30:...|
|           null|             

In [16]:
# Print the nested schema of the scan results.
zgrab.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- http: struct (nullable = true)
 |    |    |-- redirect_response_chain: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- body: string (nullable = true)
 |    |    |    |    |-- body_sha256: string (nullable = true)
 |    |    |    |    |-- content_length: long (nullable = true)
 |    |    |    |    |-- headers: struct (nullable = true)
 |    |    |    |    |    |-- accept_ranges: array (nullable = true)
 |    |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |    |-- access_control_allow_origin: array (nullable = true)
 |    |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |    |-- cache_control: array (nullable = true)
 |    |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |    |-- connection: array (nullable = true)
 

In [17]:
# Project only the top-level ip column and preview it.
zgrab.select("ip").show()

+---------------+
|             ip|
+---------------+
| 172.65.100.108|
| 192.230.66.113|
|  159.65.233.69|
| 168.255.128.19|
| 185.162.171.15|
| 31.201.254.208|
|  151.40.76.245|
|  112.72.33.128|
| 218.145.166.91|
|   67.222.68.11|
|   45.60.107.95|
| 118.169.10.116|
|   45.121.57.70|
|  45.195.160.26|
|115.131.234.198|
|115.128.144.139|
| 68.171.217.103|
|  123.60.36.156|
|    119.32.58.2|
|  168.255.58.46|
+---------------+
only showing top 20 rows

