# Initialization

In [1]:
# Making sure to link pyspark to the right Spark folder with findspark
import findspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession

findspark.init('/opt/spark')

In [2]:
conf = SparkConf().setAppName("pysparkYieldData")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)


In [3]:
! hadoop fs -ls hdfs://node-master:9000/user/$USER

Found 1 items
drwxr-xr-x   - root supergroup          0 2021-09-01 19:11 hdfs://node-master:9000/user/root


In [4]:
! ls $PWD/

Bash-Interface.ipynb  PySparkYieldData.ipynb  dask-worker-space
Dask-Yarn.ipynb       Python-Spark.ipynb      datasets


In [None]:
! hadoop fs -put $PWD/datasets/input_data.gz


### GZ File

We can now see it as part of the HDFS

In [None]:
! hadoop fs -ls hdfs://node-master:9000/user/$USER/*

In [None]:
! hadoop fs -ls hdfs://node-master:9000/user/root/input_data.gz

In [5]:
parse_file = sc.textFile("input_data.gz")


In [6]:
parts = parse_file.map(lambda l: l.split("\t"))


In [7]:
from pyspark.sql import Row

user_activity = parts.map(lambda p: Row(
    date=p[0], 
    time=(p[1]),
    user_id=(p[2]),
    url=(p[3]),
    ip=(p[4]),
    user_agent_str=(p[5]),

))


In [8]:
schemaUsers = spark.createDataFrame(user_activity)


In [9]:
schemaUsers.printSchema

<bound method DataFrame.printSchema of DataFrame[date: string, time: string, user_id: string, url: string, ip: string, user_agent_str: string]>

In [10]:
schemaUsers.show()

+----------+--------+--------------------+--------------------+--------------------+--------------------+
|      date|    time|             user_id|                 url|                  ip|      user_agent_str|
+----------+--------+--------------------+--------------------+--------------------+--------------------+
|2014-10-12|17:01:01|f4fdd9e55192e9475...|http://6f2a9cab64...|       94.11.238.152|Mozilla/5.0 (iPad...|
|2014-10-12|17:01:01|0ae53126499336757...|http://8eb4ac417c...|       92.238.71.109|Mozilla/5.0 (iPad...|
|2014-10-12|17:01:01|c5ac174ee153f7e57...|https://1415d3778...|         2.26.44.196|Mozilla/5.0 (Linu...|
|2014-10-12|17:01:01|2d86766f9908fde41...|http://47e1f0cca5...|194.81.33.57, 66....|Mozilla/5.0 (Linu...|
|2014-10-12|17:01:01|3938fffe5c0a131f5...|https://978c17aed...|      109.152.120.12|Mozilla/5.0 (Wind...|
|2014-10-12|17:01:01|88eb65d5f952f3bf5...|http://38d6db9ae3...|         2.28.82.212|Mozilla/5.0 (iPad...|
|2014-10-12|17:01:01|068d17d3e73ea7aac...|http

### Function to retrieve geostats

In [60]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import requests
api_key="7c6407327a9eaf157578e80dfea828e9"

# @udf
def get_country_from_ip(ip_address):
    url= f"http://api.ipapi.com/{ip_address}?access_key={api_key}"
    r = requests.get(url)
    ip_response_json=r.json()

    country=ip_response_json['country_name']
    city=ip_response_json['city']
    return (country, city)




In [68]:
unique_ip_addresses.printSchema()

root
 |-- ip: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [84]:
from pyspark.sql.functions import split
unique_ip_addresses=schemaUsers.withColumn('ip', split(schemaUsers['ip'], ', ')[0]).select("ip").distinct()

unique_addresses_formatted = unique_ip_addresses.select(unique_ip_addresses.columns[0])
# unique_ip_addresses=schemaUsers.select("ip").distinct().withColumn('ip', split(schemaUsers['ip'], ', '))

unique_addresses.count()


24820

In [85]:
unique_addresses_formatted.printSchema()

root
 |-- ip: string (nullable = true)



In [87]:
unique_addresses_formatted.write.format("text").save("ip_addresses.txt")

In [88]:
! hadoop dfs -ls hdfs://node-master:9000/user/$USER/*


Found 4 items
drwxr-xr-x   - root supergroup          0 2021-09-01 19:36 hdfs://node-master:9000/user/root/.skein
drwxr-xr-x   - root supergroup          0 2021-09-01 19:36 hdfs://node-master:9000/user/root/.sparkStaging
-rw-r--r--   2 root supergroup    7866234 2021-09-01 17:15 hdfs://node-master:9000/user/root/input_data.gz
drwxr-xr-x   - root supergroup          0 2021-09-01 21:36 hdfs://node-master:9000/user/root/ip_addresses.txt


In [86]:
! pip --version

pip 21.2.4 from /usr/local/lib/python3.6/dist-packages/pip (python 3.6)


In [106]:
! export URL_1="https://download.maxmind.com/app/geoip_download_by_token?edition_id=GeoLite2-Country-CSV&date=20210831&suffix=zip&token=v2.local.zkJeIzHOqnNOtHqMT8hnpq_kxp5D8Rw3SDCRxlnkDbg8z3uzOtjxtLZqsqgR1OV9A1QfnFNWixb1UQtbZzYx9Kbcfylx8WbOWhmjlLQFIS4Eq_BrsRDI1kH766K9a0I40B9wSgmft_YUwxnJjCfS1jgPqGn9lBYDLXTJg_wYzTh2lSsEbIswtU19Al8XQN2zjVf0OQ" ; wget $URL_1 -O /tmp/geo_country.csv

--2021-09-01 21:50:02--  https://download.maxmind.com/app/geoip_download_by_token?edition_id=GeoLite2-Country-CSV&date=20210831&suffix=zip&token=v2.local.zkJeIzHOqnNOtHqMT8hnpq_kxp5D8Rw3SDCRxlnkDbg8z3uzOtjxtLZqsqgR1OV9A1QfnFNWixb1UQtbZzYx9Kbcfylx8WbOWhmjlLQFIS4Eq_BrsRDI1kH766K9a0I40B9wSgmft_YUwxnJjCfS1jgPqGn9lBYDLXTJg_wYzTh2lSsEbIswtU19Al8XQN2zjVf0OQ
Resolving download.maxmind.com (download.maxmind.com)... 104.16.38.47, 104.16.37.47, 2606:4700::6810:262f, ...
Connecting to download.maxmind.com (download.maxmind.com)|104.16.38.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3586289 (3.4M) [application/zip]
Saving to: '/tmp/geo_country.csv'


2021-09-01 21:50:03 (12.0 MB/s) - '/tmp/geo_country.csv' saved [3586289/3586289]



In [107]:
! export URL_2="https://download.maxmind.com/app/geoip_download_by_token?edition_id=GeoLite2-City-CSV&date=20210831&suffix=zip&token=v2.local.11khj64wvdOZw1QcovhkAa4V2Nf4NHq7CG5TFWV7BKQ9vYRjJnhDxCL_TKpyYc9g_Yv9ZxnwYeH5hj920zkA-rabTt3wPN5Y8ebwiaJPJSNQgK0L1yYiKVvDWQw1VYW_EitmtCfwRseY52LjvFWZec4xIX_K-OUMZvB5H-nSIyiNF_38PgTrReDtF75GIGEP4ywjpw" ; wget $URL_2 -O /tmp/geo_city.csv

--2021-09-01 21:51:09--  https://download.maxmind.com/app/geoip_download_by_token?edition_id=GeoLite2-City-CSV&date=20210831&suffix=zip&token=v2.local.11khj64wvdOZw1QcovhkAa4V2Nf4NHq7CG5TFWV7BKQ9vYRjJnhDxCL_TKpyYc9g_Yv9ZxnwYeH5hj920zkA-rabTt3wPN5Y8ebwiaJPJSNQgK0L1yYiKVvDWQw1VYW_EitmtCfwRseY52LjvFWZec4xIX_K-OUMZvB5H-nSIyiNF_38PgTrReDtF75GIGEP4ywjpw
Resolving download.maxmind.com (download.maxmind.com)... 104.16.37.47, 104.16.38.47, 2606:4700::6810:262f, ...
Connecting to download.maxmind.com (download.maxmind.com)|104.16.37.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 50885905 (49M) [application/zip]
Saving to: '/tmp/geo_city.csv'


2021-09-01 21:51:12 (35.2 MB/s) - '/tmp/geo_city.csv' saved [50885905/50885905]



In [110]:
! hadoop fs -put /tmp/*.csv /user/root


In [112]:
! hadoop fs -ls /user/root/*csv


-rw-r--r--   2 root supergroup   50885905 2021-09-01 21:53 /user/root/geo_city.csv
-rw-r--r--   2 root supergroup    3586289 2021-09-01 21:53 /user/root/geo_country.csv


## Twighlight Zone

In [97]:
import geoip2.webservice

# This reader object should be reused across lookups as creation of it is
# expensive.
accid="yurty"
key="yurt"
with geoip2.webservice.Client(accid, key) as client:
    response = client.city('203.0.113.0')
    print(response.country.name)


PermissionRequiredError: You do not have permission to use this service interface.

In [94]:
! touch maxmind-database.mmdb

In [89]:
! pip install geoip2

Collecting geoip2
  Downloading geoip2-4.2.0-py2.py3-none-any.whl (25 kB)
Collecting requests<3.0.0,>=2.24.0
  Downloading requests-2.26.0-py2.py3-none-any.whl (62 kB)
[K     |################################| 62 kB 1.2 MB/s eta 0:00:01
[?25hCollecting urllib3<2.0.0,>=1.25.2
  Downloading urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
[K     |################################| 138 kB 2.8 MB/s eta 0:00:01
[?25hCollecting maxminddb<3.0.0,>=2.0.0
  Downloading maxminddb-2.0.3.tar.gz (286 kB)
[K     |################################| 286 kB 8.6 MB/s eta 0:00:01
Collecting charset-normalizer~=2.0.0
  Downloading charset_normalizer-2.0.4-py3-none-any.whl (36 kB)
Building wheels for collected packages: maxminddb
  Building wheel for maxminddb (setup.py) ... [?25ldone
[?25h  Created wheel for maxminddb: filename=maxminddb-2.0.3-py2.py3-none-any.whl size=13627 sha256=15af01ba9cf595a6502c27ede61e7427c84e2b36beb97e77f7177eeb2504e959
  Stored in directory: /root/.cache/pip/wheels/5f/2a/dc/6297

### Expensive! But hey..

In [61]:
# unique_ip_addresses.rdd.map(lambda row: row.asDict())


ip_array = [str(row.ip) for row in unique_ip_addresses.collect()]

ip_list=[]
for ip in ip_array:
    if ', ' in ip:
        split_ip=ip.split(', ')
        ip_list.append({ip: get_country_from_ip(split_ip[0])})
        ip_list.append({ip: get_country_from_ip(split_ip[1])})

    else:
        ip_list.append({ip: get_country_from_ip(ip)})
        
len(ip_list)

KeyError: 'country_name'

#### Failed on hitting limit API License for 24 k requests


### Cheaper but time consuming..

### Unique IP Addresses

In [40]:
get_country_from_ip = udf(get_country_from_ip, ArrayType(StringType()))


### Computer says no...

In [45]:
unique_ip_addresses.select(*[get_country_from_ip('ip')])

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/pyspark/sql/udf.py", line 199, in wrapper
    return self(*args)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/sql/udf.py", line 177, in __call__
    judf = self._judf
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/sql/udf.py", line 161, in _judf
    self._judf_placeholder = self._create_judf()
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/sql/udf.py", line 167, in _create_judf
    spark = SparkSession.builder.getOrCreate()
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/sql/session.py", line 228, in getOrCreate
    sc = SparkContext.getOrCreate(sparkConf)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/context.py", line 384, in getOrCreate
    SparkContext(conf=conf or SparkConf())
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/context.py", line 136, in __init__
    SparkContext._assert_on_driver()
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/context.py", line 1277, in _assert_on_driver
    raise Exception("SparkContext should only be created and accessed on the driver.")
Exception: SparkContext should only be created and accessed on the driver.


In [46]:
ip_geo_df = unique_ip_addresses.select(get_country_from_ip("ip").alias("geolocation"))


/bin/sh: 1: Bad substitution


In [47]:
ip_geo_df.show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/pyspark/sql/udf.py", line 199, in wrapper
    return self(*args)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/sql/udf.py", line 177, in __call__
    judf = self._judf
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/sql/udf.py", line 161, in _judf
    self._judf_placeholder = self._create_judf()
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/sql/udf.py", line 167, in _create_judf
    spark = SparkSession.builder.getOrCreate()
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/sql/session.py", line 228, in getOrCreate
    sc = SparkContext.getOrCreate(sparkConf)
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/context.py", line 384, in getOrCreate
    SparkContext(conf=conf or SparkConf())
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/context.py", line 136, in __init__
    SparkContext._assert_on_driver()
  File "/tmp/hadoop-root/nm-local-dir/usercache/root/appcache/application_1630516469872_0005/container_1630516469872_0005_01_000002/pyspark.zip/pyspark/context.py", line 1277, in _assert_on_driver
    raise Exception("SparkContext should only be created and accessed on the driver.")
Exception: SparkContext should only be created and accessed on the driver.


In [None]:
# for ip in unique_ip_addresses_list:
#     print('{}: {}'.format(type(ip), ip))

dict(unique_ip_addresses_list)

In [None]:
import requests
# api_key=""
ip_address="188.141.30.136"

In [None]:
get_ip_details(ip_address)

In [None]:
ip_response_json