# Initialization

In [1]:
# Making sure to link pyspark to the right Spark folder with findspark
import findspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession

findspark.init('/opt/spark')
MY_MONGO="mongodb://root:yurt@mongo:27017"


In [2]:
conf = SparkConf() \
    .set("spark.mongodb.input.uri", f"{MY_MONGO}/yield.geolookup") \
    .set("spark.mongodb.output.uri", "mongodb://root:yurt@mongo:27017/local.yield?authSource=local") \
    .setAppName("pysparkYieldData")
sc = SparkContext(conf=conf)

spark = SparkSession(sc)


In [3]:
! hadoop fs -ls hdfs://node-master:9000/user/$USER

Found 1 items
drwxr-xr-x   - root supergroup          0 2021-09-04 10:21 hdfs://node-master:9000/user/root


In [None]:
! ls $PWD/

In [None]:
! hadoop fs -put -f $PWD/datasets/input_data.gz


In [None]:
! unzip /root/lab/datasets/GeoLite2-Country-CSV_20210831.zip

In [None]:
! unzip /root/lab/datasets/GeoLite2-City-CSV_20210831.zip

In [None]:
! hadoop fs -put -f $PWD/datasets/GeoLite2-Country-CSV_20210831/GeoLite2-Country-Blocks-IPv4.csv


In [None]:
! hadoop fs -put -f $PWD/datasets/GeoLite2-City-CSV_20210831/GeoLite2-City-Blocks-IPv4.csv

In [None]:
! hadoop fs -put -f $PWD/datasets/GeoLite2-City-CSV_20210831/GeoLite2-City-Locations-en.csv

In [None]:
! hadoop fs -put -f $PWD/datasets/GeoLite2-Country-CSV_20210831/GeoLite2-Country-Locations-en.csv

### GZ File

We can now see it as part of the HDFS

In [None]:
! hadoop fs -ls hdfs://node-master:9000/user/$USER/*

In [None]:
! hadoop fs -ls hdfs://node-master:9000/user/root/input_data.gz

In [4]:
parse_file = sc.textFile("input_data.gz")


In [5]:
parts = parse_file.map(lambda l: l.split("\t"))


In [6]:
from pyspark.sql import Row

user_activity = parts.map(lambda p: Row(
    date=p[0], 
    time=(p[1]),
    user_id=(p[2]),
    url=(p[3]),
    ip=(p[4]),
    user_agent_str=(p[5]),

))


In [7]:
schemaUsers = spark.createDataFrame(user_activity)


In [8]:
schemaUsers.printSchema

<bound method DataFrame.printSchema of DataFrame[date: string, time: string, user_id: string, url: string, ip: string, user_agent_str: string]>

In [9]:
schemaUsers.show()

+----------+--------+--------------------+--------------------+--------------------+--------------------+
|      date|    time|             user_id|                 url|                  ip|      user_agent_str|
+----------+--------+--------------------+--------------------+--------------------+--------------------+
|2014-10-12|17:01:01|f4fdd9e55192e9475...|http://6f2a9cab64...|       94.11.238.152|Mozilla/5.0 (iPad...|
|2014-10-12|17:01:01|0ae53126499336757...|http://8eb4ac417c...|       92.238.71.109|Mozilla/5.0 (iPad...|
|2014-10-12|17:01:01|c5ac174ee153f7e57...|https://1415d3778...|         2.26.44.196|Mozilla/5.0 (Linu...|
|2014-10-12|17:01:01|2d86766f9908fde41...|http://47e1f0cca5...|194.81.33.57, 66....|Mozilla/5.0 (Linu...|
|2014-10-12|17:01:01|3938fffe5c0a131f5...|https://978c17aed...|      109.152.120.12|Mozilla/5.0 (Wind...|
|2014-10-12|17:01:01|88eb65d5f952f3bf5...|http://38d6db9ae3...|         2.28.82.212|Mozilla/5.0 (iPad...|
|2014-10-12|17:01:01|068d17d3e73ea7aac...|http

### Function to retrieve geostats

In [10]:
from pyspark.sql.functions import split
unique_ip_addresses=schemaUsers.withColumn('ip', split(schemaUsers['ip'], ', ')[0]).select("ip").distinct()

unique_addresses_formatted = unique_ip_addresses.select(unique_ip_addresses.columns[0])
# unique_ip_addresses=schemaUsers.select("ip").distinct().withColumn('ip', split(schemaUsers['ip'], ', '))

unique_addresses_formatted.count()


24667

In [11]:
unique_addresses_formatted.printSchema()

root
 |-- ip: string (nullable = true)



In [12]:
unique_addresses_formatted.write.format("text").mode("Overwrite").save("unique_ip_addresses.txt")

In [13]:
! hadoop dfs -ls hdfs://node-master:9000/user/$USER/*


Found 7 items
drwxr-xr-x   - root supergroup          0 2021-09-04 10:24 hdfs://node-master:9000/user/root/.sparkStaging
-rw-r--r--   2 root supergroup  221476423 2021-09-04 09:56 hdfs://node-master:9000/user/root/GeoLite2-City-Blocks-IPv4.csv
-rw-r--r--   2 root supergroup   11952196 2021-09-04 09:56 hdfs://node-master:9000/user/root/GeoLite2-City-Locations-en.csv
-rw-r--r--   2 root supergroup   12829438 2021-09-04 09:56 hdfs://node-master:9000/user/root/GeoLite2-Country-Blocks-IPv4.csv
-rw-r--r--   2 root supergroup       9889 2021-09-04 09:56 hdfs://node-master:9000/user/root/GeoLite2-Country-Locations-en.csv
-rw-r--r--   2 root supergroup    7866234 2021-09-04 10:21 hdfs://node-master:9000/user/root/input_data.gz
drwxr-xr-x   - root supergroup          0 2021-09-04 10:24 hdfs://node-master:9000/user/root/unique_ip_addresses.txt


In [14]:
! hadoop dfs -ls 


Found 7 items
drwxr-xr-x   - root supergroup          0 2021-09-04 10:24 .sparkStaging
-rw-r--r--   2 root supergroup  221476423 2021-09-04 09:56 GeoLite2-City-Blocks-IPv4.csv
-rw-r--r--   2 root supergroup   11952196 2021-09-04 09:56 GeoLite2-City-Locations-en.csv
-rw-r--r--   2 root supergroup   12829438 2021-09-04 09:56 GeoLite2-Country-Blocks-IPv4.csv
-rw-r--r--   2 root supergroup       9889 2021-09-04 09:56 GeoLite2-Country-Locations-en.csv
-rw-r--r--   2 root supergroup    7866234 2021-09-04 10:21 input_data.gz
drwxr-xr-x   - root supergroup          0 2021-09-04 10:24 unique_ip_addresses.txt


In [15]:
city_file='GeoLite2-City-Blocks-IPv4.csv'
city_ip_df=spark.read.csv(city_file, header=True)
city_ip_df.first()

Row(network='1.0.0.0/24', geoname_id='2077456', registered_country_geoname_id='2077456', represented_country_geoname_id=None, is_anonymous_proxy='0', is_satellite_provider='0', postal_code=None, latitude='-33.4940', longitude='143.2104', accuracy_radius='1000')

In [16]:
country_file='GeoLite2-Country-Blocks-IPv4.csv'
country_ip_df=spark.read.csv(country_file, header=True)
country_ip_df.first()


Row(network='1.0.0.0/24', geoname_id='2077456', registered_country_geoname_id='2077456', represented_country_geoname_id=None, is_anonymous_proxy='0', is_satellite_provider='0')

In [17]:
country_nm_file='GeoLite2-Country-Locations-en.csv'
country_nm_df=spark.read.csv(country_nm_file, header=True)
country_nm_df.first()


Row(geoname_id='49518', locale_code='en', continent_code='AF', continent_name='Africa', country_iso_code='RW', country_name='Rwanda', is_in_european_union='0')

In [18]:
city_nm_file='GeoLite2-City-Locations-en.csv'
city_nm_df=spark.read.csv(city_nm_file, header=True)
city_nm_df.first()

Row(geoname_id='1392', locale_code='en', continent_code='AS', continent_name='Asia', country_iso_code='IR', country_name='Iran', subdivision_1_iso_code='02', subdivision_1_name='Māzandarān', subdivision_2_iso_code=None, subdivision_2_name=None, city_name='Shahr', metro_code=None, time_zone='Asia/Tehran', is_in_european_union='0')

In [19]:
import pyspark.sql.functions as f


In [20]:
ip_to_location_df=country_ip_df.join(city_ip_df, country_ip_df.network == city_ip_df.network, "inner") \
   .join(country_nm_df,country_ip_df.geoname_id == country_nm_df.geoname_id,"inner") \
   .join(city_nm_df, city_ip_df.geoname_id == city_nm_df.geoname_id,"inner") \
   .select(country_ip_df.network, country_nm_df.country_name, city_nm_df.city_name)


In [21]:
formatted_lookup_df=ip_to_location_df.withColumn("network", f.split(country_ip_df['network'], '.0\/')[0])


In [22]:
formatted_lookup_df.first()


Row(network='1.0.0', country_name='Australia', city_name=None)

In [23]:
formatted_lookup_df.select("network").distinct().count()

# df
#   .select("country")
#   .distinct
#   .withColumn("country", concat(col("country"), lit(" is fun!")))
#   .show()


279442

In [29]:
formatted_lookup_df \
    .write.format("mongo") \
    .mode("append") \
    .options(uri=f"{MY_MONGO}", database="yield", collection="geoip") \
    .save()


In [None]:
from pymongo import MongoClient
# pprint library is used to make the output look more pretty
from pprint import pprint
# connect to MongoDB, change the << MONGODB URL >> to reflect your own connection string
client = MongoClient('mongodb://root:yurt@mongo:27017/')
db=client.admin
# Issue the serverStatus command and print the results
serverStatusResult=db.command("serverStatus")
pprint(serverStatusResult)


In [None]:
! pip install pgeocode

In [None]:
! pip --version

In [None]:
! export URL_1="https://download.maxmind.com/app/geoip_download_by_token?edition_id=GeoLite2-Country-CSV&date=20210831&suffix=zip&token=v2.local.zkJeIzHOqnNOtHqMT8hnpq_kxp5D8Rw3SDCRxlnkDbg8z3uzOtjxtLZqsqgR1OV9A1QfnFNWixb1UQtbZzYx9Kbcfylx8WbOWhmjlLQFIS4Eq_BrsRDI1kH766K9a0I40B9wSgmft_YUwxnJjCfS1jgPqGn9lBYDLXTJg_wYzTh2lSsEbIswtU19Al8XQN2zjVf0OQ" ; wget $URL_1 -O /tmp/geo_country.csv

In [None]:
! export URL_2="https://download.maxmind.com/app/geoip_download_by_token?edition_id=GeoLite2-City-CSV&date=20210831&suffix=zip&token=v2.local.11khj64wvdOZw1QcovhkAa4V2Nf4NHq7CG5TFWV7BKQ9vYRjJnhDxCL_TKpyYc9g_Yv9ZxnwYeH5hj920zkA-rabTt3wPN5Y8ebwiaJPJSNQgK0L1yYiKVvDWQw1VYW_EitmtCfwRseY52LjvFWZec4xIX_K-OUMZvB5H-nSIyiNF_38PgTrReDtF75GIGEP4ywjpw" ; wget $URL_2 -O /tmp/geo_city.csv

In [None]:
! hadoop fs -put /tmp/*.csv /user/root


In [None]:
! hadoop fs -ls /user/root/*csv


## Twighlight Zone

In [None]:
import geoip2.webservice

# This reader object should be reused across lookups as creation of it is
# expensive.
accid="yurty"
key="yurt"
with geoip2.webservice.Client(accid, key) as client:
    response = client.city('203.0.113.0')
    print(response.country.name)


In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import requests
api_key="yurt"

# @udf
def get_country_from_ip(ip_address):
    url= f"http://api.ipapi.com/{ip_address}?access_key={api_key}"
    r = requests.get(url)
    ip_response_json=r.json()

    country=ip_response_json['country_name']
    city=ip_response_json['city']
    return (country, city)




In [None]:
! touch maxmind-database.mmdb

In [None]:
! pip install geoip2

### Expensive! But hey..

In [None]:
# unique_ip_addresses.rdd.map(lambda row: row.asDict())


ip_array = [str(row.ip) for row in unique_ip_addresses.collect()]

ip_list=[]
for ip in ip_array:
    if ', ' in ip:
        split_ip=ip.split(', ')
        ip_list.append({ip: get_country_from_ip(split_ip[0])})
        ip_list.append({ip: get_country_from_ip(split_ip[1])})

    else:
        ip_list.append({ip: get_country_from_ip(ip)})
        
len(ip_list)

#### Failed on hitting limit API License for 24 k requests


### Cheaper but time consuming..

### Unique IP Addresses

In [None]:
get_country_from_ip = udf(get_country_from_ip, ArrayType(StringType()))


### Computer says no...

In [None]:
unique_ip_addresses.select(*[get_country_from_ip('ip')])

In [None]:
ip_geo_df = unique_ip_addresses.select(get_country_from_ip("ip").alias("geolocation"))


In [None]:
ip_geo_df.show()

In [None]:
# for ip in unique_ip_addresses_list:
#     print('{}: {}'.format(type(ip), ip))

dict(unique_ip_addresses_list)

In [None]:
import requests
# api_key=""
ip_address="188.141.30.136"

In [None]:
get_ip_details(ip_address)

In [None]:
ip_response_json