# Initialization

In [1]:
# Point pyspark at the Spark installation under /opt/spark.
# findspark.init() has to run BEFORE the first pyspark import: it works by
# setting SPARK_HOME and extending sys.path, which cannot influence modules
# that have already been imported.
import findspark

findspark.init('/opt/spark')

from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession

# NOTE(review): both URIs embed the MongoDB user name and password in clear
# text. Consider reading them from the environment before sharing the notebook.
# Base connection string (no database), used for the explicit writer options.
MY_MONGO="mongodb://root:yurt@mongo:27017"
# Default input/output URI handed to the Spark Mongo connector configuration.
MONGO_URL="mongodb://root:yurt@mongo:27017/local.yield?authSource=local"


In [2]:
# Spark configuration: application name plus the default MongoDB connector
# input/output URIs.
conf = (
    SparkConf()
    .setAppName("pysparkGeoIp")
    .set("spark.mongodb.input.uri", MONGO_URL)
    .set("spark.mongodb.output.uri", MONGO_URL)
)

# getOrCreate() keeps this cell re-runnable: constructing SparkContext(conf=...)
# a second time in the same kernel raises because only one context may be
# active. If a context already exists it is reused and `conf` is not re-applied
# to it.
sc = SparkContext.getOrCreate(conf=conf)

# Build the session through the public builder API; it attaches to the context
# obtained above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()


In [3]:
# List the /user/$USER directory on the HDFS namenode at node-master:9000.
! hadoop fs -ls hdfs://node-master:9000/user/$USER

Found 1 items
drwxr-xr-x   - root supergroup          0 2021-09-12 09:54 hdfs://node-master:9000/user/root


In [4]:
# Show the contents of the local working directory of the notebook.
! ls $PWD/

Dask-Yarn.ipynb		    GeoLite2-Country-CSV_20210831  datasets
GeoLite2-City-CSV_20210831  PySparkYieldDataOld.ipynb	   spark-warehouse


In [None]:
# Extract the GeoLite2 Country CSV archive next to the zip (under
# /root/lab/datasets). The `hadoop fs -put` cells in this notebook read the
# CSVs from $PWD/datasets/GeoLite2-Country-CSV_20210831/ -- this assumes $PWD
# is /root/lab (TODO confirm).
# -o overwrites existing files without asking; a `!` cell cannot answer
# unzip's interactive "replace?" prompt, so a re-run would otherwise stall.
! unzip -o /root/lab/datasets/GeoLite2-Country-CSV_20210831.zip -d /root/lab/datasets

In [None]:
# Extract the GeoLite2 City CSV archive next to the zip (under
# /root/lab/datasets). The `hadoop fs -put` cells in this notebook read the
# CSVs from $PWD/datasets/GeoLite2-City-CSV_20210831/ -- this assumes $PWD
# is /root/lab (TODO confirm).
# -o overwrites existing files without asking; a `!` cell cannot answer
# unzip's interactive "replace?" prompt, so a re-run would otherwise stall.
! unzip -o /root/lab/datasets/GeoLite2-City-CSV_20210831.zip -d /root/lab/datasets

In [None]:
# Upload the Country IPv4 blocks CSV to HDFS, overwriting an existing copy (-f).
# No destination is given; presumably the file lands in the user's HDFS home
# directory, where the relative paths used by spark.read.csv resolve -- verify.
! hadoop fs -put -f $PWD/datasets/GeoLite2-Country-CSV_20210831/GeoLite2-Country-Blocks-IPv4.csv


In [None]:
# Upload the City IPv4 blocks CSV to HDFS, overwriting an existing copy (-f).
# No destination is given; presumably the file lands in the user's HDFS home
# directory, where the relative paths used by spark.read.csv resolve -- verify.
! hadoop fs -put -f $PWD/datasets/GeoLite2-City-CSV_20210831/GeoLite2-City-Blocks-IPv4.csv

In [None]:
# Upload the English City locations (names) CSV to HDFS, overwriting an
# existing copy (-f). No destination is given; presumably the file lands in the
# user's HDFS home directory -- verify.
! hadoop fs -put -f $PWD/datasets/GeoLite2-City-CSV_20210831/GeoLite2-City-Locations-en.csv

In [None]:
# Upload the English Country locations (names) CSV to HDFS, overwriting an
# existing copy (-f). No destination is given; presumably the file lands in the
# user's HDFS home directory -- verify.
! hadoop fs -put -f $PWD/datasets/GeoLite2-Country-CSV_20210831/GeoLite2-Country-Locations-en.csv

In [None]:
# Download the GeoLite2 Country CSV bundle from MaxMind into /tmp.
# The URL is passed to wget in double quotes so that its `?` and `&` characters
# are never subject to shell globbing or word splitting.
# NOTE(review): the URL embeds a download token -- treat it as a secret and
# avoid committing it. The request asks for suffix=zip, so the payload saved as
# geo_country.csv is presumably a zip archive rather than a CSV -- verify.
! export URL_1="https://download.maxmind.com/app/geoip_download_by_token?edition_id=GeoLite2-Country-CSV&date=20210831&suffix=zip&token=v2.local.zkJeIzHOqnNOtHqMT8hnpq_kxp5D8Rw3SDCRxlnkDbg8z3uzOtjxtLZqsqgR1OV9A1QfnFNWixb1UQtbZzYx9Kbcfylx8WbOWhmjlLQFIS4Eq_BrsRDI1kH766K9a0I40B9wSgmft_YUwxnJjCfS1jgPqGn9lBYDLXTJg_wYzTh2lSsEbIswtU19Al8XQN2zjVf0OQ" ; wget "$URL_1" -O /tmp/geo_country.csv

In [None]:
# Download the GeoLite2 City CSV bundle from MaxMind into /tmp.
# The URL is passed to wget in double quotes so that its `?` and `&` characters
# are never subject to shell globbing or word splitting.
# NOTE(review): the URL embeds a download token -- treat it as a secret and
# avoid committing it. The request asks for suffix=zip, so the payload saved as
# geo_city.csv is presumably a zip archive rather than a CSV -- verify.
! export URL_2="https://download.maxmind.com/app/geoip_download_by_token?edition_id=GeoLite2-City-CSV&date=20210831&suffix=zip&token=v2.local.11khj64wvdOZw1QcovhkAa4V2Nf4NHq7CG5TFWV7BKQ9vYRjJnhDxCL_TKpyYc9g_Yv9ZxnwYeH5hj920zkA-rabTt3wPN5Y8ebwiaJPJSNQgK0L1yYiKVvDWQw1VYW_EitmtCfwRseY52LjvFWZec4xIX_K-OUMZvB5H-nSIyiNF_38PgTrReDtF75GIGEP4ywjpw" ; wget "$URL_2" -O /tmp/geo_city.csv

In [None]:
# Upload every /tmp/*.csv file to /user/root on HDFS.
# -f overwrites files that already exist, so the cell can be re-run and behaves
# like the other `hadoop fs -put -f` cells in this notebook.
! hadoop fs -put -f /tmp/*.csv /user/root


In [None]:
# List the entries under /user/root on HDFS whose names end in "csv".
! hadoop fs -ls /user/root/*csv


In [None]:
# List everything under /user/$USER on the node-master namenode.
# Uses `hadoop fs` like the rest of the notebook; the `hadoop dfs` entry point
# is deprecated and only prints a warning before delegating.
! hadoop fs -ls hdfs://node-master:9000/user/$USER/*

In [None]:
# List the default HDFS directory (no path argument given).
# Uses `hadoop fs` like the rest of the notebook; the `hadoop dfs` entry point
# is deprecated and only prints a warning before delegating.
! hadoop fs -ls

In [None]:
# Load the GeoLite2 City IPv4 blocks CSV, treating its first line as the
# header, and peek at the first row.
city_file = 'GeoLite2-City-Blocks-IPv4.csv'
city_ip_df = spark.read.option("header", True).csv(city_file)
city_ip_df.first()

In [None]:
# Load the GeoLite2 Country IPv4 blocks CSV, treating its first line as the
# header, and peek at the first row.
country_file = 'GeoLite2-Country-Blocks-IPv4.csv'
country_ip_df = spark.read.option("header", True).csv(country_file)
country_ip_df.first()


In [None]:
# Load the English GeoLite2 Country locations (names) CSV, treating its first
# line as the header, and peek at the first row.
country_nm_file = 'GeoLite2-Country-Locations-en.csv'
country_nm_df = spark.read.option("header", True).csv(country_nm_file)
country_nm_df.first()


In [None]:
# Load the English GeoLite2 City locations (names) CSV, treating its first
# line as the header, and peek at the first row.
city_nm_file = 'GeoLite2-City-Locations-en.csv'
city_nm_df = spark.read.option("header", True).csv(city_nm_file)
city_nm_df.first()

In [None]:
# Build the network -> (country name, city name) lookup:
#   1. pair country and city block rows that share the same network,
#   2. resolve the country block's geoname_id to a country name,
#   3. resolve the city block's geoname_id to a city name,
#   4. keep only the network and the two names.
# All joins are inner joins, so rows without a match on any step are dropped.
ip_to_location_df = (
    country_ip_df
    .join(city_ip_df, on=country_ip_df.network == city_ip_df.network, how="inner")
    .join(country_nm_df, on=country_ip_df.geoname_id == country_nm_df.geoname_id, how="inner")
    .join(city_nm_df, on=city_ip_df.geoname_id == city_nm_df.geoname_id, how="inner")
    .select(country_ip_df.network, country_nm_df.country_name, city_nm_df.city_name)
)


In [None]:
# `f` is not imported anywhere in the visible notebook, so import it here to
# keep the cell runnable on a fresh kernel.
from pyspark.sql import functions as f

# Replace the CIDR string with its leading part, e.g. "1.0.0.0/24" -> "1.0.0".
# The pattern is a raw string with an escaped dot, so it only matches a literal
# ".0/" (an unescaped dot would also match e.g. the "80/" in "2.16.10.80/28").
# Networks that do not contain ".0/" are left unchanged.
formatted_lookup_df = ip_to_location_df.withColumn(
    "network", f.split(country_ip_df['network'], r'\.0/')[0]
)


In [None]:
# Peek at the first row of the lookup to check the rewritten network column.
formatted_lookup_df.first()


In [None]:
# Count the distinct values in the rewritten network column.
formatted_lookup_df.select("network").distinct().count()



In [None]:
# This cell rebuilds formatted_lookup_df exactly as an earlier cell does.
# `f` is not imported anywhere in the visible notebook, so import it here to
# keep the cell runnable on a fresh kernel.
from pyspark.sql import functions as f

# Replace the CIDR string with its leading part, e.g. "1.0.0.0/24" -> "1.0.0".
# The pattern is a raw string with an escaped dot, so it only matches a literal
# ".0/" (an unescaped dot would also match e.g. the "80/" in "2.16.10.80/28").
# Networks that do not contain ".0/" are left unchanged.
formatted_lookup_df = ip_to_location_df.withColumn(
    "network", f.split(country_ip_df['network'], r'\.0/')[0]
)


In [None]:
# Append the lookup rows to the "geoip" collection of the "yield" database
# through the "mongo" Spark data source, connecting with MY_MONGO.
# NOTE: mode("append") adds the rows again every time the cell is run.
(
    formatted_lookup_df.write
    .format("mongo")
    .mode("append")
    .option("uri", MY_MONGO)
    .option("database", "yield")
    .option("collection", "geoip")
    .save()
)


### Analysis

In [None]:
from pyspark.sql.functions import split

# NOTE(review): `schemaUsers` is not defined in the visible part of this
# notebook -- it has to exist in the kernel before this cell runs.
# The `ip` column may hold several comma-separated addresses; keep only the
# first one of each row, then de-duplicate.
unique_ip_addresses = (
    schemaUsers
    .withColumn('ip', split(schemaUsers['ip'], ', ')[0])
    .select("ip")
    .distinct()
)

# Select the first (and, after the select above, only) column by position.
first_column = unique_ip_addresses.columns[0]
unique_addresses_formatted = unique_ip_addresses.select(first_column)

# Number of distinct first-listed IP addresses.
unique_addresses_formatted.count()


In [None]:
# Rebuild the distinct list of first-listed IP addresses. This relies on
# `split` and `schemaUsers` already being present in the kernel.
unique_ip_addresses = (
    schemaUsers
    .withColumn('ip', split(schemaUsers['ip'], ', ')[0])
    .select("ip")
    .distinct()
)


In [None]:
# Print the schema of the unique-IP DataFrame.
unique_addresses_formatted.printSchema()

In [None]:
# Write the unique IPs as plain text, replacing any previous output.
# Spark creates a directory with this name containing part files, not a
# single file.
unique_addresses_formatted.write.mode("Overwrite").text("unique_ip_addresses.txt")

In [None]:
# pprint makes the nested status document readable.
from pprint import pprint

from pymongo import MongoClient

# Connect to the MongoDB instance on host "mongo".
# NOTE(review): the connection string embeds the user name and password.
client = MongoClient('mongodb://root:yurt@mongo:27017/')
db = client["admin"]

# Run the serverStatus command against the admin database and print the result.
serverStatusResult = db.command("serverStatus")
pprint(serverStatusResult)
