<a href="https://colab.research.google.com/github/pstorniolo/Master2021/blob/main/2021_11_01_MongoDB_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch ipecho.net/plain, which presumably echoes this runtime's public IP address.
# NOTE(review): likely needed to allow-list the VM in the MongoDB cluster's network access rules — confirm.
!curl ipecho.net/plain

In [None]:
# Install Spark 3.2.0 - JDK11
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!rm -f *.tgz

import os
os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

!pip -q install findspark
!pip install -q dnspython

import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pymongo import MongoClient

In [None]:
# --- MongoDB connection settings ---
# The base connection string typically embeds the credentials, so it is read
# from the MONGO_URL environment variable instead of being stored in the notebook.
# The database name is appended directly to it, so it presumably has to end
# with "/" (e.g. "mongodb://user:password@host:27017/").
mongoURL = os.environ.get("MONGO_URL", "")
mongoParam = "?authSource=admin&replicaSet=atlas-1wdpuy-shard-0&readPreference=primary"
mongoDB = "hadoop"

# Fail here with a clear message rather than later, on the first read/write,
# with an obscure connector error caused by an invalid URI.
if not mongoURL:
    raise ValueError(
        "MongoDB connection string is missing: set the MONGO_URL environment "
        "variable and re-run this cell."
    )

URI = mongoURL + mongoDB + mongoParam
# Do not print URI itself: it would store the credentials in the cell output.
print("MongoDB database:", mongoDB)

# Local Spark session with the MongoDB connector; the same URI is used as the
# default source and destination of every "mongo" read/write in the notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Spark_MongoDB")
    .config("spark.mongodb.input.uri", URI)
    .config("spark.mongodb.output.uri", URI)
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

# Shortcuts used by later cells.
sc = spark.sparkContext
sql = spark.sql

print(spark.version)

# Accesso al cluster MongoDB

In [None]:
# Sample (name, age) rows; Bombur has no age (None).
people = spark.createDataFrame(
    [
        ("Bilbo Baggins", 50),
        ("Gandalf", 1000),
        ("Thorin", 195),
        ("Balin", 178),
        ("Kili", 77),
        ("Dwalin", 169),
        ("Oin", 167),
        ("Gloin", 158),
        ("Fili", 82),
        ("Bombur", None),
    ],
    ["name", "age"],
)

In [None]:
# Print the rows of the people DataFrame as a text table.
people.show()

## Write to MongoDB

In [None]:
# Replace the "people" collection with the DataFrame, addressing the
# connector through its fully qualified data source class name.
(
    people.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("collection", "people")
    .mode("overwrite")
    .save()
)

In [None]:
# Append the same DataFrame to the "people" collection, this time using the
# short "mongo" format name. Appending rows that are already stored adds them again.
(
    people.write
    .format("mongo")
    .option("collection", "people")
    .mode("append")
    .save()
)

## Read from MongoDB

In [None]:
# Load the "people" collection into a DataFrame and inspect schema and content.
df = (
    spark.read
    .format("mongo")
    .option("collection", "people")
    .load()
)
df.printSchema()
df.show(truncate=False)

In [None]:
# Load the "people" collection again, discarding the _id column.
df = (
    spark.read
    .format("mongo")
    .option("collection", "people")
    .load()
    .drop("_id")
)
df.printSchema()
df.select("name", "age").show()

In [None]:
# Rebuild `people` from the data read back from MongoDB (name and age only).
people = df.select("name", "age")
people.show()

---

## New Collection

In [None]:
# Three sample documents with explicit _id values; the schema is inferred
# by Spark from the dictionaries.
data = [
    {"_id": 1, "type": "apple", "qty": 5},
    {"_id": 2, "type": "orange", "qty": 10},
    {"_id": 3, "type": "banana", "qty": 15},
]
fruit = spark.createDataFrame(data)
fruit.printSchema()
fruit.show()

In [None]:
# Store the fruit DataFrame in the "fruit" collection, replacing any previous content.
(
    fruit.write
    .format("mongo")
    .option("collection", "fruit")
    .mode("overwrite")
    .save()
)


In [None]:
# Read the "fruit" collection back and inspect it.
df = (
    spark.read
    .format("mongo")
    .option("collection", "fruit")
    .load()
)
df.printSchema()
df.show()

## Aggregation Pipeline

https://docs.mongodb.com/manual/core/aggregation-pipeline/

In [None]:
# Aggregation stage handed to the connector through the "pipeline" option:
# keep only the documents whose type is "apple".
pipeline = "{'$match': {'type': 'apple'}}"
df = (
    spark.read
    .format("mongo")
    .option("collection", "fruit")
    .option("pipeline", pipeline)
    .load()
)
df.show()

In [None]:
# Reload the whole "fruit" collection (no pipeline this time).
df = (
    spark.read
    .format("mongo")
    .option("collection", "fruit")
    .load()
)

# Same kind of selection expressed as a Spark filter: rows with qty >= 10.
df.where(df.qty >= 10).show()

# The source DataFrame itself is unchanged by the filter.
df.show()

In [None]:
# Expose the DataFrame to Spark SQL under the name "temp" and query it:
# fruit whose type contains the letter "e".
df.createOrReplaceTempView("temp")
some_fruit = spark.sql("SELECT type, qty FROM temp WHERE type LIKE '%e%'")
some_fruit.show()

# Caricamento da Open**Data**
https://dati.regione.sicilia.it/dataset

## Definizione di carica_dati

In [None]:
import requests

def carica_dati(url,collection,mode):
   """Download a zipped CSV from ``url`` and store its rows in a MongoDB collection.

   The archive is saved in the working directory, its CSV is extracted and
   read with Spark (``;`` separator, header row, inferred schema), the
   resulting DataFrame is written to MongoDB and the temporary files are
   removed, even when one of the steps fails.

   Args:
      url: Address of the ``.zip`` archive; the last path segment is used as
         the local file name.
      collection: Name of the destination MongoDB collection.
      mode: Spark save mode handed to ``DataFrameWriter.mode`` (this notebook
         uses "overwrite" and "append").

   Raises:
      requests.HTTPError: If the server replies with an error status.
      FileNotFoundError: If the archive does not contain any CSV file.
   """
   import zipfile  # local import: only this function needs it

   file_name = url.split('/')[-1]
   # Expected member name: archive "<name>.zip" is assumed to hold "<name>.csv".
   input_file = file_name.split('.')[0]+'.csv'
   extracted = None

   r = requests.get(url, allow_redirects=True)
   # Stop on HTTP errors instead of saving an error page as if it were the archive.
   r.raise_for_status()
   try:
      with open(file_name,"wb") as f:
         f.write(r.content)
      # Extract with the standard library rather than a shell command built
      # by string concatenation from a name derived from the URL.
      with zipfile.ZipFile(file_name) as archive:
         members = archive.namelist()
         if input_file not in members:
            # Expected name missing: fall back to the first CSV in the archive.
            csv_members = [m for m in members if m.lower().endswith('.csv')]
            if not csv_members:
               raise FileNotFoundError('No CSV file found in '+file_name)
            input_file = csv_members[0]
         extracted = archive.extract(input_file)
      #Create DataFrame
      print('Lettura di < '+input_file)
      arpa = spark.read.csv(extracted,sep=";",header=True,inferSchema=True)
      print('Salvataggio in > '+collection, '\t-\tModo: '+mode)
      #Save dataframe to MongoDB
      arpa.write.format("com.mongodb.spark.sql.DefaultSource").option("collection",collection).mode(mode).save()
   finally:
      # Remove the downloaded archive and the extracted CSV in every case.
      for path in (file_name, extracted):
         if path and os.path.exists(path):
            os.remove(path)


## Caricamento anagrafica

In [None]:
# Zipped CSV from the Regione Sicilia open-data portal; judging by its name it
# holds the registry of the air-quality monitoring stations.
url = 'https://dati.regione.sicilia.it/download/dataset/arpa-qualita-aria-anagrafica/filesystem/arpa-qualita-aria-anagrafica-stazioni_csv.zip'

# Download it and store it in the "stazioni" collection, replacing previous content.
carica_dati(url,"stazioni","overwrite")

In [None]:
# Zipped CSV from the same portal; judging by its name it holds the registry
# of the measured pollutants.
url = 'https://dati.regione.sicilia.it/download/dataset/arpa-qualita-aria-anagrafica/filesystem/arpa-qualita-aria-anagrafica-inquinanti_csv.zip'

# Download it and store it in the "inquinanti" collection, replacing previous content.
carica_dati(url,"inquinanti","overwrite")

In [None]:
# Read the "stazioni" collection (without _id) and list up to 40 rows ordered by station id.
df = (
    spark.read
    .format("mongo")
    .option("collection", "stazioni")
    .load()
    .drop('_id')
)
df.printSchema()
df.orderBy("stazione_id").show(40, truncate=False)

In [None]:
# Keep only station id and name, ordered by id.
stazioni = df.select("stazione_id", "stazione_nome").orderBy("stazione_id")
stazioni.show(truncate=False)

In [None]:
# Turn the two-column DataFrame into (id, name) pairs and collect them
# into a plain Python dict on the driver.
st = stazioni.rdd.map(lambda row: (row[0], row[1]))
stazione = st.collectAsMap()
# Station ids, i.e. the keys of the lookup dict.
id_st = list(stazione)
print(id_st)

In [None]:
# Read the "inquinanti" collection (without _id) and inspect it.
df = (
    spark.read
    .format("mongo")
    .option("collection", "inquinanti")
    .load()
    .drop('_id')
)
df.printSchema()
df.show(truncate=False)

## Creazione indice

In [None]:
# Keep pollutant id, symbol and description, ordered by id.
inquinanti = (
    df.select("inquinante_id", "inquinante_simbolo", "inquinante_descrizione")
    .orderBy("inquinante_id")
)
inquinanti.show(truncate=False)

In [None]:
# Pair each pollutant id (first column) with its symbol (second column).
iq = inquinanti.rdd.map(lambda row: (row[0], row[1]))
# Show the type of the resulting object.
type(iq)

In [None]:
# Collect the (id, symbol) pairs into a Python dict on the driver.
inquinante = iq.collectAsMap()
# Pollutant ids, i.e. the keys of the lookup dict.
id_iq = list(inquinante)
print(id_iq)

In [None]:
# Print every pollutant id next to the value stored for it in the lookup dict.
for i in id_iq:
  print(i,'\t',inquinante.get(i))

In [None]:
# Print every station id next to the value stored for it in the lookup dict.
for i in id_st:
  print(i,'\t',stazione.get(i))

## Caricamento dati

In [None]:
# Common prefix of the 2019 air-quality archives; each file name is completed
# with an index value followed by "_csv.zip".
file_base = 'https://dati.regione.sicilia.it/download/dataset/arpa-qualita-aria-2019/filesystem/arpa-qualita-aria-2019-'

# NOTE(review): `indice` is not defined in any visible cell, so this loop raises
# NameError on a fresh kernel. It presumably should be one of the id lists built
# above (id_st or id_iq) — confirm which one matches the file names on the portal.
for i in indice:
  url = file_base+str(i)+'_csv.zip'
  print(url)
  # The actual load is disabled: only the URLs are printed. Enabling the line
  # below would append each file to the "aria" collection.
  #carica_dati(url,"aria","append")


## Lettura e verifica

In [None]:
# Read the "aria" collection (without _id) and show its schema.
df = (
    spark.read
    .format("mongo")
    .option("collection", "aria")
    .load()
    .drop("_id")
)
df.printSchema()

In [None]:
# Rows whose pollutant id is 5 or 6001: preview them, then count them
# (the count is the cell's displayed result).
df.where(df["inquinante_id"].isin([5, 6001])).show()
df.where(df["inquinante_id"].isin([5, 6001])).count()

## Cambio indice <-> etichetta

In [None]:
# Replace the first field of every row with its entry in the `inquinante` dict
# and the sixth field with its entry in the `stazione` dict, keeping the
# original column names. Ids missing from a dict become None (dict.get).
# NOTE(review): fields are addressed by position; this assumes column 0 is the
# pollutant id, column 5 is the station id and that there are exactly six
# columns — confirm against the printed schema.
aria = df.rdd.map(lambda x: (inquinante.get(x[0]),x[1],x[2],x[3],x[4],stazione.get(x[5]))).toDF(df.columns)
aria.printSchema()
aria.show()

## Selezione in MongoDB (Pipeline)

In [None]:
# Aggregation stage handed to the connector through the "pipeline" option:
# keep only the documents with inquinante_id 6001.
pipeline = "{'$match': {'inquinante_id': 6001}}"

df_pm2_5 = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("collection", "aria")
    .option("pipeline", pipeline)
    .load()
    .drop('_id')
)
df_pm2_5.printSchema()
df_pm2_5.show()

## Selezione in Spark

In [None]:
# The same selection (inquinante_id 6001) expressed as a Spark filter on the full DataFrame.
df.where(df.inquinante_id == 6001).show()

## Stop Session

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()