<a href="https://colab.research.google.com/github/pstorniolo/Master2021/blob/main/2021_10_30_Spark_Examples_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Spark 3.2.0 - JDK11
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!rm -f *.tgz

import os
os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

#Install findspark using pip to make pyspark importable as regular library
!pip -q install findspark
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .enableHiveSupport() \
    .getOrCreate()

sc = spark.sparkContext
sql = spark.sql

print("\nApache Spark version: ", spark.version)

# Column to Map

In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Sample employee rows laid out as (id, dept, salary, location).
data = [
    ("36636", "Finance", 3000, "USA"),
    ("40288", "Finance", 5000, "IND"),
    ("42114", "Sales", 3900, "USA"),
    ("39192", "Marketing", 2500, "CAN"),
    ("34534", "Sales", 6500, "USA"),
]

# Explicit schema: every field is nullable and only salary is an integer.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('id', StringType()),
        ('dept', StringType()),
        ('salary', IntegerType()),
        ('location', StringType()),
    )
])

df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

## Convert columns to Map


In [None]:
from pyspark.sql.functions import col,lit,create_map

# Fold the salary and location columns into a single map column keyed by the
# original column names, then drop the two source columns.
#
# The result is bound to its own name instead of rebinding `df`: with
# `df = df.withColumn(...).drop(...)` the cell could only run once, because on
# a second run `df` no longer contained the salary/location columns it reads.
df_props = (
    df.withColumn(
        "propertiesMap",
        create_map(
            lit("salary"), col("salary"),
            lit("location"), col("location"),
        ),
    )
    .drop("salary", "location")
)
df_props.printSchema()
df_props.show(truncate=False)

---

In [None]:
# Each record pairs a name with a dict of attributes. Note the two edge cases:
# Michael's 'eye' is None and Jefferson's 'eye' is an empty string.
dataDictionary = [
    ('James', dict(hair='black', eye='brown')),
    ('Michael', dict(hair='brown', eye=None)),
    ('Robert', dict(hair='red', eye='black')),
    ('Washington', dict(hair='grey', eye='grey')),
    ('Jefferson', dict(hair='brown', eye='')),
]

# Only the column names are passed as schema here; no column types are declared.
df = spark.createDataFrame(dataDictionary, ['name', 'properties'])
df.printSchema()
df.show(truncate=False)

## Map to columns


In [None]:
# Flatten on the RDD side: read the two known keys out of each row's
# properties dict and emit a plain (name, hair, eye) tuple per row.
df3 = (
    df.rdd
    .map(lambda row: (row.name, row.properties["hair"], row.properties["eye"]))
    .toDF(["name", "hair", "eye"])
)
df3.printSchema()
df3.show()

In [None]:
# Variant 1: read the map entries with Column.getItem().
(
    df.withColumn("hair", df.properties.getItem("hair"))
    .withColumn("eye", df.properties.getItem("eye"))
    .drop("properties")
    .show()
)

# Variant 2: the same lookups written with bracket indexing on the column.
(
    df.withColumn("hair", df.properties["hair"])
    .withColumn("eye", df.properties["eye"])
    .drop("properties")
    .show()
)

*Functions*

In [None]:
from pyspark.sql.functions import explode,map_keys,col

# Gather every distinct key that appears in any row's properties map.
keysDF = df.select(explode(map_keys(df.properties))).distinct()
# distinct() makes no promise about row order, so sort the keys to keep the
# generated column order stable between runs. Collect straight from the
# DataFrame instead of detouring through the RDD API.
keysList = sorted(row[0] for row in keysDF.collect())
# Build one aliased column per key, each reading its entry out of the map.
keyCols = [col("properties").getItem(key).alias(str(key)) for key in keysList]
df.select(df.name, *keyCols).show()

# DataFrame Dictionary

In [None]:
# (name, attributes) records; 'eye' is None for Michael and "" for Jefferson.
dataDictionary = [
    ("James", {"hair": "black", "eye": "brown"}),
    ("Michael", {"hair": "brown", "eye": None}),
    ("Robert", {"hair": "red", "eye": "black"}),
    ("Washington", {"hair": "grey", "eye": "grey"}),
    ("Jefferson", {"hair": "brown", "eye": ""}),
]

# The schema argument carries column names only.
column_names = ["name", "properties"]
df = spark.createDataFrame(dataDictionary, column_names)
df.printSchema()
df.show(truncate=False)

## Using StructType schema


In [None]:
from pyspark.sql.types import StructField, StructType, StringType, MapType,IntegerType

# Declare the schema explicitly: a nullable string `name` and a nullable
# string-to-string map `properties`.
schema = StructType(
    [
        StructField('name', StringType(), True),
        StructField('properties', MapType(StringType(), StringType()), True),
    ]
)

df2 = spark.createDataFrame(dataDictionary, schema)
df2.printSchema()
df2.show(truncate=False)

In [None]:
# Turn each row into a (name, hair, eye) tuple on the RDD, then rebuild a
# DataFrame with those three column names.
flattened_rows = df.rdd.map(
    lambda rec: (rec.name, rec.properties["hair"], rec.properties["eye"])
)
df3 = flattened_rows.toDF(["name", "hair", "eye"])
df3.printSchema()
df3.show()

In [None]:
# Lookup through getItem(): add hair/eye columns, drop the map, display.
with_get_item = df.withColumn("hair", df.properties.getItem("hair"))
with_get_item = with_get_item.withColumn("eye", df.properties.getItem("eye"))
with_get_item.drop("properties").show()

# Same result using bracket indexing on the map column.
with_brackets = df.withColumn("hair", df.properties["hair"])
with_brackets = with_brackets.withColumn("eye", df.properties["eye"])
with_brackets.drop("properties").show()

*Functions*

In [None]:
# Functions
from pyspark.sql.functions import explode,map_keys,col

# Distinct set of keys found across all rows' properties maps.
keysDF = df.select(explode(map_keys(df.properties))).distinct()
# Sort the collected keys: distinct() does not guarantee an order, and the
# column order below should not change from run to run. Collecting from the
# DataFrame directly also avoids the extra RDD map step.
keysList = sorted(row[0] for row in keysDF.collect())
# One column per key, aliased with the key's name.
keyCols = [col("properties").getItem(key).alias(str(key)) for key in keysList]
df.select(df.name, *keyCols).show()

# Current Date

In [None]:
# One-row, one-column frame used as a scaffold for the date/time examples.
data = [["1"]]
df = spark.createDataFrame(data, schema=["id"])
df.show()

# NOTE(review): this wildcard import pulls every public name from
# pyspark.sql.functions into the notebook namespace, including ones such as
# sum, min, max and round that replace the Python builtins for the rest of
# the session. Kept as-is because later cells rely on the imported names.
from pyspark.sql.functions import *

## current_date() & current_timestamp()


In [None]:
# Append the current date and the current timestamp as two extra columns.
(
    df.withColumn("current_date", current_date())
    .withColumn("current_timestamp", current_timestamp())
    .show(truncate=False)
)

### SQL


In [None]:
# Same two values, obtained through a Spark SQL query.
current_values = sql("select current_date(), current_timestamp()")
current_values.show(truncate=False)

## Date & Timestamp into custom format


In [None]:
# Render the current date and the current timestamp as strings in a custom
# pattern.
#
# date_format() is the function that applies a pattern to a date/timestamp.
# The previous code called to_timestamp(current_timestamp(), pattern), but
# to_timestamp() uses the pattern to *parse* text; given a value that already
# is a timestamp it hands it back unchanged, so the column was shown in the
# default layout rather than the requested one. The output column keeps its
# original name ("to_timestamp") so the displayed header does not change.
df.withColumn("date_format",date_format(current_date(),"MM-dd-yyyy")) \
  .withColumn("to_timestamp",date_format(current_timestamp(),"MM-dd-yyyy HH mm ss SSS")) \
  .show(truncate=False)

### SQL


In [None]:
# Spark SQL version of the custom-format example.
#
# Both columns go through date_format(): to_timestamp(ts, pattern) only uses
# the pattern when parsing text, so applied to current_timestamp() it left the
# value in the default layout. The column alias is kept as "to_timestamp" so
# the displayed header does not change.
sql(
    "select date_format(current_date(),'MM-dd-yyyy') as date_format, "
    "date_format(current_timestamp(),'MM-dd-yyyy HH mm ss SSS') as to_timestamp"
).show(truncate=False)

# DataFrame repartition

In [None]:
# Single-column frame holding the ids 0..19.
df = spark.range(start=0, end=20)
print(df.rdd.getNumPartitions())

# Write the frame out as CSV under "partition.csv", replacing any previous
# output at that path.
csv_writer = df.write.mode("overwrite")
csv_writer.csv("partition.csv")
df.show()

In [None]:
# Shell escape: long-format listing of the current working directory,
# hidden entries included.
!ls -la

## repartition()

Il metodo **repartition()** di Spark (disponibile sia sugli RDD sia sui DataFrame) viene utilizzato per aumentare o diminuire il numero di partizioni.

In [None]:
# Ask for six partitions and report how many the resulting frame has.
df2 = df.repartition(numPartitions=6)
repartitioned_count = df2.rdd.getNumPartitions()
print(repartitioned_count)
df2.show()

## coalesce()

Il metodo **coalesce()** di Spark (disponibile sia sugli RDD sia sui DataFrame) viene utilizzato solo per ridurre il numero di partizioni.

In [None]:
# Coalesce down to two partitions and report the resulting partition count.
df3 = df.coalesce(numPartitions=2)
coalesced_count = df3.rdd.getNumPartitions()
print(coalesced_count)
df3.show()

## groupBy()

In [None]:
# Count rows per id, then report how many partitions the aggregated frame has.
# NOTE(review): the aggregation presumably involves a shuffle, so the number
# printed here likely reflects Spark's shuffle settings rather than df's own
# partitioning - confirm.
grouped_by_id = df.groupBy("id")
df4 = grouped_by_id.count()
print(df4.rdd.getNumPartitions())
df4.show()