<a href="https://colab.research.google.com/github/pstorniolo/Master2021/blob/main/2021_10_28_Spark_Examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Spark 3.2.0 - JDK11
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!rm -f *.tgz

import os
os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

#Install findspark using pip to make pyspark importable as regular library
!pip -q install findspark
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .enableHiveSupport() \
    .getOrCreate()

sc = spark.sparkContext
sql = spark.sql

print("\nApache Spark version: ", spark.version)

## Pandas

In [None]:
import pandas as pd

# Column labels for the sample records below.
columns = ["First Name", "Middle Name", "Last Name", "Age", "Gender", "Salary"]

# One list per person; missing values appear either as "" or as None.
data = [
    ["James", "", "Smith", 30, "M", 60000],
    ["Michael", "Rose", "", 50, "M", 70000],
    ["Robert", "", "Williams", 42, "", 400000],
    ["Maria", "Anne", "Jones", 38, "F", 500000],
    ["Jen", "Mary", "Brown", 45, None, 0],
]

In [None]:
# Build the pandas DataFrame from the row lists and the column labels.
pandasDF = pd.DataFrame(data, columns=columns)

In [None]:
# Print the whole DataFrame to the cell output.
print(pandasDF)

In [None]:
# Per-column count of non-missing values, printed as a Series.
pdCount = pandasDF.count()
print(pdCount)

In [None]:
# Column-wise maximum of the DataFrame.
# NOTE(review): the Gender column mixes strings with None - confirm that max()
# accepts it on the installed pandas version.
print(pandasDF.max())

In [None]:
# Column-wise mean. numeric_only=True restricts the reduction to the numeric
# columns (Age, Salary); without it pandas >= 2.0 raises a TypeError on the
# string columns instead of skipping them.
print(pandasDF.mean(numeric_only=True))

## Convert Column to Python List

### Example 1

In [None]:
# Four sample people: first name, last name, country, state code.
columns = ["firstname", "lastname", "country", "state"]
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]
df = spark.createDataFrame(data, columns)
df.show()
# collect() brings every row back to the driver as a list.
print(df.collect())

In [None]:
from collections import OrderedDict

# Take the field at index 3 of every row and collect the values on the driver.
states1 = df.rdd.map(lambda row: row[3]).collect()
print(states1)

# Drop duplicates while keeping first-seen order.
res = [*OrderedDict.fromkeys(states1)]
print(res)

### Example 2

In [None]:
# Read the "state" field of every row by attribute name and collect the values.
states2 = df.rdd.map(lambda row: row.state).collect()
print(states2)
# ['CA', 'NY', 'CA', 'FL']

In [None]:
# Collect a one-column selection; the printed list holds Row objects,
# not bare strings.
states3=df.select(df.state).collect()
print(states3)

In [None]:
# Flatten each single-field Row so the result is a plain list of values.
states4 = (
    df.select(df.state)
    .rdd
    .flatMap(lambda row: row)
    .collect()
)
print(states4)

In [None]:
# Go through pandas: the "state" column of the converted frame is a Series.
states5 = df.select(df.state).toPandas()["state"]
print(states5)

# Turn that Series into a regular Python list.
states6 = list(states5)
print(states6)

In [None]:
# Convert two columns at once, then list each of them separately.
pandDF = df.select(df.state, df.firstname).toPandas()
print(list(pandDF["state"]))
print(list(pandDF["firstname"]))

## Date

In [None]:
from pyspark.sql.functions import col, expr

# (date string, number of months to add) pairs.
data = [("2019-01-23", 1), ("2019-06-24", 2), ("2019-09-20", 3)]

print(data)

# Parse the date string and shift it by "increment" months via a SQL expression.
(
    spark.createDataFrame(data)
    .toDF("date", "increment")
    .select(
        col("date"),
        col("increment"),
        expr("add_months(to_date(date,'yyyy-MM-dd'),cast(increment as int))").alias("inc_date"),
    )
    .show()
)

## Aggregate

In [None]:
# Aggregate functions used by the cells of this section.
# NOTE: importing sum, max and min from pyspark.sql.functions rebinds those
# names, so Python's built-in sum()/max()/min() are shadowed from here on.
from pyspark.sql.functions import approx_count_distinct, collect_list
from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness 
from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
from pyspark.sql.functions import variance, var_samp, var_pop

# NOTE(review): a session was already created in the setup cell; presumably
# getOrCreate() returns that existing session, in which case the new appName
# may not take effect - confirm.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

In [None]:
# (employee_name, department, salary) rows; the James/Sales/3000 row appears twice.
schema = ["employee_name", "department", "salary"]
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

df = spark.createDataFrame(simpleData, schema)
df.printSchema()
df.show(truncate=False)

In [None]:
# Approximate number of distinct salaries; [0][0] unwraps the single result Row.
print(
    "approx_count_distinct: "
    + str(df.select(approx_count_distinct("salary")).collect()[0][0])
)

In [None]:
# Average salary, unwrapped from the single result Row.
print(
    "avg: "
    + str(df.select(avg("salary")).collect()[0][0])
)

# All salary values gathered into one array (duplicates kept).
df.select(collect_list("salary")).show(truncate=False)

# Salary values gathered into one array with duplicates removed.
df.select(collect_set("salary")).show(truncate=False)

In [None]:
# Number of distinct (department, salary) combinations.
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
# collect() returns a list of Rows, so the whole list is printed here.
print(f"Distinct Count of Department & Salary: {df2.collect()!s}")

In [None]:
# Index [0] picks the first (and only) Row out of the collected list.
print(f"Distinct Count of Department & Salary: {df2.collect()[0]!s}")

In [None]:
# Index [0][0] picks the first field of the first Row, i.e. the bare count.
print(f"Distinct Count of Department & Salary: {df2.collect()[0][0]!s}")

In [None]:
# Count of salary values. [0][0] unwraps the single Row so the bare number is
# printed (with only [0] the output would read "count: Row(count(salary)=...)").
print("count: " + str(df.select(count("salary")).collect()[0][0]))

# One single-row result per aggregate function.
# NOTE: max, min and sum here are the pyspark.sql.functions versions imported
# in this section, not the Python built-ins.
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)

Nella teoria della probabilità e nella statistica, la distribuzione normale asimmetrica è una distribuzione di probabilità continua che generalizza la distribuzione normale per consentire l'asimmetria *(skewness)* diversa da zero.

In [None]:
# Skewness of the salary column, shown as a one-row result.
df.select(skewness("salary")).show(truncate=False)

La curtosi *(kurtosis)* è principalmente una misura per descrivere la forma di una distribuzione di probabilità e in particolare la sua "coda".

In [None]:
# Kurtosis of the salary column, shown as a one-row result.
df.select(kurtosis("salary")).show(truncate=False)

Le funzioni STDDEV_POP() e STDDEV_SAMP() calcolano rispettivamente la deviazione standard della popolazione e la deviazione standard del campione dei valori di input. (STDDEV() è un alias per STDDEV_SAMP().) Entrambe le funzioni valutano tutte le righe di input corrispondenti alla query. La differenza è che STDDEV_SAMP() viene ridimensionato di 1/(N-1) mentre STDDEV_POP() viene ridimensionato di 1/N.

In [None]:
# stddev, stddev_samp and stddev_pop of salary side by side in one row.
df.select(
    stddev("salary"),
    stddev_samp("salary"),
    stddev_pop("salary"),
).show(truncate=False)

In [None]:
# variance, var_samp and var_pop of salary side by side in one row.
df.select(
    variance("salary"),
    var_samp("salary"),
    var_pop("salary"),
).show(truncate=False)

## Array & String

In [None]:
# Each row: comma-separated name, list of languages, state code.
columns = ["name", "languagesAtSchool", "currentState"]
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], "NV"),
]

df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)

In [None]:
from pyspark.sql.functions import col, concat_ws

# Replace the array column with a single comma-joined string column.
df2 = df.withColumn(
    "languagesAtSchool",
    concat_ws(",", col("languagesAtSchool")),
)
df2.printSchema()
df2.show(truncate=False)

In [None]:
# Array-to-string conversion expressed in SQL against a temporary view.
df.createOrReplaceTempView("ARRAY_STRING")
spark.sql(
    "select name, concat_ws(',',languagesAtSchool) as languagesAtSchool,"
    " currentState from ARRAY_STRING"
).show(truncate=False)

## Array Type

In [None]:
from pyspark.sql.types import StringType, ArrayType, StructType, StructField

# Array-of-strings type declared with containsNull set to False.
arrayCol = ArrayType(StringType(), False)

# Each row: name, languages at school, languages at work, two state codes.
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV"),
]

# Explicit schema: one string, two string arrays, two strings - all nullable.
schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("languagesAtSchool", ArrayType(StringType()), True),
        StructField("languagesAtWork", ArrayType(StringType()), True),
        StructField("currentState", StringType(), True),
        StructField("previousState", StringType(), True),
    ]
)

df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()

In [None]:
from pyspark.sql.functions import explode

# One output row per element of languagesAtSchool, next to the name.
df.select(
    df.name,
    explode(df.languagesAtSchool),
).show()

In [None]:
from pyspark.sql.functions import split

# Split the comma-separated name string into an array column.
name_parts = split(df.name, ",")
df.select(name_parts.alias("nameAsArray")).show()

In [None]:
from pyspark.sql.functions import array

# Combine the two state columns into a single array column named "States".
df.select(
    df.name,
    array(df.currentState, df.previousState).alias("States"),
).show()

In [None]:
from pyspark.sql.functions import array_contains

# Boolean column telling whether "Java" is among languagesAtSchool.
df.select(
    df.name,
    array_contains(df.languagesAtSchool, "Java").alias("array_contains"),
).show()

In [None]:
# Show df itself; the select() results in this section were displayed
# directly and never assigned back to df.
df.show()

## Broadcast DataFrame

In [None]:
# Lookup table from state code to full state name, wrapped in a broadcast variable.
states = {"NY": "New York", "CA": "California", "FL": "Florida"}
broadcastStates = spark.sparkContext.broadcast(states)

# Sample people whose last field is a state code.
columns = ["firstname", "lastname", "country", "state"]
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)

In [None]:
def state_convert(code):
    """Return the value stored under ``code`` in the broadcast dictionary.

    Raises KeyError when ``code`` is not one of the dictionary's keys.
    """
    lookup = broadcastStates.value
    return lookup[code]

In [None]:
# Rebuild every row with its last field passed through state_convert(),
# then turn the RDD back into a DataFrame with the same column names.
result = (
    df.rdd
    .map(lambda row: (row[0], row[1], row[2], state_convert(row[3])))
    .toDF(columns)
)
result.show(truncate=False)

## Cast Column

In [None]:
# Sample rows: dates and booleans are supplied as strings.
# Note: the third jobStartDate is written in a different pattern from the other two.
columns = ["firstname", "age", "jobStartDate", "isGraduated", "gender", "salary"]
simpleData = [
    ("James", 34, "2006-01-01", "true", "M", 3000.60),
    ("Michael", 33, "1980-01-10", "true", "F", 3300.80),
    ("Robert", 37, "06-01-1992", "false", "M", 5000.50),
]

df = spark.createDataFrame(simpleData, columns)
df.printSchema()
df.show(truncate=False)

In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, BooleanType, DateType

# Re-type three columns: age -> string, isGraduated -> boolean, jobStartDate -> date.
df2 = (
    df.withColumn("age", col("age").cast(StringType()))
    .withColumn("isGraduated", col("isGraduated").cast(BooleanType()))
    .withColumn("jobStartDate", col("jobStartDate").cast(DateType()))
)
df2.printSchema()

In [None]:
# Casts written as SQL expressions; only the three listed columns are selected.
df3 = df2.selectExpr(
    "cast(age as int) age",
    "cast(isGraduated as string) isGraduated",
    "cast(jobStartDate as string) jobStartDate",
)
df3.printSchema()
df3.show(truncate=False)

In [None]:
# Register df3 as a temporary view and cast through SQL type-named functions.
df3.createOrReplaceTempView("CastExample")
df4 = spark.sql(
    "SELECT STRING(age),BOOLEAN(isGraduated),DATE(jobStartDate) from CastExample"
)
df4.printSchema()
df4.show(truncate=False)

## Change string --> double

In [None]:
from pyspark.sql.types import DoubleType, IntegerType

# Every field, including age and salary, is supplied as a string.
columns = ["firstname", "age", "isGraduated", "gender", "salary"]
simpleData = [
    ("James", "34", "true", "M", "3000.6089"),
    ("Michael", "33", "true", "F", "3300.8067"),
    ("Robert", "37", "false", "M", "5000.5034"),
]

df = spark.createDataFrame(simpleData, columns)
df.printSchema()
df.show(truncate=False)

In [None]:
# NOTE: importing round from pyspark.sql.functions rebinds the name, shadowing
# Python's built-in round() from here on.
from pyspark.sql.functions import col, round, expr

# Three ways to write the cast of salary to double; each prints the resulting schema.
# 1) column attribute + type name string
df.withColumn("salary", df.salary.cast("double")).printSchema()
# 2) column attribute + DataType instance
df.withColumn("salary", df.salary.cast(DoubleType())).printSchema()
# 3) col() reference + type name string
df.withColumn("salary", col("salary").cast("double")).printSchema()

In [None]:
# Cast written as a SQL expression; only the three listed columns are selected.
df.selectExpr("firstname","isGraduated","cast(salary as double) salary").printSchema()

In [None]:
# Register df under the view name "CastExample" (replacing any view already
# using that name) and cast salary with the SQL DOUBLE() function.
df.createOrReplaceTempView("CastExample")
spark.sql(
    "SELECT firstname,isGraduated,DOUBLE(salary) as salary from CastExample"
).printSchema()