<a href="https://colab.research.google.com/github/pstorniolo/Master2021/blob/main/2021_10_30_Spark_Examples_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Spark 3.2.0 - JDK11
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!rm -f *.tgz

import os
os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

#Install findspark using pip to make pyspark importable as regular library
!pip -q install findspark
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .enableHiveSupport() \
    .getOrCreate()

sc = spark.sparkContext
sql = spark.sql

print("\nApache Spark version: ", spark.version)

Reference: [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html)

# Column Functions

In [None]:
# Sample people records: (first name, last name, id stored as text, gender).
# The rows deliberately include a missing gender, an empty-string gender
# and a missing last name.
data = [
    ("James", "Bond", "100", None),
    ("Ann", "Varsa", "200", "F"),
    ("Tom Cruise", "XXX", "400", ""),
    ("Tom Brand", None, "400", "M"),
]
columns = ["fname", "lname", "id", "gender"]
df = spark.createDataFrame(data, schema=columns)

# Display the rows, then the schema.
df.show()
df.printSchema()

## alias

In [None]:
from pyspark.sql.functions import expr

# Rename the two name columns and add a third built from a SQL expression
# that joins them with a comma.
df2 = df.select(
    df["fname"].alias("first_name"),
    df["lname"].alias("last_name"),
    expr(" fname ||','|| lname").alias("fullName"),
)

In [None]:
# Display the contents of df2.
df2.show()

## asc, desc


In [None]:
# Order the rows by first name, ascending and then descending.
df.sort(df["fname"].asc()).show()
df.sort(df["fname"].desc()).show()

## cast


In [None]:
# Cast the text id column to an integer and inspect the resulting schema.
df.select(df["fname"], df["id"].cast("int")).printSchema()

In [None]:
# Same projection as the schema check, this time displaying the rows.
df.select(df["fname"], df["id"].cast("int")).show()

## between


In [None]:
# `id` holds digit strings, so cast it to int explicitly before the numeric
# range test instead of comparing a string column with integer bounds.
# between() includes both bounds.
df.filter(df.id.cast("int").between(100, 300)).show()

## contains


In [None]:
# Keep the rows whose first name contains the substring "Cruise".
df.filter(df["fname"].contains("Cruise")).show()

## startswith, endswith


In [None]:
# Prefix match, then suffix match, on the first name.
df.filter(df["fname"].startswith("T")).show()
df.filter(df["fname"].endswith("Cruise")).show()

## isNull & isNotNull


In [None]:
# Rows with a missing last name, followed by rows that have one.
df.filter(df["lname"].isNull()).show()
df.filter(df["lname"].isNotNull()).show()

## like, rlike


In [None]:
# SQL LIKE pattern: keep first names ending in "nn".
df.select(df["fname"], df["lname"], df["id"]).filter(df["fname"].like("%nn")).show()

## substr


In [None]:
# substr(1, 2): two characters of the first name, starting at position 1.
df.select(df["fname"].substr(1, 2).alias("substr")).show()

## when & otherwise


In [None]:
from pyspark.sql.functions import when

df.show()
# Map gender codes to labels. A missing gender has to be detected with
# isNull(): `df.gender == None` evaluates to NULL for every row, so that
# branch never matched and NULL genders fell through to otherwise()
# unchanged instead of becoming "".
df.select(
    df.fname,
    df.lname,
    when(df.gender == "M", "Male")
    .when(df.gender == "F", "Female")
    .when(df.gender.isNull(), "")
    .otherwise(df.gender)
    .alias("new_gender"),
).show()

## isin


In [None]:
# Keep only the rows whose id is one of the listed values.
li = ["100", "200"]
df.select(df["fname"], df["lname"], df["id"]).filter(df["id"].isin(li)).show()

---

In [None]:
from pyspark.sql.types import StructType,StructField,StringType,ArrayType,MapType

# Each row: (name pair, list of languages, dict of physical properties).
data = [
    (("James", "Bond"), ["Java", "C#"], {"hair": "black", "eye": "brown"}),
    (("Ann", "Varsa"), [".NET", "Python"], {"hair": "brown", "eye": "black"}),
    (("Tom Cruise", ""), ["Python", "Scala"], {"hair": "red", "eye": "grey"}),
    (("Tom Brand", None), ["Perl", "Ruby"], {"hair": "black", "eye": "blue"}),
]

# Explicit schema: a nested struct, an array of strings and a
# string-to-string map.
schema = StructType(
    [
        StructField(
            "name",
            StructType(
                [
                    StructField("fname", StringType(), True),
                    StructField("lname", StringType(), True),
                ]
            ),
        ),
        StructField("languages", ArrayType(StringType()), True),
        StructField("properties", MapType(StringType(), StringType()), True),
    ]
)
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()

In [None]:
# getItem(): position 1 of the array column, then the "hair" key of the map column.
df.select(df["languages"].getItem(1)).show()

df.select(df["properties"].getItem("hair")).show()

In [None]:
# getField() and getItem() applied to the same key of the map column.
df.select(df["properties"].getField("hair")).show()

df.select(df["properties"].getItem("hair")).show()

In [None]:
# getField() on the map column, then on the name struct.
df.select(df["properties"].getField("hair")).show()

df.select(df["name"].getField("fname")).show()

In [None]:
from pyspark.sql.functions import col

# dropFields(): add column name1, the name struct without its fname field.
df.withColumn("name1", col("name").dropFields("fname")).show()

In [None]:
from pyspark.sql.functions import lit

# withField(): set name.fname to the literal "AA" in every row.
df.withColumn("name", df["name"].withField("fname", lit("AA"))).show()

In [None]:
from pyspark.sql import Row
from pyspark.sql.functions import lit

# A single-row frame whose only column `a` is a struct with fields b and c.
df = spark.createDataFrame([Row(a=Row(b=1, c=2))])
df.show()

# Set a.b to the literal 3 and display just that field.
df.withColumn("a", df.a.withField("b", lit(3))).select("a.b").show()

In [None]:
from pyspark.sql import Row
from pyspark.sql.functions import col, lit

# One row with a struct `a` that itself contains a nested struct `e`.
df = spark.createDataFrame(
    [Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))]
)
df.show()

# Remove field b from the struct.
df.withColumn("a", df.a.dropFields("b")).show()

---

# Column Operations

In [None]:
# Build a two-column frame whose first column name contains a dot.
# NOTE(review): the second column is named "gender" but holds the integers
# 23 and 40 — presumably it was meant to be "age". Later cells select
# `gender`, so renaming it means updating them as well.
data=[("James",23),("Ann",40)]
df=spark.createDataFrame(data).toDF("name.fname","gender")
df.printSchema()
df.show()

In [None]:
from pyspark.sql.functions import col

# The dotted column name is wrapped in backticks wherever it is referenced.
df.select(col("`name.fname`")).show()
df.select(df["`name.fname`"]).show()
df.withColumn("new_col", col("`name.fname`").substr(1, 2)).show()
df.filter(col("`name.fname`").startswith("J")).show()

# Rename every column, replacing dots with underscores.
new_cols = [name.replace('.', '_') for name in df.columns]
df2 = df.toDF(*new_cols)
df2.show()

## Using DataFrame object


In [None]:
# Two spellings of the same column reference: attribute access on the
# DataFrame, then indexing it by column name.
df.select(df.gender).show()
df.select(df["gender"]).show()

## Accessing column name with dot (with backticks)


In [None]:
# The column name contains a dot, so it is wrapped in backticks inside the string.
df.select(df["`name.fname`"]).show()

## Using SQL col() function


In [None]:
from pyspark.sql.functions import col
# Reference a column by name with col() instead of going through the DataFrame.
df.select(col("gender")).show()

# Accessing a column name that contains a dot (wrapped in backticks)
df.select(col("`name.fname`")).show()

## Access struct column


In [None]:
# Two people, each with a nested `prop` struct holding hair and eye colour.
data = [
    Row(name="James", prop=Row(hair="black", eye="blue")),
    Row(name="Ann", prop=Row(hair="grey", eye="black")),
]
df = spark.createDataFrame(data)
df.printSchema()
df.show()

In [None]:
# Struct field reached through attribute access on the DataFrame.
df.select(df.prop.hair).show()
df.select(df.name,df.prop.hair).show()

# The same field reached through a dotted path, by indexing the DataFrame
# and via col().
df.select(df["prop.hair"]).show()
df.select(col("prop.hair")).show()

# "prop.*" expands the struct's fields into separate columns.
df.select(col("prop.*")).show()
df.select(df.name,col("prop.*")).show()

## Column operators


In [None]:
# Three integer columns to exercise the arithmetic operators.
data = [(100, 2, 1), (200, 3, 4), (300, 4, 4)]
df = spark.createDataFrame(data).toDF("col1", "col2", "col3")
df.show()

# Sum, difference, product, quotient and remainder of col1 and col2.
df.select(df["col1"] + df["col2"]).show()
df.select(df["col1"] - df["col2"]).show()
df.select(df["col1"] * df["col2"]).show()
df.select(df["col1"] / df["col2"]).show()
df.select(df["col1"] % df["col2"]).show()

In [None]:
df.show()

# Row-wise comparisons of col2 against col3: greater than, less than, equal.
df.select(df["col2"] > df["col3"]).show()
df.select(df["col2"] < df["col3"]).show()
df.select(df["col2"] == df["col3"]).show()