In [None]:
# Create a Spark session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark_on_docker")
    .getOrCreate()
)

# Use 5 shuffle partitions for Spark SQL operations in this session.
spark.conf.set("spark.sql.shuffle.partitions", 5)

In [None]:
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""

from pyspark.sql import SparkSession

In [None]:
# getOrCreate() hands back an already-running session when one exists.
session_builder = SparkSession.builder.appName('SparkByExamples.com')
spark = session_builder.getOrCreate()

In [4]:
# Sample employee rows; each tuple follows the order of `columns`.
columns = ["firstname", "lastname", "gender", "salary"]
data = [
    ("James", "Smith", "M", 3000),
    ("Anna", "Rose", "F", 4100),
    ("Robert2", "Williams", "M", None),  # None salary is displayed as null
    ("Robert", "Williams", "M", 6200),
]

df = spark.createDataFrame(data, schema=columns)
df.show()


                                                                                

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  3000|
|     Anna|    Rose|     F|  4100|
|  Robert2|Williams|     M|  null|
|   Robert|Williams|     M|  6200|
+---------+--------+------+------+



In [None]:
# Print a marker when df has no column named 'salary1'.
missing_salary1 = 'salary1' not in df.columns
if missing_salary1:
    print("aa")

In [None]:
# Add a new constant column
from pyspark.sql.functions import lit

bonus_percent = lit(0.3)
df.withColumn("bonus_percent", bonus_percent).show()

In [None]:
# Add a column derived from an existing column
bonus_amount = df["salary"] * 0.3
df.withColumn("bonus_amount", bonus_amount).show()

In [None]:
# Add a column by concatenating existing columns
from pyspark.sql.functions import concat_ws

full_name = concat_ws(",", "firstname", "lastname")
df.withColumn("name", full_name).show()

In [None]:
# Add the current date as a new column
from pyspark.sql.functions import current_date

today = current_date()
df.withColumn("current_date", today).show()


In [None]:
# Print the schema of df (the DataFrame built from `data` / `columns`).
df.printSchema()

In [None]:
# Add the current date plus a salary-band "grade" column in one chained expression.
# `when` and `lit` are imported here so this cell runs on a fresh kernel;
# `when` was otherwise only imported by a later cell, giving a NameError on Run All.
from pyspark.sql.functions import current_date, lit, when

(
    df.withColumn("current_date", current_date())
    .withColumn(
        "grade",
        when(df.salary < 4000, lit("A"))
        .when((df.salary >= 4000) & (df.salary <= 5000), lit("B"))
        # Rows matching neither branch (including a null salary) get "C".
        .otherwise(lit("C")),
    )
    .show()
)

In [None]:
# Drop the temp view "df_view" (the name the SQL cells in this notebook register df under).
spark.catalog.dropTempView("df_view")

In [None]:
# ★ "when" function
from pyspark.sql.functions import when

# Name each salary band first, then assemble the conditional column.
is_low_band = df.salary < 4000
is_mid_band = (df.salary >= 4000) & (df.salary <= 5000)
grade_col = when(is_low_band, lit("A")).when(is_mid_band, lit("B")).otherwise(lit("C"))

df.withColumn("grade", grade_col).show()

In [None]:
# Register df as a temp view, read it back through SQL, then add the same two columns.
df.createOrReplaceTempView("df_view")
df_from_view = spark.sql("select * from df_view")

grade_from_salary = (
    when(df.salary < 4000, lit("A"))
    .when((df.salary >= 4000) & (df.salary <= 5000), lit("B"))
    .otherwise(lit("C"))
)

(
    df_from_view
    .withColumn("current_date", current_date())
    .withColumn("grade", grade_from_salary)
    .show()
)

In [5]:
# NULL check: keep only the rows that have a salary.
# Note: this rebinds `df`, so every following cell sees the filtered frame.
has_salary = df["salary"].isNotNull()
df = df.where(has_salary)

In [9]:
# ★ "when" function to UDF 
# CASE 1 

from pyspark.sql.functions import udf
from pyspark.sql.types import *

def salaryToGrade(value):
    """Map a salary to a grade letter.

    Args:
        value: Numeric salary, or None when the salary is missing.

    Returns:
        'A' if value < 4000, 'B' if 4000 <= value <= 5000, 'C' otherwise.
        None when value is None.
    """
    # A SQL null reaches a Python UDF as None, and None < 4000 raises TypeError,
    # so pass the missing value through as a null grade instead of crashing.
    if value is None:
        return None
    if value < 4000:
        return 'A'
    if 4000 <= value <= 5000:
        return 'B'
    return 'C'

# Wrap the Python function as a Spark UDF whose result is a string column.
udfsalaryToGrade = udf(salaryToGrade, returnType=StringType())

# Apply the UDF to the "salary" column and display the graded rows.
df_with_grade = df.withColumn(
    "grade",
    udfsalaryToGrade("salary"),
)
df_with_grade.show()

+---------+--------+------+------+-----+
|firstname|lastname|gender|salary|grade|
+---------+--------+------+------+-----+
|    James|   Smith|     M|  3000|    A|
|     Anna|    Rose|     F|  4100|    B|
|   Robert|Williams|     M|  6200|    C|
+---------+--------+------+------+-----+



In [10]:
# ★ "when" function to UDF 
# CASE 2 

from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import current_date

def salaryToGrade(value):
    """Map a salary to a grade letter.

    Args:
        value: Numeric salary, or None when the salary is missing.

    Returns:
        'A' if value < 4000, 'B' if 4000 <= value <= 5000, 'C' otherwise.
        None when value is None.
    """
    # A SQL null reaches a Python UDF as None, and None < 4000 raises TypeError,
    # so pass the missing value through as a null grade instead of crashing.
    if value is None:
        return None
    if value < 4000:
        return 'A'
    if 4000 <= value <= 5000:
        return 'B'
    return 'C'

# Wrap the Python function as a Spark UDF whose result is a string column.
udfsalaryToGrade = udf(salaryToGrade, returnType=StringType())

# Apply the UDF to rows read back through the "df_view" temp view.
df.createOrReplaceTempView("df_view")
df_from_view = spark.sql("select * from df_view")

(
    df_from_view
    .withColumn("current_date", current_date())
    .withColumn("grade", udfsalaryToGrade("salary"))
    .show()
)

+---------+--------+------+------+------------+-----+
|firstname|lastname|gender|salary|current_date|grade|
+---------+--------+------+------+------------+-----+
|    James|   Smith|     M|  3000|  2022-03-14|    A|
|     Anna|    Rose|     F|  4100|  2022-03-14|    B|
|   Robert|Williams|     M|  6200|  2022-03-14|    C|
+---------+--------+------+------+------------+-----+



In [None]:
# Add columns using select
base_cols = ["firstname", "salary"]

df.select(*base_cols, lit(0.3).alias("bonus")).show()
# df.salary * 0.3 is already a Column, so it can be aliased directly.
df.select(*base_cols, (df.salary * 0.3).alias("bonus_amount")).show()
df.select(*base_cols, current_date().alias("today_date")).show()

# Add columns using SQL
df.createOrReplaceTempView("PER")

# Numeric literal rather than the string '0.3', so "bonus" is a number
# like the lit(0.3) column in the DataFrame version.
spark.sql("select firstname,salary, 0.3 as bonus from PER").show()
spark.sql("select firstname,salary, salary * 0.3 as bonus_amount from PER").show()
spark.sql("select firstname,salary, current_date() as today_date from PER").show()

# Searched CASE ("case when <condition>"). The simple form
# "case salary when salary < 4000" would compare salary for equality with the
# boolean result of the condition instead of testing the condition itself.
spark.sql(
    "select firstname,salary, "
    "case when salary < 4000 then 'A' "
    "else 'B' END as grade from PER"
).show()