# C6 : 6. Working with Different Types of Data



In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('c6').getOrCreate()
!netstat -anp |grep 4040 |grep LISTEN # check if the session is created and login to Spark console in localhost:4040 

tcp6       0      0 :::4040                 :::*                    LISTEN      21257/java          


In [2]:
# Read CSV 
df = spark.read.load(path = "/root/golive/Spark-The-Definitive-Guide/data/retail-data/by-day/2010-12-01.csv"\
                     ,format = "csv"\
                     ,header = "True"\
                     ,inferschema = "True"
                     )

In [3]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [4]:
df.createOrReplaceTempView('T1')


In [5]:
df.show(2) 

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [6]:
sql = " select * from T1 Limit 2"
spark.sql(sql).show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+



In [7]:
# Literals lit function 
from pyspark.sql.functions import lit , expr 
df.withColumn("new_col",lit(5)).show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|new_col|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|      5|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|      5|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
only showing top 2 rows



In [8]:
df.withColumn("new_col",lit("some random String")).show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|           new_col|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|some random String|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|some random String|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
only showing top 2 rows



In [9]:
#https://intellipaat.com/community/12495/pyspark-withcolumn-with-two-conditions-and-three-outcomes
from pyspark.sql.functions import col, expr, when

newcolumn1 = expr("""IF (Quantity > 2 and UnitPrice > 3 , 0 ,100)""")
df.withColumn("new_col",newcolumn1).show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|new_col|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|    100|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|      0|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|    100|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|      0|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|      0|
+---------+---------+--------------------+--------+-------------------+-

In [10]:
newcolumn2 = when(col("Quantity") > 2 ,0 ).when(col("UnitPrice") >3 ,0).otherwise(100)
#newcolumn3 = when(col("Quantity") > 2  &  col("UnitPrice") > 3 ,0).otherwise(100)
df.withColumn("new_col",newcolumn2 ).show(2)


+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|new_col|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|      0|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|      0|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-------+
only showing top 2 rows



# BOOLEAN 

In [11]:
df.select(col("InvoiceNo"), col("Description"))\
.where(col("InvoiceNo") == 536365)\
.show(5,False)



+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [12]:
#SELECT * FROM dfTable WHERE StockCode in (" DOT") AND( UnitPrice > 600 OR instr( Description, "POSTAGE") > = 1)

sql = 'select * from t1 where StockCode in ("DOT") and (UnitPrice > 600 or Description like "%POSTAGE%")'
spark.sql(sql).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [13]:
from pyspark.sql.functions import instr 

filter1= col("UnitPrice") > 600 
filter2= instr(col("Description"),"POSTAGE") >= 1 

df.where(col("StockCode") == "DOT")\
.where (filter1 | filter2).show()
#.where (col("UnitPrice") > 600 |  instr(col("Description") ,"POSTAGE" )  >= 1 )

        
        
#priceFilter = col("UnitPrice") > 600 
#descripFilter = instr( df.Description, "POSTAGE") >= 1 
#df.where( df.StockCode.isin(" DOT")).where( priceFilter | descripFilter).show()



+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [14]:
# Numerical calculation  

sql = "select  CustomerID, power(quantity* UnitPrice ,2) as RealQuant from t1 "
spark.sql(sql).show(2)


+----------+------------------+
|CustomerID|         RealQuant|
+----------+------------------+
|   17850.0|234.08999999999997|
|   17850.0|          413.7156|
+----------+------------------+
only showing top 2 rows



In [15]:
from pyspark.sql.functions import pow

RealQuant = pow(col("Quantity") * col("UnitPrice"),2)
df.withColumn("RealQuant",RealQuant).select("CustomerID",RealQuant).show(2)

+----------+----------------------------------+
|CustomerID|POWER((Quantity * UnitPrice), 2.0)|
+----------+----------------------------------+
|   17850.0|                234.08999999999997|
|   17850.0|                          413.7156|
+----------+----------------------------------+
only showing top 2 rows



In [16]:
# Rounding Integer 
from pyspark.sql.functions import round, bround

df.select(col("UnitPrice"), round(col("UnitPrice"))).show(10)

+---------+-------------------+
|UnitPrice|round(UnitPrice, 0)|
+---------+-------------------+
|     2.55|                3.0|
|     3.39|                3.0|
|     2.75|                3.0|
|     3.39|                3.0|
|     3.39|                3.0|
|     7.65|                8.0|
|     4.25|                4.0|
|     1.85|                2.0|
|     1.85|                2.0|
|     1.69|                2.0|
+---------+-------------------+
only showing top 10 rows



In [17]:
df.select(col("UnitPrice"), bround(col("UnitPrice"))).show(10)

+---------+--------------------+
|UnitPrice|bround(UnitPrice, 0)|
+---------+--------------------+
|     2.55|                 3.0|
|     3.39|                 3.0|
|     2.75|                 3.0|
|     3.39|                 3.0|
|     3.39|                 3.0|
|     7.65|                 8.0|
|     4.25|                 4.0|
|     1.85|                 2.0|
|     1.85|                 2.0|
|     1.69|                 2.0|
+---------+--------------------+
only showing top 10 rows



In [18]:
from pyspark.sql.functions import monotonically_increasing_id,max,mean,least,stddev,rank
df.select(mean(col("UnitPrice"))).show()

+-----------------+
|   avg(UnitPrice)|
+-----------------+
|4.151946589446603|
+-----------------+



# Strings Manupulation 

In [19]:
# Task 1 : All srings upper case 

sql = "select Country from t1"
sql = "select upper(Country) from t1"
spark.sql(sql).show(3)


+--------------+
|upper(Country)|
+--------------+
|UNITED KINGDOM|
|UNITED KINGDOM|
|UNITED KINGDOM|
+--------------+
only showing top 3 rows



In [20]:
# you need upper - Look for it in funtions 
from pyspark.sql.functions import upper
df.select(upper (col("country"))).show(3)

# NOTES 
# upper and initcap - same functionality 
# lower 

+--------------+
|upper(country)|
+--------------+
|UNITED KINGDOM|
|UNITED KINGDOM|
|UNITED KINGDOM|
+--------------+
only showing top 3 rows



In [21]:
# string functions 

sql = "SELECT ltrim(' HELLLOOOO ')\
,rtrim(' HELLLOOOO ')\
,trim(' HELLLOOOO ')\
,lpad(' HELLOOOO ', 20, '*')\
,rpad(' HELLOOOO ', 20, '*')\
FROM t1"

spark.sql(sql).show(1)





+------------------+------------------+-----------------+-----------------------+-----------------------+
|ltrim( HELLLOOOO )|rtrim( HELLLOOOO )|trim( HELLLOOOO )|lpad( HELLOOOO , 20, *)|rpad( HELLOOOO , 20, *)|
+------------------+------------------+-----------------+-----------------------+-----------------------+
|        HELLLOOOO |         HELLLOOOO|        HELLLOOOO|   ********** HELLOOOO |    HELLOOOO **********|
+------------------+------------------+-----------------+-----------------------+-----------------------+
only showing top 1 row



In [22]:
# String functions regex_replace regex_extract 
# Example regex_replace 
from pyspark.sql.functions import regexp_replace , regexp_extract

df.withColumn("new_desc" ,regexp_replace(col("Description"),"RED|BLACK|WHITE|GREEN|BLUE","COLOR"))\
.select(col("Description"),col("new_desc"))\
.show(10,False)

+-----------------------------------+-----------------------------------+
|Description                        |new_desc                           |
+-----------------------------------+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |COLOR HANGING HEART T-LIGHT HOLDER |
|WHITE METAL LANTERN                |COLOR METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER     |CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE|KNITTED UNION FLAG HOT WATER BOTTLE|
|RED WOOLLY HOTTIE WHITE HEART.     |COLOR WOOLLY HOTTIE COLOR HEART.   |
|SET 7 BABUSHKA NESTING BOXES       |SET 7 BABUSHKA NESTING BOXES       |
|GLASS STAR FROSTED T-LIGHT HOLDER  |GLASS STAR FROSTED T-LIGHT HOLDER  |
|HAND WARMER UNION JACK             |HAND WARMER UNION JACK             |
|HAND WARMER RED POLKA DOT          |HAND WARMER COLOR POLKA DOT        |
|ASSORTED COLOUR BIRD ORNAMENT      |ASSORTED COLOUR BIRD ORNAMENT      |
+-----------------------------------+-

In [23]:
# Function :translate 
# Literal Translation from one word to another . Good for data cleaning in case of special charecters 
# eg ABCD to 1234 (A-->1 , B--> 2 , C--> 3 and D-->4)
from pyspark.sql.functions import translate 

sql = " select Description , translate (Description ,'ABCD','1234') as new_translate from t1"
spark.sql(sql).show(2,False)


+----------------------------------+----------------------------------+
|Description                       |new_translate                     |
+----------------------------------+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|WHITE H1NGING HE1RT T-LIGHT HOL4ER|
|WHITE METAL LANTERN               |WHITE MET1L L1NTERN               |
+----------------------------------+----------------------------------+
only showing top 2 rows



In [24]:
df.withColumn("new_translate",translate (col("Description"),'ABCD','1234'))\
.select(col("Description"),col("new_translate"))\
.show(2,False)

+----------------------------------+----------------------------------+
|Description                       |new_translate                     |
+----------------------------------+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|WHITE H1NGING HE1RT T-LIGHT HOL4ER|
|WHITE METAL LANTERN               |WHITE MET1L L1NTERN               |
+----------------------------------+----------------------------------+
only showing top 2 rows



In [25]:
# Function : regexp_extract 
# E.g - Extract the first occurance of a string 
sql = "select Description ,regexp_extract(Description,'(WHITE|BLACK|RED|GREEN)',1) as new_extract from t1 "
df_ext = spark.sql(sql)
df_ext.printSchema()
df_ext.show(5)


root
 |-- Description: string (nullable = true)
 |-- new_extract: string (nullable = true)

+--------------------+-----------+
|         Description|new_extract|
+--------------------+-----------+
|WHITE HANGING HEA...|      WHITE|
| WHITE METAL LANTERN|      WHITE|
|CREAM CUPID HEART...|           |
|KNITTED UNION FLA...|           |
|RED WOOLLY HOTTIE...|        RED|
+--------------------+-----------+
only showing top 5 rows



In [26]:
extract_string = "(WHITE|BLACK|RED|GREEN)"
df.withColumn("new_extract"\
              ,regexp_extract(col("Description"),extract_string,1))\
              .select(col("Description"),col("new_extract")).show(5)                                                  

+--------------------+-----------+
|         Description|new_extract|
+--------------------+-----------+
|WHITE HANGING HEA...|      WHITE|
| WHITE METAL LANTERN|      WHITE|
|CREAM CUPID HEART...|           |
|KNITTED UNION FLA...|           |
|RED WOOLLY HOTTIE...|        RED|
+--------------------+-----------+
only showing top 5 rows



In [27]:
# Function : instr 
# Requirement to check if string exists
#instr returns the location of the word in the string (19 below represents the 19th word in the line)

sql = "select Description , instr(Description,'WHITE') as is_white from t1"
spark.sql(sql).show(5,False)

sql = "select Description , instr(Description,'WHITE') >=1  as is_white from t1"
spark.sql(sql).show(5,False)

+-----------------------------------+--------+
|Description                        |is_white|
+-----------------------------------+--------+
|WHITE HANGING HEART T-LIGHT HOLDER |1       |
|WHITE METAL LANTERN                |1       |
|CREAM CUPID HEARTS COAT HANGER     |0       |
|KNITTED UNION FLAG HOT WATER BOTTLE|0       |
|RED WOOLLY HOTTIE WHITE HEART.     |19      |
+-----------------------------------+--------+
only showing top 5 rows

+-----------------------------------+--------+
|Description                        |is_white|
+-----------------------------------+--------+
|WHITE HANGING HEART T-LIGHT HOLDER |true    |
|WHITE METAL LANTERN                |true    |
|CREAM CUPID HEARTS COAT HANGER     |false   |
|KNITTED UNION FLAG HOT WATER BOTTLE|false   |
|RED WOOLLY HOTTIE WHITE HEART.     |true    |
+-----------------------------------+--------+
only showing top 5 rows



In [28]:
df.selectExpr("Description","(instr(Description,'WHITE') >=1) as is_white").show(5)

+--------------------+--------+
|         Description|is_white|
+--------------------+--------+
|WHITE HANGING HEA...|    true|
| WHITE METAL LANTERN|    true|
|CREAM CUPID HEART...|   false|
|KNITTED UNION FLA...|   false|
|RED WOOLLY HOTTIE...|    true|
+--------------------+--------+
only showing top 5 rows



In [29]:
from pyspark.sql.functions import expr, locate 
simpleColors = [" black", "white", "red", "green", "blue"]

def color_locator( column, color_string): 
    return locate( color_string.upper(), column).cast("boolean").alias("is_" + color_string)

selectedColumns = [color_locator( df.Description, c) for c in simpleColors] 
selectedColumns.append( expr("*")) # has to a be Column type 

df.select(*selectedColumns).show(2)
df.select(*selectedColumns).where(expr("is_white or is_red")).select("Description").show(3,False)





+---------+--------+------+--------+-------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|is_ black|is_white|is_red|is_green|is_blue|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+--------+------+--------+-------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|    false|    true| false|   false|  false|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|    false|    true| false|   false|  false|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+--------+------+--------+-------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows

+----------------------------------+
|Description                    

# Date and TIMESTAMPS 

In [30]:
from pyspark.sql.functions import current_date,current_timestamp
date_df=spark.range(10)\
.withColumn("date" ,current_date())\
.withColumn("time",current_timestamp())


In [31]:
date_df.createOrReplaceTempView("date_table")

In [32]:
# ADD and Subtract dates 
sql = "select date_add(date ,5) as future_date , date_sub(date,5) as past_date from date_table"
spark.sql(sql).show(5)

+-----------+----------+
|future_date| past_date|
+-----------+----------+
| 2020-08-14|2020-08-04|
| 2020-08-14|2020-08-04|
| 2020-08-14|2020-08-04|
| 2020-08-14|2020-08-04|
| 2020-08-14|2020-08-04|
+-----------+----------+
only showing top 5 rows



In [33]:
from pyspark.sql.functions import date_add,date_sub
date_df.withColumn("future_date",date_add(col("date") ,5))\
       .withColumn("past_date",date_sub(col("date"),5))\
       .select(col("future_date"),col("past_date"))\
       .show(2,False)

+-----------+----------+
|future_date|past_date |
+-----------+----------+
|2020-08-14 |2020-08-04|
|2020-08-14 |2020-08-04|
+-----------+----------+
only showing top 2 rows



In [34]:
from pyspark.sql.functions import datediff ,months_between,to_date

sql = "select to_date('2020-06-25') as today_Date ,datediff('2020-06-25','2019-06-25') as date_diff  , months_between('2020-06-25','2019-06-25') as months_bet from date_table "
spark.sql(sql).show(1)

+----------+---------+----------+
|today_Date|date_diff|months_bet|
+----------+---------+----------+
|2020-06-25|      366|      12.0|
+----------+---------+----------+
only showing top 1 row



In [35]:
date_df.withColumn("today_Date",to_date(lit("2020-06-25")))\
       .withColumn("date_diff",datediff(to_date(lit("2020-06-25")) , to_date(lit("2019-06-25"))))\
       .withColumn("months_bet", months_between(to_date(lit("2020-06-25")) , to_date(lit("2019-06-25"))))\
       .show(1,False)
    

+---+----------+-----------------------+----------+---------+----------+
|id |date      |time                   |today_Date|date_diff|months_bet|
+---+----------+-----------------------+----------+---------+----------+
|0  |2020-08-09|2020-08-09 18:09:26.102|2020-06-25|366      |12.0      |
+---+----------+-----------------------+----------+---------+----------+
only showing top 1 row



In [36]:
# To ensure Data format is correct define Dataformat 
DATEFORMAT = 'yyyy-dd-MM'   # only the month is in capitals - Spark gave an error when i typed YYYY insted of yyyy


In [37]:
date_df.withColumn("today_Date", to_date(lit("2020-25-06"),DATEFORMAT))\
       .withColumn("test_date" , to_date(lit("2020-06-25"),DATEFORMAT))\
       .show(2)
       
# TEST_DATE Column returns null - wrong format      

+---+----------+--------------------+----------+---------+
| id|      date|                time|today_Date|test_date|
+---+----------+--------------------+----------+---------+
|  0|2020-08-09|2020-08-09 18:09:...|2020-06-25|     null|
|  1|2020-08-09|2020-08-09 18:09:...|2020-06-25|     null|
+---+----------+--------------------+----------+---------+
only showing top 2 rows



 # Handling NULL    
 # Spark dosen't enforce null constraint - Needs to be handled. 

In [38]:
 #Functions :
    # Coalesce - Returns first not null 
    # ifnull - Returns the second , if the first value is null  
    # nullif - Returns NULL if two values are equal 
    # nvl - Returns second value if first is null 
    # nvl2 - Returns first not null --> second . First value == null , 3rd Value 
    
sql = """ select 
coalesce( null , 1 ) coalesce_1,  coalesce( null , null, 2 ) coalesce_2 ,
ifnull(null , 1) ifnull_1 ,  ifnull(2 , 3) ifnull_2,
nullif(1,1) nullif_1 , nullif(1,2) nullif_2, 
nvl (null , 1) nvl1 , nvl(2,null ) nvl2,
 nvl2('x' ,1,2 ) nvl2_1, nvl2(null , 1,2) nvl2_2
from t1 """
spark.sql(sql).show(1)
    

+----------+----------+--------+--------+--------+--------+----+----+------+------+
|coalesce_1|coalesce_2|ifnull_1|ifnull_2|nullif_1|nullif_2|nvl1|nvl2|nvl2_1|nvl2_2|
+----------+----------+--------+--------+--------+--------+----+----+------+------+
|         1|         2|       1|       2|    null|       1|   1|   2|     1|     2|
+----------+----------+--------+--------+--------+--------+----+----+------+------+
only showing top 1 row



In [39]:
df.na.drop() # drops the row if any of the column has null value  --> Defaults to df.na.drop('any')
df.na.drop('all') # drops the row only if all values are null 

# sql 
sql = "select * from t1 where description is not null " # needs to handled at individual column in where clause "
list_col = ['StockCode','InvoiceNo']
df.na.drop("all",subset = list_col).show(2)


+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [40]:
df.na.fill(5, subset = list_col) # for the subset columns all null values will be filled with value 5 

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

In [41]:
df.show(2)
df.orderBy(col("InvoiceNo").desc()).show(2)
df.orderBy(col("InvoiceNo").asc(), col("Quantity").desc()).show(2)
df.orderBy(col("InvoiceNo").asc(), col("Quantity").desc()).show(2)
df.orderBy(col("Description").asc_nulls_first(), col("Quantity").desc()).show(2)
df.orderBy(col("Description").desc_nulls_first(), col("Quantity").desc()).show(2)
df.orderBy(col("Description").desc_nulls_last(), col("Quantity").desc()).show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows

+---------+---------+--------------------+--------+-------------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+-------+
|  C536548|    22244|3 H

# COMPLEX DATA TYPES 

In [42]:
# STRUCT TYPE : Dataframe inside Dataframe 
from pyspark.sql.functions import struct 
struct_df = df.select(struct(col("InvoiceNo"), col("StockCode")).alias("struct_col"))
struct_df.show(3)


+----------------+
|      struct_col|
+----------------+
|[536365, 85123A]|
| [536365, 71053]|
|[536365, 84406B]|
+----------------+
only showing top 3 rows



In [43]:
struct_df.select(col("struct_col.InvoiceNo")).show(2)

+---------+
|InvoiceNo|
+---------+
|   536365|
|   536365|
+---------+
only showing top 2 rows



In [44]:
# function split 

sql = """select split(Description,' ') from t1"""
spark.sql(sql ).show(2)

#split will breakdown into list of arrays 

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
+-------------------------+
only showing top 2 rows



In [45]:
sql = """select split(Description,' ')[0] from t1"""
spark.sql(sql ).show(2)

+----------------------------+
|split(Description,  , -1)[0]|
+----------------------------+
|                       WHITE|
|                       WHITE|
+----------------------------+
only showing top 2 rows



In [46]:
from pyspark.sql.functions import split 
df.select(split(col("Description"), " ")\
.alias("split_col"))\
.selectExpr("split_col[0]")\
.show(3)






+------------+
|split_col[0]|
+------------+
|       WHITE|
|       WHITE|
|       CREAM|
+------------+
only showing top 3 rows



In [47]:
# Functions for arrays - 
# 1. Size 
# 2. Array_contains 
from pyspark.sql.functions import size , array_contains 

sql = """ 
select size(split(Description , " "))  as size_split_arr from t1 
"""
spark.sql(sql).show(2)

sql = """
select array_contains(split(Description , " "), "WHITE") arr_contains from t1
"""
spark.sql(sql).show(3)

+--------------+
|size_split_arr|
+--------------+
|             5|
|             3|
+--------------+
only showing top 2 rows

+------------+
|arr_contains|
+------------+
|        true|
|        true|
|       false|
+------------+
only showing top 3 rows



In [48]:
df.select(array_contains(split(col("Description")," "),"WHITE")\
.alias("arr_contains"))\
.show(3)

+------------+
|arr_contains|
+------------+
|        true|
|        true|
|       false|
+------------+
only showing top 3 rows



In [49]:
# Function : exploded ( splits coolum with array data into indiviual rows . Other columns gets duplicated )
from pyspark.sql.functions import explode 
df.select(split(col("Description")," ")\
          .alias("splitColumn"))\
          .select(explode(col("splitColumn")))\
          .show(10)

+-------+
|    col|
+-------+
|  WHITE|
|HANGING|
|  HEART|
|T-LIGHT|
| HOLDER|
|  WHITE|
|  METAL|
|LANTERN|
|  CREAM|
|  CUPID|
+-------+
only showing top 10 rows



In [50]:
# https://docs.databricks.com/spark/latest/spark-sql/language-manual/select.html 
# for SQL syntax 

sql = """
SELECT Description, InvoiceNo, exploded 
FROM (SELECT *, split( Description, " ") as splitted 
      FROM t1) as v1 
      LATERAL VIEW explode(v1.splitted) as exploded
"""
spark.sql(sql).show(10)



+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
|WHITE HANGING HEA...|   536365|   HEART|
|WHITE HANGING HEA...|   536365| T-LIGHT|
|WHITE HANGING HEA...|   536365|  HOLDER|
| WHITE METAL LANTERN|   536365|   WHITE|
| WHITE METAL LANTERN|   536365|   METAL|
| WHITE METAL LANTERN|   536365| LANTERN|
|CREAM CUPID HEART...|   536365|   CREAM|
|CREAM CUPID HEART...|   536365|   CUPID|
+--------------------+---------+--------+
only showing top 10 rows



In [51]:
# function MAPS 
# https://medium.com/@mrpowers/working-with-spark-arraytype-and-maptype-columns-4d85f3c8b2b3 
    

In [52]:
# Working with JSON 

jsonDF = spark.range(1).selectExpr(""" '{" myJSONKey" : {" myJSONValue" : [1, 2, 3]}}' as jsonString""")
jsonDF.show(1, False )

from pyspark.sql.functions import get_json_object, json_tuple 
jsonDF.select( get_json_object( col("jsonString"), "$.myJSONKey.myJSONValue[0]").alias("column"), json_tuple( col("jsonString"), "myJSONKey")).show( 2)




+---------------------------------------------+
|jsonString                                   |
+---------------------------------------------+
|{" myJSONKey" : {" myJSONValue" : [1, 2, 3]}}|
+---------------------------------------------+

+------+----+
|column|  c0|
+------+----+
|  null|null|
+------+----+



# UDF 
    

In [53]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def function1(x):
    return x*x*x

df1 =spark.range(1,100)
df1.printSchema()

spark.udf.register("reg_f",function1,returnType=IntegerType())




root
 |-- id: long (nullable = false)



<function __main__.function1(x)>

In [54]:
df1.createOrReplaceTempView("t2")

In [55]:
sql = "select id from t2"
spark.sql(sql).show(5)

sql = "select reg_f(id) from t2 "
spark.sql(sql).show(5)

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
+---+
only showing top 5 rows

+---------+
|reg_f(id)|
+---------+
|        1|
|        8|
|       27|
|       64|
|      125|
+---------+
only showing top 5 rows



In [56]:
data = [[1,2], [3,5],[4,4],[3,3],[2, None] ]
columns = ["a" ,"b"]
num_df = spark.createDataFrame(data,schema=columns)
num_df.show(5)

+---+----+
|  a|   b|
+---+----+
|  1|   2|
|  3|   5|
|  4|   4|
|  3|   3|
|  2|null|
+---+----+



In [57]:
from pyspark.sql.types import DoubleType
# python function 
def py_div(a,b):
    if a and b:
        return b/a 
    

spark.udf.register("spark_div",py_div , returnType=DoubleType())
    

<function __main__.py_div(a, b)>

In [58]:
#num_df.withColumn("newdivvalue",spark_div(col("a"),col("b")))
#num_df.select(spark_div(col("a"),col("b")))

num_df.createOrReplaceTempView("num_view")
sql = "select a, b , spark_div(a,b) from num_view"
spark.sql(sql).show(5)

+---+----+------------------+
|  a|   b|   spark_div(a, b)|
+---+----+------------------+
|  1|   2|               2.0|
|  3|   5|1.6666666666666667|
|  4|   4|               1.0|
|  3|   3|               1.0|
|  2|null|              null|
+---+----+------------------+



In [59]:
spark_div = udf(py_div,DoubleType())
num_df.withColumn("newdivvalue",spark_div(col("a"),col("b"))).show(5)

+---+----+------------------+
|  a|   b|       newdivvalue|
+---+----+------------------+
|  1|   2|               2.0|
|  3|   5|1.6666666666666667|
|  4|   4|               1.0|
|  3|   3|               1.0|
|  2|null|              null|
+---+----+------------------+



In [60]:
spark.stop()