# Chapter 6

## Working with Different Types of Data

In [1]:
# Read one day of the retail data set from CSV, using the first line as the
# header and letting Spark infer the column types.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("gs://reddys-data-for-experimenting/retail-data/by-day/2010-12-01.csv")
)

# Print the inferred column names and types.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [2]:
# Display the first five rows of the loaded DataFrame.
df.show(n=5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [6]:
from pyspark.sql.functions import col

# InvoiceNo is a *string* column (see the printed schema), so filter with a
# string literal. Comparing against the integer 536365 forces Spark to cast
# every InvoiceNo to a number: non-numeric invoice numbers then turn into
# null, or raise a cast error when ANSI mode is enabled.
(
    df.where(col("InvoiceNo") == "536365")
    .select("InvoiceNo", "StockCode")
    .show(5)
)

+---------+---------+
|InvoiceNo|StockCode|
+---------+---------+
|   536365|   85123A|
|   536365|    71053|
|   536365|   84406B|
|   536365|   84029G|
|   536365|   84029E|
+---------+---------+
only showing top 5 rows



In [10]:
from pyspark.sql.functions import corr

# Two ways to compute the correlation between Quantity and UnitPrice.
# 1) df.stat.corr returns a plain Python float. It is not the last expression
#    of the cell, so it must be printed explicitly; otherwise the job runs and
#    its result is silently thrown away.
print(df.stat.corr("Quantity", "UnitPrice"))

# 2) The corr() aggregate function yields a one-row DataFrame.
df.select(corr("Quantity", "UnitPrice").alias("QPCorrelation")).show()

+--------------------+
|       QPCorrelation|
+--------------------+
|-0.04112314436835551|
+--------------------+



In [12]:
# Show the count, mean, stddev, min and max summary rows for the columns of df.
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

In [15]:
# Remove the InvoiceNo and Description columns in a single call, then show
# summary statistics for the columns that remain.
cDf = df.drop("InvoiceNo", "Description")

cDf.describe().show()

+-------+------------------+------------------+------------------+------------------+--------------+
|summary|         StockCode|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+------------------+------------------+------------------+------------------+--------------+
|  count|              3108|              3108|              3108|              1968|          3108|
|   mean|27834.304044117645| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|17407.897548583845|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|             10002|               -24|               0.0|           12431.0|     Australia|
|    max|              POST|               600|            607.49|           18229.0|United Kingdom|
+-------+------------------+------------------+------------------+------------------+--------------+



### Using UDFs

In [17]:
# Build a single-column DataFrame named "num" holding the integers 0 through 19.
udfDf = spark.range(0, 20).toDF("num")

In [19]:
# Display the first five rows of the demo DataFrame.
udfDf.show(n=5)

+---+
|num|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows



In [20]:
def power3(value):
    """Return ``value`` raised to the third power."""
    cubed = pow(value, 3)
    return cubed


# Quick local check of the plain Python function.
power3(3)

27

In [21]:
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

# Declare the return type explicitly. Without it udf() defaults to StringType,
# so the cubed values would come back as strings instead of numbers and could
# not be used in arithmetic or numeric comparisons without another cast.
power3udf = udf(power3, LongType())

In [22]:
# Apply the UDF to the "num" column and display the first three results.
from pyspark.sql.functions import col

udfDf.select(power3udf(col("num"))).show(n=3)

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
+-----------+
only showing top 3 rows



Performance with Python UDFs will be much lower than with Spark's native functions, because every row has to be serialized to a Python worker process and back.

In [23]:
# Shut down Spark now that the notebook is finished.
# NOTE(review): spark.stop() presumably already stops the underlying
# SparkContext, which would make sc.stop() redundant — confirm that `sc` is
# the same context as spark.sparkContext before removing it.
spark.stop()
sc.stop()