## Working with Different Data Types

Examples taken from [Spark: The Definitive Guide](https://github.com/databricks/Spark-The-Definitive-Guide).

## Imports and Initializing Spark Session

In [1]:
# Imports

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# Build (or reuse) a SparkSession running locally on all available cores,
# and keep a handle to its SparkContext.

spark = (
    SparkSession.builder
    .appName("Working with Differents Data Types")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

## Loading Data

In [3]:
# Read the retail CSV: the first row supplies the column names and Spark
# infers each column's type from the data.

df = spark.read.csv(
    "../data/retail.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the DataFrame; show() with no arguments prints the first 20 rows
# and truncates long string values.

df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [5]:
# Print each column's name, data type and nullability as inferred when the CSV was read

df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



## Numeric Operations: `stat`

In [6]:
# `describe`
# Ask describe() for just the two numeric columns instead of summarising every
# column and then discarding most of the result. The column name uses its exact
# schema casing ("UnitPrice") so the lookup does not depend on Spark's
# case-insensitive column resolution.

df.describe("Quantity", "UnitPrice").show()

+-------+------------------+------------------+
|summary|          Quantity|         Unitprice|
+-------+------------------+------------------+
|  count|              3108|              3108|
|   mean| 8.627413127413128| 4.151946589446603|
| stddev|26.371821677029203|15.638659854603892|
|    min|               -24|               0.0|
|    max|               600|            607.49|
+-------+------------------+------------------+



In [7]:
# stat.corr: correlation between two numeric columns.
# No method is passed, so Spark's default (Pearson) coefficient is computed.

df.stat.corr("Quantity", "UnitPrice")

-0.04112314436835551

In [8]:
# stat.approxQuantile: approximate quartiles of UnitPrice

quartile_probabilities = [0.25, 0.50, 0.75]
relative_error = 0.005  # accepted approximation error; 0 would request exact quantiles

df.stat.approxQuantile("UnitPrice", quartile_probabilities, relative_error)

[1.45, 2.51, 4.25]

In [9]:
# stat.crosstab: contingency table of Quantity vs. UnitPrice,
# restricted to three rows so the table stays small.

first_three_rows = df.limit(3)
first_three_rows.stat.crosstab("Quantity", "UnitPrice").show()

+------------------+----+----+----+
|Quantity_UnitPrice|2.55|2.75|3.39|
+------------------+----+----+----+
|                 8|   0|   1|   0|
|                 6|   1|   0|   1|
+------------------+----+----+----+



In [10]:
# stat.freqItems: frequent items for each of the listed columns.
# Called through `df.stat`, like the other statistics cells in this section.

df.stat.freqItems(["StockCode", "Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 32...|
+--------------------+--------------------+



## String Operations: `regexp_replace`, `regexp_extract`

In [11]:
# regexp_replace:
# in the column Description, substitute every whole word "BLACK", "WHITE",
# "RED", "GREEN" or "BLUE" with "COLOR".

# Define the regex string. The \b word boundaries keep the pattern from
# matching inside longer words (e.g. the "RED" in "COLOURED").
regex_string = r"\b(?:BLACK|WHITE|RED|GREEN|BLUE)\b"

# Apply the transformation and show the result next to the original text
df.select(
    F.regexp_replace(F.col("Description"), regex_string, "COLOR").alias("color_clean"),
    F.col("Description"),
).show(truncate=False)

+-----------------------------------+-----------------------------------+
|color_clean                        |Description                        |
+-----------------------------------+-----------------------------------+
|COLOR HANGING HEART T-LIGHT HOLDER |WHITE HANGING HEART T-LIGHT HOLDER |
|COLOR METAL LANTERN                |WHITE METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER     |CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE|KNITTED UNION FLAG HOT WATER BOTTLE|
|COLOR WOOLLY HOTTIE COLOR HEART.   |RED WOOLLY HOTTIE WHITE HEART.     |
|SET 7 BABUSHKA NESTING BOXES       |SET 7 BABUSHKA NESTING BOXES       |
|GLASS STAR FROSTED T-LIGHT HOLDER  |GLASS STAR FROSTED T-LIGHT HOLDER  |
|HAND WARMER UNION JACK             |HAND WARMER UNION JACK             |
|HAND WARMER COLOR POLKA DOT        |HAND WARMER RED POLKA DOT          |
|ASSORTED COLOUR BIRD ORNAMENT      |ASSORTED COLOUR BIRD ORNAMENT      |
|POPPY'S PLAYHOUSE BEDROOM          |P

In [12]:
# regexp_extract:
# in the column Description, extract the first occurrence of one of
# the words "BLACK", "WHITE", "RED", "GREEN", "BLUE".

# Define the regex string. Capture group 1 holds the matched color; the \b
# word boundaries keep the pattern from matching inside longer words
# (e.g. the "RED" in "COLOURED").
regex_string = r"\b(BLACK|WHITE|RED|GREEN|BLUE)\b"

# Apply the transformation (group index 1 selects the captured color;
# rows without a match yield an empty string)
df.select(
    F.regexp_extract(F.col("Description"), regex_string, 1).alias("color_clean"),
    F.col("Description"),
).show(truncate=False)

+-----------+-----------------------------------+
|color_clean|Description                        |
+-----------+-----------------------------------+
|WHITE      |WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE      |WHITE METAL LANTERN                |
|           |CREAM CUPID HEARTS COAT HANGER     |
|           |KNITTED UNION FLAG HOT WATER BOTTLE|
|RED        |RED WOOLLY HOTTIE WHITE HEART.     |
|           |SET 7 BABUSHKA NESTING BOXES       |
|           |GLASS STAR FROSTED T-LIGHT HOLDER  |
|           |HAND WARMER UNION JACK             |
|RED        |HAND WARMER RED POLKA DOT          |
|           |ASSORTED COLOUR BIRD ORNAMENT      |
|           |POPPY'S PLAYHOUSE BEDROOM          |
|           |POPPY'S PLAYHOUSE KITCHEN          |
|           |FELTCRAFT PRINCESS CHARLOTTE DOLL  |
|           |IVORY KNITTED MUG COSY             |
|           |BOX OF 6 ASSORTED COLOUR TEASPOONS |
|           |BOX OF VINTAGE JIGSAW BLOCKS       |
|           |BOX OF VINTAGE ALPHABET BLOCKS     |


## Date Operations

In [13]:
# Build a one-row DataFrame holding the current date and the current
# timestamp (F.current_date / F.current_timestamp).

date_df = (
    spark.range(1)
    .withColumn("today", F.current_date())
    .withColumn("now", F.current_timestamp())
)

In [14]:
# Display without truncation so the full timestamp value is visible.
date_df.show(truncate=False)

+---+----------+-----------------------+
|id |today     |now                    |
+---+----------+-----------------------+
|0  |2019-01-13|2019-01-13 13:09:25.201|
+---+----------+-----------------------+



In [15]:
# Parse string literals into date / timestamp columns
# with F.to_date and F.to_timestamp.

# Note the date pattern is year-day-month, matching the literal "2019-15-03".
date_format = "yyyy-dd-MM"
timestamp_format = "yyyy/MM/dd HH:mm:ss"

parsed_date = F.to_date(F.lit("2019-15-03"), date_format)
parsed_timestamp = F.to_timestamp(F.lit("2019/03/15 18:23:59"), timestamp_format)

date_df_v2 = (
    spark.range(1)
    .withColumn("date", parsed_date)
    .withColumn("timestamp", parsed_timestamp)
)

date_df_v2.show()

+---+----------+-------------------+
| id|      date|          timestamp|
+---+----------+-------------------+
|  0|2019-03-15|2019-03-15 18:23:59|
+---+----------+-------------------+



In [16]:
# Date arithmetic with F.date_sub, F.datediff and F.months_between:
# derive the date one week earlier, then measure the gap in days and in months.

(
    date_df_v2
    .withColumn("week_ago", F.date_sub(F.col("date"), 7))
    .withColumn("days_diff", F.datediff(F.col("date"), F.col("week_ago")))
    .withColumn("months_diff", F.months_between(F.col("date"), F.col("week_ago")))
    .show()
)

+---+----------+-------------------+----------+---------+-----------+
| id|      date|          timestamp|  week_ago|days_diff|months_diff|
+---+----------+-------------------+----------+---------+-----------+
|  0|2019-03-15|2019-03-15 18:23:59|2019-03-08|        7| 0.22580645|
+---+----------+-------------------+----------+---------+-----------+



## Array Operations

In [17]:
# Build a small DataFrame whose "letters" column is an array and whose
# "words" column holds the same letters as a space-separated string.

array_rows = [
    (1, ["a", "j", "k"], "a j k"),
    (2, ["t", "y", "w"], "t y w"),
    (3, ["p", "u", "a"], "p u a"),
]
array_columns = ["id", "letters", "words"]

df_array = spark.createDataFrame(sc.parallelize(array_rows), array_columns)

In [18]:
# Inspect the array-valued DataFrame.
df_array.show()

+---+---------+-----+
| id|  letters|words|
+---+---------+-----+
|  1|[a, j, k]|a j k|
|  2|[t, y, w]|t y w|
|  3|[p, u, a]|p u a|
+---+---------+-----+



In [19]:
# F.array_contains: keep only the rows whose "letters" array includes "a"

has_letter_a = F.array_contains(F.col("letters"), "a")
df_array.filter(has_letter_a).show()

+---+---------+-----+
| id|  letters|words|
+---+---------+-----+
|  1|[a, j, k]|a j k|
|  3|[p, u, a]|p u a|
+---+---------+-----+



In [20]:
# F.split: turn the space-separated "words" string into an array column

words_as_array = F.split(F.col("words"), " ")
df_array.withColumn("letters_v2", words_as_array).show()

+---+---------+-----+----------+
| id|  letters|words|letters_v2|
+---+---------+-----+----------+
|  1|[a, j, k]|a j k| [a, j, k]|
|  2|[t, y, w]|t y w| [t, y, w]|
|  3|[p, u, a]|p u a| [p, u, a]|
+---+---------+-----+----------+



In [21]:
# F.explode: emit one output row per element of the "letters" array

one_letter_per_row = F.explode(F.col("letters"))
df_array.select(one_letter_per_row).show()

+---+
|col|
+---+
|  a|
|  j|
|  k|
|  t|
|  y|
|  w|
|  p|
|  u|
|  a|
+---+

