# 7. AGGREGATIONS

Spark also allows us to create the following grouping types:

- The simplest grouping is to just summarize a complete DataFrame by performing an aggregation in a select statement.
- A “group by” allows you to specify one or more keys as well as one or more aggregation functions to transform the value columns.
- A “window” gives you the ability to specify one or more keys as well as one or more aggregation functions to transform the value columns. However, the rows input to the function are somehow related to the current row.
- A “grouping set,” which you can use to aggregate at multiple different levels. Grouping sets are available as a primitive in SQL and via rollups and cubes in DataFrames.
- A “rollup” makes it possible for you to specify one or more keys as well as one or more aggregation functions to transform the value columns, which will be summarized hierarchically.
- A “cube” allows you to specify one or more keys as well as one or more aggregation functions to transform the value columns, which will be summarized across all combinations of columns.

## SET UP

In [1]:
# Notebook shell escape (IPython syntax, not plain Python): install findspark.
!pip install findspark

import findspark
# Called before any pyspark import in the following cells.
# NOTE(review): presumably this makes the local Spark install importable — confirm in the findspark docs.
findspark.init()



In [2]:
# Load PySpark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
# NOTE: this wildcard import brings in pyspark functions such as sum/min/max,
# which shadow the Python builtins of the same name.
from pyspark.sql.functions import *

# Get (or create) a session named "Test_spark" with master "local[*]".
spark = (
    SparkSession.builder
    .appName("Test_spark")
    .master("local[*]")
    .getOrCreate()
)

# Bare expression so the notebook displays the session object.
spark

In [3]:
# Read "all.csv" as CSV with a header row and inferred column types,
# then coalesce the result to at most 5 partitions.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("all.csv")
    .coalesce(5)
)

# Mark the DataFrame for caching and expose it to SQL under the name "dfTable".
df.cache()
df.createOrReplaceTempView("dfTable")

AnalysisException: Path does not exist: file:/C:/Users/sobando/Downloads/SPARK_PRACTICE/all.csv;

In [None]:
# Sanity check on the load: the cell's value is True when the row count matches.
expected_rows = 541909
df.count() == expected_rows

## Aggregation Functions

### count

In [None]:
# DataFrame API: count over the StockCode column.
from pyspark.sql.functions import count

df.select(
    count("StockCode"),
).show()  # author's recorded result: 541909

In [None]:
# SQL counterpart: COUNT(*) over the dfTable view.
spark.sql(
    "SELECT COUNT(*) FROM dfTable"
).show()

### countDistinct

In [None]:
# Number of distinct StockCode values.
from pyspark.sql.functions import countDistinct

df.select(
    countDistinct("StockCode"),
).show()  # author's recorded result: 4070

In [None]:
# SQL counterpart of countDistinct("StockCode"). The query names StockCode
# explicitly: COUNT(DISTINCT *) ranges over every column, so it would not
# reproduce the DataFrame result (4070).
spark.sql("SELECT COUNT(DISTINCT StockCode) FROM DFTABLE").show()

### approx_count_distinct

In [None]:
# Approximate distinct count of StockCode; 0.1 is passed as the second
# (maximum estimation error) argument.
from pyspark.sql.functions import approx_count_distinct

df.select(
    approx_count_distinct("StockCode", 0.1),
).show()  # author's recorded result: 3364

In [None]:
# SQL counterpart of the approximate distinct count, same 0.1 argument.
spark.sql(
    "SELECT approx_count_distinct(StockCode, 0.1) FROM DFTABLE"
).show()

### first and last

In [None]:
# First and last StockCode values of the DataFrame.
from pyspark.sql.functions import first, last

df.select(
    first("StockCode"),
    last("StockCode"),
).show()

In [None]:
# SQL counterpart: first() and last() of StockCode over dfTable.
spark.sql(
    "SELECT first(StockCode), last(StockCode) FROM dfTable"
).show()

### min and max

In [None]:
# Smallest and largest Quantity.
# NOTE: importing these names shadows Python's built-in min/max from here on.
from pyspark.sql.functions import min, max

df.select(
    min("Quantity"),
    max("Quantity"),
).show()

In [None]:
# SQL counterpart: min() and max() of Quantity over dfTable.
spark.sql(
    "SELECT min(Quantity), max(Quantity) FROM dfTable"
).show()

### sum

In [None]:
# Total of the Quantity column.
# NOTE: importing this name shadows Python's built-in sum from here on.
from pyspark.sql.functions import sum

df.select(
    sum("Quantity"),
).show()  # author's recorded result: 5176450

In [None]:
# SQL counterpart: sum() of Quantity over dfTable.
spark.sql(
    "SELECT sum(Quantity) FROM dfTable"
).show()

### sumDistinct

In [None]:
# Sum over the distinct Quantity values only.
from pyspark.sql.functions import sumDistinct

df.select(
    sumDistinct("Quantity"),
).show()  # author's recorded result: 29310

In [None]:
# SQL counterpart of sumDistinct("Quantity"). DISTINCT is required here:
# a plain SUM(Quantity) returns the full total (5176450), not the 29310
# recorded for the distinct sum.
spark.sql("SELECT SUM(DISTINCT Quantity) FROM dfTable").show()  # 29310

### avg

In [None]:
# Compute the average of Quantity three ways (sum/count, avg, mean) and show
# them side by side.
from pyspark.sql.functions import sum, count, avg, expr

(
    df.select(
        count("Quantity").alias("total_transactions"),
        sum("Quantity").alias("total_purchases"),
        avg("Quantity").alias("avg_purchases"),
        expr("mean(Quantity)").alias("mean_purchases"),
    )
    .selectExpr(
        "total_purchases/total_transactions",
        "avg_purchases",
        "mean_purchases",
    )
    .show()
)

### Variance and Standard Deviation

In [None]:
# Population and sample variance / standard deviation of Quantity.
from pyspark.sql.functions import var_pop, stddev_pop, var_samp, stddev_samp

df.select(
    var_pop("Quantity"),
    var_samp("Quantity"),
    stddev_pop("Quantity"),
    stddev_samp("Quantity"),
).show()

In [None]:
# SQL text for the variance / standard deviation query over dfTable;
# it is executed by the following cell.
script = " ".join([
    "SELECT var_pop(Quantity), var_samp(Quantity),",
    "stddev_pop(Quantity), stddev_samp(Quantity)",
    "FROM dfTable",
])

In [None]:
# Run the SQL text held in `script` (assigned in the preceding cell) and print the result.
spark.sql(script).show()

### skewness and kurtosis

In [None]:
# Skewness and kurtosis of Quantity. The cell previously contained this
# import and select twice, which ran the same job two times.
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

In [None]:
# SQL counterpart: skewness() and kurtosis() of Quantity over dfTable.
spark.sql(
    "SELECT skewness(Quantity), kurtosis(Quantity) FROM dfTable"
).show()

### Covariance and Correlation

In [None]:
# Correlation plus sample and population covariance between InvoiceNo and Quantity.
from pyspark.sql.functions import corr, covar_pop, covar_samp

df.select(
    corr("InvoiceNo", "Quantity"),
    covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity"),
).show()

In [None]:
# SQL text for the correlation / covariance query over dfTable;
# it is executed by the following cell.
script = " ".join([
    "SELECT corr(InvoiceNo, Quantity), covar_samp(InvoiceNo, Quantity),",
    "covar_pop(InvoiceNo, Quantity)",
    "FROM dfTable",
])

In [None]:
# Run the SQL text held in `script` (assigned in the preceding cell) and print the result.
spark.sql(script).show()

### Aggregating to Complex Types

In [None]:
# Gather Country values into a set (unique values) and a list (all values).
from pyspark.sql.functions import collect_set, collect_list

df.agg(
    collect_set("Country"),
    collect_list("Country"),
).show()

In [None]:
# SQL counterpart of the DataFrame cell: one collect_set and one collect_list
# (the query previously selected collect_set twice).
spark.sql("SELECT collect_set(Country), collect_list(Country) FROM dfTable").show()

## Grouping 

In [None]:
# Row count for every (InvoiceNo, CustomerId) pair.
(
    df.groupBy("InvoiceNo", "CustomerId")
    .count()
    .show()
)

### Grouping with Expressions

In [None]:
# Per invoice, count Quantity twice: once through the column function
# (aliased "quan") and once through a SQL expression string.
from pyspark.sql.functions import count

(
    df.groupBy("InvoiceNo")
    .agg(
        count("Quantity").alias("quan"),
        expr("count(Quantity)"),
    )
    .show()
)

### Grouping with Maps

In [None]:
# Per invoice: average and population standard deviation of Quantity,
# both given as SQL expression strings.
(
    df.groupBy("InvoiceNo")
    .agg(
        expr("avg(Quantity)"),
        expr("stddev_pop(Quantity)"),
    )
    .show()
)

In [None]:
# SQL text for the per-invoice avg / stddev_pop query; it is executed by the
# following cell.
script = (
    "SELECT avg(Quantity), stddev_pop(Quantity), InvoiceNo "
    "FROM dfTable "
    "GROUP BY InvoiceNo"
)

In [None]:
# Run the SQL text held in `script` (assigned in the preceding cell) and print the result.
spark.sql(script).show()

## Window Functions***

In [None]:
from pyspark.sql.functions import col, to_date

# Add a "date" column parsed from InvoiceDate with the pattern "MM/d/yyyy H:mm",
# then register the result as the "dfWithDate" view for the SQL cells.
dfWithDate = df.withColumn(
    "date",
    to_date(col("InvoiceDate"), "MM/d/yyyy H:mm"),
)
dfWithDate.createOrReplaceTempView("dfWithDate")

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

# Window per (CustomerId, date), ordered by Quantity descending, with a frame
# running from the start of the partition through the current row.
windowSpec = (
    Window
    .partitionBy("CustomerId", "date")
    .orderBy(desc("Quantity"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [None]:
from pyspark.sql.functions import max

# Column expression: maximum Quantity evaluated over windowSpec.
maxPurchaseQuantity = (
    max(col("Quantity"))
    .over(windowSpec)
)

In [None]:
from pyspark.sql.functions import dense_rank, rank

# Column expressions: rank and dense rank evaluated over the same windowSpec.
purchaseRank = rank().over(windowSpec)
purchaseDenseRank = dense_rank().over(windowSpec)

In [None]:
from pyspark.sql.functions import col

# For rows with a CustomerId, sorted by CustomerId, show each purchase's
# quantity next to its rank, dense rank and window maximum.
(
    dfWithDate
    .where("CustomerId IS NOT NULL")
    .orderBy("CustomerId")
    .select(
        col("CustomerId"),
        col("date"),
        col("Quantity"),
        purchaseRank.alias("quantityRank"),
        purchaseDenseRank.alias("quantityDenseRank"),
        maxPurchaseQuantity.alias("maxPurchaseQuantity"),
    )
    .show()
)

In [None]:
# SQL text for the window query over dfWithDate; it is executed by the
# following cell. The three window functions share one window definition,
# substituted for {w} below.
script = (
    "SELECT CustomerId, date, Quantity, "
    "rank(Quantity) OVER ({w}) as rank, "
    "dense_rank(Quantity) OVER ({w}) as dRank, "
    "max(Quantity) OVER ({w}) as maxPurchase "
    "FROM dfWithDate WHERE CustomerId IS NOT NULL ORDER BY CustomerId "
).format(
    w=(
        "PARTITION BY CustomerId, date "
        "ORDER BY Quantity DESC NULLS LAST "
        "ROWS BETWEEN "
        "UNBOUNDED PRECEDING AND "
        "CURRENT ROW"
    )
)

In [None]:
# Run the SQL text held in `script` (assigned in the preceding cell) and print the result.
spark.sql(script).show()

## Grouping Sets

In [None]:
# Drop every row that contains a null and register the result as "dfNoNull".
# DataFrame.drop() with no arguments removes columns (none here), so it left
# the nulls in place; na.drop() is the row-wise null filter.
dfNoNull = dfWithDate.na.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

In [None]:
# SQL text: total Quantity per (customerId, stockCode) over dfNoNull, sorted
# descending on both keys; it is executed by the following cell.
script = (
    "SELECT CustomerId, stockCode, sum(Quantity) "
    "FROM dfNoNull "
    "GROUP BY customerId, stockCode "
    "ORDER BY CustomerId DESC, stockCode DESC"
)

In [None]:
# Run the SQL text held in `script` (assigned in the preceding cell) and print the result.
spark.sql(script).show()

In [None]:
# SQL text: the same aggregation written with an explicit GROUPING SETS
# clause containing the single set (customerId, stockCode); it is executed by
# the following cell.
script = (
    "SELECT CustomerId, stockCode, sum(Quantity) "
    "FROM dfNoNull "
    "GROUP BY customerId, stockCode "
    "GROUPING SETS((customerId, stockCode)) "
    "ORDER BY CustomerId DESC, stockCode DESC"
)

In [None]:
# Run the SQL text held in `script` (assigned in the preceding cell) and print the result.
spark.sql(script).show()

In [None]:
# SQL text: GROUPING SETS with both (customerId, stockCode) and the empty
# set (); it is executed by the following cell.
script = (
    "SELECT CustomerId, stockCode, sum(Quantity) "
    "FROM dfNoNull "
    "GROUP BY customerId, stockCode "
    "GROUPING SETS((customerId, stockCode),()) "
    "ORDER BY CustomerId DESC, stockCode DESC"
)

In [None]:
# Run the SQL text held in `script` (assigned in the preceding cell) and print the result.
spark.sql(script).show()

## Rollups **

In [None]:
# Roll up total Quantity over (Date, Country), rename the aggregate column to
# total_quantity, and sort by Date. `sum` is the pyspark function imported in
# earlier cells, not the Python builtin.
rolledUpDF = (
    dfNoNull.rollup("Date", "Country")
    .agg(sum("Quantity"))
    .selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")
    .orderBy("Date")
)
rolledUpDF.show()

In [None]:
# Show the rollup rows whose Country is NULL, then those whose Date is NULL.
for null_filter in ("Country IS NULL", "Date IS NULL"):
    rolledUpDF.where(null_filter).show()

## Cube **

In [None]:
from pyspark.sql.functions import sum

# Cube total Quantity over (Date, Country) and show it sorted by Date.
(
    dfNoNull.cube("Date", "Country")
    .agg(sum(col("Quantity")))
    .select("Date", "Country", "sum(Quantity)")
    .orderBy("Date")
    .show()
)

## Grouping Metadata [No Python Equivalent]

## Pivot

In [None]:
# Group by date, pivot on Country, and sum within each group.
pivoted = (
    dfWithDate
    .groupBy("date")
    .pivot("Country")
    .sum()
)

In [None]:
# For dates after 2011-12-05, show the pivoted "USA_sum(Quantity)" column.
(
    pivoted
    .where("date > '2011-12-05'")
    .select("date", "USA_sum(Quantity)")
    .show()
)

## User-Defined Aggregation Functions [No Python Equivalent]