In [1]:
from __future__ import print_function
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType, DateType, DecimalType
from pyspark.sql.functions import desc
import pandas as pd
#import pyspark.sql.functions as f


In [2]:
# Build the notebook's SparkSession (getOrCreate hands back an already-running
# session when there is one instead of starting a second).
spark = (
    SparkSession.builder
    .appName("PavelTestSparkSQLAgg")
    .getOrCreate()
)

Setting spark.hadoop.yarn.resourcemanager.principal to pnovokshonov


In [4]:
# Read every retail CSV (first line is a header, column types are inferred),
# reduce the result to 5 partitions and mark it for caching so the aggregation
# cells below do not re-read the files each time.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/home/cdsw/resources/retail-data/all/*.csv")
    .coalesce(5)
    .cache()
)

# Make the same data reachable from Spark SQL under the name "dfTable".
df.createOrReplaceTempView("dfTable")

                                                                                

In [5]:
# count(<column>) only counts rows where that column is non-null;
# count(*) would count every row, nulls included.
from pyspark.sql.functions import count

(
    df
    .select(count("StockCode"))
    .show()  # 541909
)

[Stage 2:>                                                          (0 + 2) / 2]

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



                                                                                

In [6]:
# Exact number of different StockCode values.
from pyspark.sql.functions import countDistinct

(
    df
    .select(countDistinct("StockCode"))
    .show()  # 4070
)



+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



                                                                                

In [7]:
# Approximate distinct count: trades accuracy for speed, which can pay off on
# large data sets. rsd is the maximum relative standard deviation allowed for
# the estimate.
from pyspark.sql.functions import approx_count_distinct

(
    df
    .select(approx_count_distinct("StockCode", rsd=0.1))
    .show()  # 3364 (the exact distinct count is 4070)
)

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [8]:
# First and last StockCode values encountered in the DataFrame.
from pyspark.sql.functions import first, last

(
    df
    .select(
        first("StockCode"),
        last("StockCode"),
    )
    .show()
)

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|           21544|         85049D|
+----------------+---------------+



In [9]:
# Smallest and largest Quantity.
# NOTE: this import rebinds the names min/max in the notebook namespace, so
# from here on they refer to the Spark column functions, not Python's builtins.
from pyspark.sql.functions import min, max

(
    df
    .select(
        min("Quantity"),
        max("Quantity"),
    )
    .show()
)

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [10]:
# Total of the Quantity column.
# NOTE: this import rebinds the name sum in the notebook namespace, so from
# here on it refers to the Spark column function, not Python's builtin.
from pyspark.sql.functions import sum

(
    df
    .select(sum("Quantity"))
    .show()  # 5176450
)

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [11]:
# Sum over the distinct Quantity values only (each value contributes once).
# NOTE(review): newer PySpark releases spell this sum_distinct and deprecate
# sumDistinct - confirm the cluster's Spark version before switching.
from pyspark.sql.functions import sumDistinct

(
    df
    .select(sumDistinct("Quantity"))
    .show()  # 29310
)



+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [12]:
# Three ways to get the average Quantity: sum divided by count, avg(), and the
# SQL mean() expression. All three columns should show the same number.
from pyspark.sql.functions import sum, count, avg, expr

(
    df
    .select(
        count("Quantity").alias("total_transactions"),
        sum("Quantity").alias("total_purchases"),
        avg("Quantity").alias("avg_purchases"),
        expr("mean(Quantity)").alias("mean_purchases"),
    )
    .selectExpr(
        "total_purchases/total_transactions",
        "avg_purchases",
        "mean_purchases",
    )
    .show()
)

+--------------------------------------+----------------+----------------+
|(total_purchases / total_transactions)|   avg_purchases|  mean_purchases|
+--------------------------------------+----------------+----------------+
|                      9.55224954743324|9.55224954743324|9.55224954743324|
+--------------------------------------+----------------+----------------+



In [13]:
# Variance and standard deviation of Quantity, in both the population (*_pop)
# and the sample (*_samp) flavours.
from pyspark.sql.functions import var_pop, stddev_pop, var_samp, stddev_samp

(
    df
    .select(
        var_pop("Quantity"),
        var_samp("Quantity"),
        stddev_pop("Quantity"),
        stddev_samp("Quantity"),
    )
    .show()
)

+------------------+------------------+--------------------+---------------------+
| var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+------------------+------------------+--------------------+---------------------+
|47559.303646609354| 47559.39140929905|  218.08095663447864|   218.08115785023486|
+------------------+------------------+--------------------+---------------------+



In [14]:
# Grouping with expressions: count the Quantity values per InvoiceNo twice,
# once through the count() column function (aliased "quan") and once through
# the equivalent SQL expression string, to show both forms give the same result.
#
# expr is imported here as well: the cell uses it, and previously it only
# worked because an earlier cell had already imported it.
from pyspark.sql.functions import count, expr

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   563020|  24|             24|
|   565747|  10|             10|
|   566248|   8|              8|
|   566431|  18|             18|
|   567163|  14|             14|
|   567695|   1|              1|
|   567879|  29|             29|
|   568222|  11|             11|
|   568711|   4|              4|
|   569020|  48|             48|
|   569560|  16|             16|
|   569823|  69|             69|
|   570234|  36|             36|
|   570264|   1|              1|
|   570281|   3|              3|
|   570592|  73|             73|
|   571010|   1|              1|
|   571906|   1|              1|
|   572049|  20|             20|
|   572458|  26|             26|
+---------+----+---------------+
only showing top 20 rows

