In [17]:
// Local Spark 2.4.4 installation directory.
val spark_home = "/Users/fanzhenxin/bigData/spark-2.4.4-bin-hadoop2.7"
// The data/ directory underneath the installation.
val data_path = s"$spark_home/data"
// CSV file for 2010-12-01 from the by-day retail dataset.
val df_path = s"$data_path/retail-data/by-day/2010-12-01.csv"

spark_home = /Users/fanzhenxin/bigData/spark-2.4.4-bin-hadoop2.7
data_path = /Users/fanzhenxin/bigData/spark-2.4.4-bin-hadoop2.7/data
df_path = /Users/fanzhenxin/bigData/spark-2.4.4-bin-hadoop2.7/data/retail-data/by-day/2010-12-01.csv


/Users/fanzhenxin/bigData/spark-2.4.4-bin-hadoop2.7/data/retail-data/by-day/2010-12-01.csv

In [19]:
// Read the CSV at df_path: the first line supplies the column names and
// Spark infers the column types from the data.
val df = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv(df_path)
// Register the DataFrame as the temporary view "dfTable" so it can be queried with SQL.
df.createOrReplaceTempView("dfTable")

df = [InvoiceNo: string, StockCode: string ... 6 more fields]


[InvoiceNo: string, StockCode: string ... 6 more fields]

In [21]:
import org.apache.spark.sql.functions.col
// Method form of an equality filter: keep the rows of invoice 536365,
// project two columns, and print five rows without truncating long values.
df.filter(col("InvoiceNo").equalTo(536365))
  .select(col("InvoiceNo"), col("Description"))
  .show(5, truncate = false)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [22]:
// Operator form of an equality filter: `===` builds the column comparison.
// Keep the rows of invoice 536365 and print five rows untruncated.
df.filter(col("InvoiceNo") === 536365)
  .select(col("InvoiceNo"), col("Description"))
  .show(5, truncate = false)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [24]:
// Three boolean column expressions that are combined below.
val DOTCodeFilter = col("StockCode").equalTo("DOT")
val priceFilter = col("UnitPrice").gt(600)
val descriptFilter = col("Description").contains("POSTAGE")
// isExpensive = StockCode is DOT AND (UnitPrice > 600 OR Description contains POSTAGE).
// Keep only the rows where the flag is true and print up to five of them.
df.withColumn("isExpensive", DOTCodeFilter && (priceFilter || descriptFilter))
  .filter("isExpensive")
  .select("unitPrice", "isExpensive")
  .show(5)

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



DOTCodeFilter = (StockCode = DOT)
priceFilter = (UnitPrice > 600)
descriptFilter = contains(Description, POSTAGE)


contains(Description, POSTAGE)

In [25]:
import org.apache.spark.sql.functions.{expr,not,col}
// Flag a row as expensive when its UnitPrice is NOT <= 250, keep only the
// flagged rows, and print up to five descriptions with their prices.
df.withColumn("isExpensive", !(col("UnitPrice") <= 250))
  .where("isExpensive")
  .select(col("Description"), col("UnitPrice"))
  .show(5)

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



In [26]:
// The negated price test written as a SQL expression string: a row is flagged
// when NOT UnitPrice <= 250. Only flagged rows are kept and printed.
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))
  .where("isExpensive")
  .select(col("Description"), col("UnitPrice"))
  .show(5)

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



# 和数字相关的操作

In [27]:
df.columns

Array(InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country)

In [31]:
// Raise a column expression to a power with pow.
import org.apache.spark.sql.functions.{expr,pow}
// (Quantity * UnitPrice) squared, plus 5.
val fabricatedQuantity = pow(col("Quantity").multiply(col("UnitPrice")), 2.0).plus(5)
// Show the customer id next to the computed value, named realQuantity.
df.select(expr("CustomerId"), fabricatedQuantity.as("realQuantity"))
  .show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



fabricatedQuantity = (POWER((Quantity * UnitPrice), 2.0) + 5)


(POWER((Quantity * UnitPrice), 2.0) + 5)

In [29]:
// SQL-expression form of the projection: CustomerId plus
// (Quantity * UnitPrice)^2 + 5 under the name realQuantity.
df.selectExpr("CustomerId", "(POWER((Quantity * UnitPrice),2.0)+5) as realQuantity")
  .show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [32]:
// round  : rounds with HALF_UP mode (a trailing 5 rounds away from zero).
// bround : rounds with HALF_EVEN mode (a trailing 5 rounds to the nearest even digit).
import org.apache.spark.sql.functions.{round,bround}
// Show UnitPrice rounded to one decimal place next to the original value.
df.select(round(col("UnitPrice"), 1).as("rounded"), col("UnitPrice"))
  .show(5)

+-------+---------+
|rounded|UnitPrice|
+-------+---------+
|    2.6|     2.55|
|    3.4|     3.39|
|    2.8|     2.75|
|    3.4|     3.39|
|    3.4|     3.39|
+-------+---------+
only showing top 5 rows



In [35]:
import org.apache.spark.sql.functions.lit
// Contrast the two rounding modes on literal values: round uses HALF_UP,
// bround uses HALF_EVEN. Numeric literals are passed (not strings) so the
// columns are doubles from the start rather than depending on an implicit
// string-to-double cast inside the rounding functions.
df.select(round(lit(2.6)), bround(lit(2.5))).show(2)

+-------------+--------------+
|round(2.6, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [39]:
// Compute the correlation between Quantity and UnitPrice in two ways.
import org.apache.spark.sql.functions.corr

// 1) Through the DataFrame statistics helper, which returns the coefficient as a Double.
df.stat.corr("Quantity", "UnitPrice", "pearson")
// 2) Through the corr aggregate function, displayed as a one-row DataFrame.
df.select(corr(col("Quantity"), col("UnitPrice"))).show()

-0.04112314436835551+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [40]:
// Approximate median of a numeric column.
// colName       : the column whose quantile is computed.
// quantileProbs : probabilities to evaluate; 0.5 requests the median.
// relError      : relative target precision passed to approxQuantile.
val colName = "UnitPrice"
val quantileProbs = Array(0.5)
val relError = 0.05
// Use colName rather than repeating the literal so the declared variable
// actually controls which column is analysed.
df.stat.approxQuantile(colName, quantileProbs, relError)

colName = UnitPrice
quantileProbs = Array(0.5)
relError = 0.05


Array(2.51)

In [None]:
// Cross-tabulate StockCode against Quantity and print the first five rows of the result.
df.stat
  .crosstab("StockCode", "Quantity")
  .show(5)

In [42]:
import org.apache.spark.sql.functions.monotonically_increasing_id

// Generate an id column of monotonically increasing 64-bit integers
// (unique and increasing, but per the Spark API docs not guaranteed to be
// consecutive) and print the first two values.
df.select(monotonically_increasing_id())
  .show(2)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
+-----------------------------+
only showing top 2 rows

