In [1]:
# Standard-library helpers: timing and warning control.
import time
import warnings

from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Suppress all Python warnings for the rest of the kernel session.
# Both calls install the same blanket "ignore" filter.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import os

# Both PySpark interpreter variables point at the `python` found on PATH.
os.environ.update(
    {
        "PYSPARK_PYTHON": "python",
        "PYSPARK_DRIVER_PYTHON": "python",
    }
)

In [3]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [4]:
# Build (or reuse) the Spark session for this notebook.
# The master URL can be overridden through the SPARK_MASTER_URL environment
# variable (e.g. "local[*]" for a single-machine run); when it is unset, the
# original cluster master address is used, so existing behaviour is unchanged.
spark = (
    SparkSession
    .builder
    .appName("Testing testing")
    .master(os.environ.get("SPARK_MASTER_URL", "spark://192.168.1.15:7077"))
    .config("spark.hadoop.hadoop.native.io", "false")
    .getOrCreate()
)

In [5]:
# Local-mode alternative to the cluster session above (kept for reference):
# spark = SparkSession.builder.master("local[*]").getOrCreate()

# Keep a handle on the SparkContext and limit driver log output to errors.
sc = spark.sparkContext
spark.sparkContext.setLogLevel("ERROR")

In [6]:
# Optional override of the shuffle partition count (left disabled):
# spark.conf.set("spark.sql.shuffle.partitions", "3")
# Turn off adaptive query execution for this session.
# NOTE(review): presumably so AQE's runtime optimisations do not mask the skew
# being demonstrated below — confirm.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [7]:
# Show the Spark-related environment settings the kernel sees (None when unset).
for env_name in ("PYSPARK_SUBMIT_ARGS", "SPARK_HOME"):
    print(os.environ.get(env_name))

None
C:\Users\Dell\spark-4.0.0-bin-hadoop3


### SKEW

In [None]:
# Relative locations of the two parquet inputs.
transactions_file, customers_file = (
    "../../data/transactions.parquet",
    "../../data/customers.parquet",
)

# Read each file into its own DataFrame (transactions first, then customers).
df_transactions, df_customers = (
    spark.read.parquet(path) for path in (transactions_file, customers_file)
)

In [12]:
# Print the schema and the first five untruncated rows of each input,
# transactions first, then customers.
for frame in (df_transactions, df_customers):
    frame.printSchema()
    frame.show(n=5, truncate=False)

root
 |-- cust_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- expense_type: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- city: string (nullable = true)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|cust_id   |start_date|end_date  |txn_id         |date      |year|month|day|expense_type |amt   |city       |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|C0YDPQWPBJ|2010-07-01|2018-12-01|TZ5SMKZY9S03OQJ|2018-10-07|2018|10   |7  |Entertainment|10.42 |boston     |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYIAPPNU066CJ5R|2016-03-27|2016|3    |27 |Motor/Travel |44.34 |portland   |
|C0YDPQWPBJ|2010-07-01|201

In [13]:
# Set the automatic broadcast-join size threshold to -1 so that the joins below
# are not turned into broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)  # -1 disables automatic broadcast joins

In [14]:
# Attach customer attributes to every transaction via an inner join on cust_id.
df_txn_details = df_transactions.join(df_customers, on="cust_id", how="inner")

In [15]:
# Baseline: time a full count over the plain (unsalted) join.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start_time = time.perf_counter()
df_txn_details.count()  # bare expression so the notebook echoes the row count
print(f"time taken: {time.perf_counter() - start_time}")

39790092

time taken: 34.85749077796936


In [16]:
# Average transaction amount per customer on the plain (unsalted) join,
# sorted with the highest averages first.
# `amt` is read as a string column, so the average relies on Spark casting it
# to a numeric type.
avg_txn_amt = df_txn_details.groupBy("cust_id").agg(
    F.avg("amt").alias("avg_txn_amt")
).orderBy(F.col("avg_txn_amt").desc())

In [17]:
# Trigger the aggregation and display the five largest averages in full width.
avg_txn_amt.show(n=5, truncate=False)

+----------+------------------+
|cust_id   |avg_txn_amt       |
+----------+------------------+
|CRBRTDCWB5|274.74398429833127|
|CA9UYOQ5DA|257.0569479285439 |
|CQYO6YFE5T|256.6091433189658 |
|CGN9VRRD9S|254.3261152684778 |
|CMWM4NK1DP|253.45855328620036|
+----------+------------------+
only showing top 5 rows


### SALTING

In [18]:
SALT_NUM = 3    # number of salt buckets each cust_id is spread across
SALT_SEED = 42  # fixed seed so the bucket assignment is repeatable between runs

# Tag every transaction with a pseudo-random bucket in [0, SALT_NUM) and keep
# only the columns used downstream. Seeding rand() keeps the assignment (and
# the sample shown below) stable across re-runs over the same input
# partitioning; an unseeded rand() gives different buckets on every execution.
salt_transactions = (
    df_transactions
    .withColumn("salt", (F.rand(seed=SALT_SEED) * SALT_NUM).cast("int"))
    .select("cust_id", "txn_id", "expense_type", "amt", "city", "salt")
)
salt_transactions.show(5, False)

+----------+---------------+-------------+------+-----------+----+
|cust_id   |txn_id         |expense_type |amt   |city       |salt|
+----------+---------------+-------------+------+-----------+----+
|C0YDPQWPBJ|TZ5SMKZY9S03OQJ|Entertainment|10.42 |boston     |2   |
|C0YDPQWPBJ|TYIAPPNU066CJ5R|Motor/Travel |44.34 |portland   |0   |
|C0YDPQWPBJ|TETSXIK4BLXHJ6W|Entertainment|3.18  |chicago    |1   |
|C0YDPQWPBJ|TQKL1QFJY3EM8LO|Groceries    |268.97|los_angeles|1   |
|C0YDPQWPBJ|TYL6DFP09PPXMVB|Entertainment|2.66  |chicago    |0   |
+----------+---------------+-------------+------+-----------+----+
only showing top 5 rows


In [19]:
# Replicate every customer row once per salt bucket: build the array
# [0, ..., SALT_NUM - 1] on each row, then explode it into one row per value.
# The full array is kept alongside the exploded value in the result.
salt_customers = (
    df_customers
    .withColumn("salt_values", F.array(*[F.lit(bucket) for bucket in range(SALT_NUM)]))
    .withColumn("salt", F.explode("salt_values"))
    .select("cust_id", "age", "gender", "salt", "salt_values")
)

salt_customers.show(n=5, truncate=False)

+----------+---+------+----+-----------+
|cust_id   |age|gender|salt|salt_values|
+----------+---+------+----+-----------+
|C007YEYTX9|34 |Female|0   |[0, 1, 2]  |
|C007YEYTX9|34 |Female|1   |[0, 1, 2]  |
|C007YEYTX9|34 |Female|2   |[0, 1, 2]  |
|C00B971T1J|37 |Female|0   |[0, 1, 2]  |
|C00B971T1J|37 |Female|1   |[0, 1, 2]  |
+----------+---+------+----+-----------+
only showing top 5 rows


In [20]:
# Salted join: match transactions to the replicated customer rows on the
# composite (cust_id, salt) key.
df_salted_join = salt_transactions.join(salt_customers, on=["cust_id", "salt"], how="inner")

In [21]:
# Time a full count over the salted join.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start_time = time.perf_counter()
df_salted_join.count()  # bare expression so the notebook echoes the row count
print(f"time taken: {time.perf_counter() - start_time}")

39790092

time taken: 27.87901759147644


In [22]:
# Average transaction amount per customer computed on the salted join,
# sorted with the highest averages first.
avg_salt_txn_amt = (
    df_salted_join
    .groupBy("cust_id")
    .agg(F.avg("amt").alias("avg_txn_amt"))
    .orderBy(F.desc("avg_txn_amt"))
)

# Time the salted aggregation end to end. show() is the action that triggers
# the join and the group-by, so the measurement covers both.
# perf_counter() is used because it is a monotonic clock intended for
# measuring elapsed time.
start_time = time.perf_counter()
avg_salt_txn_amt.show(5, False)
print(f"time taken: {time.perf_counter() - start_time}")

+----------+------------------+
|cust_id   |avg_txn_amt       |
+----------+------------------+
|CRBRTDCWB5|274.74398429833195|
|CA9UYOQ5DA|257.0569479285442 |
|CQYO6YFE5T|256.6091433189656 |
|CGN9VRRD9S|254.32611526847816|
|CMWM4NK1DP|253.45855328620053|
+----------+------------------+
only showing top 5 rows
time taken: 77.9297685623169


In [23]:
# Uncomment to shut down the Spark session once the analysis is finished.
#spark.stop()