In [1]:
import pyspark
print(pyspark.__version__)

4.0.0


In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql.functions import col, sum as _sum, avg, count, month, year

In [4]:
# Create Spark session
spark = SparkSession.builder \
.appName("CustomerTransactionAnalysis") \
.getOrCreate()

In [13]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Load CSV") \
    .getOrCreate()

df = spark.read.csv(r"Downloads\Sales Dataset.csv", header=True, inferSchema=True)
df.show()


+---+----------+------+---+----------------+--------+--------------+------------+
|_c0|      Date|Gender|Age|Product Category|Quantity|Price per Unit|Total Amount|
+---+----------+------+---+----------------+--------+--------------+------------+
|  0|2023-11-24|  Male| 34|          Beauty|       3|            50|         150|
|  1|2023-02-27|Female| 26|        Clothing|       2|           500|        1000|
|  2|2023-01-13|  Male| 50|     Electronics|       1|            30|          30|
|  3|2023-05-21|  Male| 37|        Clothing|       1|           500|         500|
|  4|2023-05-06|  Male| 30|          Beauty|       2|            50|         100|
|  5|2023-04-25|Female| 45|          Beauty|       1|            30|          30|
|  6|2023-03-13|  Male| 46|        Clothing|       2|            25|          50|
|  7|2023-02-22|  Male| 30|     Electronics|       4|            25|         100|
|  8|2023-12-13|  Male| 63|     Electronics|       2|           300|         600|
|  9|2023-10-07|

In [14]:
import pandas as pd

In [20]:
df.take(5)

[Row(_c0=0, Date=datetime.date(2023, 11, 24), Gender='Male', Age=34, Product Category='Beauty', Quantity=3, Price per Unit=50, Total Amount=150),
 Row(_c0=1, Date=datetime.date(2023, 2, 27), Gender='Female', Age=26, Product Category='Clothing', Quantity=2, Price per Unit=500, Total Amount=1000),
 Row(_c0=2, Date=datetime.date(2023, 1, 13), Gender='Male', Age=50, Product Category='Electronics', Quantity=1, Price per Unit=30, Total Amount=30),
 Row(_c0=3, Date=datetime.date(2023, 5, 21), Gender='Male', Age=37, Product Category='Clothing', Quantity=1, Price per Unit=500, Total Amount=500),
 Row(_c0=4, Date=datetime.date(2023, 5, 6), Gender='Male', Age=30, Product Category='Beauty', Quantity=2, Price per Unit=50, Total Amount=100)]

In [28]:

columns = ["Product Category", "Total Amount"]

In [33]:
from pyspark.sql.functions import sum

In [34]:
df.agg(sum("Total Amount").alias("Total Revenue")).show()

+-------------+
|Total Revenue|
+-------------+
|       456000|
+-------------+



In [35]:
from pyspark.sql.functions import avg

In [36]:
df.agg(avg("Total Amount").alias("Avg Transaction Value")).show()

+---------------------+
|Avg Transaction Value|
+---------------------+
|                456.0|
+---------------------+



In [37]:
df.groupBy("Gender") \
  .agg(sum("Total Amount").alias("Total Spent")) \
  .orderBy("Total Spent", ascending=False) \
  .show()

+------+-----------+
|Gender|Total Spent|
+------+-----------+
|Female|     232840|
|  Male|     223160|
+------+-----------+



In [38]:
from pyspark.sql.functions import when

df = df.withColumn(
    "Age Group",
    when(df.Age < 25, "<25")
    .when((df.Age >= 25) & (df.Age < 40), "25-39")
    .when((df.Age >= 40) & (df.Age < 60), "40-59")
    .otherwise("60+")
)

df.groupBy("Age Group") \
  .agg(sum("Total Amount").alias("Total Spent")) \
  .orderBy("Total Spent", ascending=False) \
  .show()


+---------+-----------+
|Age Group|Total Spent|
+---------+-----------+
|    40-59|     191705|
|    25-39|     144830|
|      <25|      74650|
|      60+|      44815|
+---------+-----------+



In [39]:
from pyspark.sql.functions import count

df.groupBy("Product Category") \
  .agg(count("*").alias("Transaction Count")) \
  .orderBy("Transaction Count", ascending=False) \
  .show()


+----------------+-----------------+
|Product Category|Transaction Count|
+----------------+-----------------+
|        Clothing|              351|
|     Electronics|              342|
|          Beauty|              307|
+----------------+-----------------+



In [40]:
from pyspark.sql.functions import month

df = df.withColumn("Month", month("Date"))

df.groupBy("Month") \
  .agg(sum("Total Amount").alias("Monthly Sales")) \
  .orderBy("Month") \
  .show()


+-----+-------------+
|Month|Monthly Sales|
+-----+-------------+
|    1|        36980|
|    2|        44060|
|    3|        28990|
|    4|        33870|
|    5|        53150|
|    6|        36715|
|    7|        35465|
|    8|        36960|
|    9|        23620|
|   10|        46580|
|   11|        34920|
|   12|        44690|
+-----+-------------+

