In [36]:
from pyspark.sql import SparkSession

In [37]:
# Start (or reuse an already running) Spark session for this analysis.
session = (
    SparkSession.builder
    .appName("Car Sales Analysis")
    .getOrCreate()
)

In [38]:
# Load the sales CSV: the first row provides the column names, and
# inferSchema asks Spark to derive column types from the data.
car_sales = session.read.csv(
    'car_sales_2024.csv',
    header=True,
    inferSchema=True,
)

In [39]:
# Sanity check: number of rows read from the CSV.
car_sales.count()

5000

In [40]:
# Preview the loaded data (show() prints at most 20 rows by default).
car_sales.show()

+------------+----------+-----------+---------+---------+--------+--------+------+-------+----------+--------------+----------------+-----------------+-----+----------+
|     sale_id| sale_date|customer_id| car_make|car_model|car_year|category| color|mileage|sale_price|payment_method|      dealership|      salesperson|state|commission|
+------------+----------+-----------+---------+---------+--------+--------+------+-------+----------+--------------+----------------+-----------------+-----+----------+
|SALE20244500|2024-01-01|  CUST14500|    Honda|   Accord|    2023|   Sedan| Black|     16|  29518.87|         Lease|   Westside Auto|   David Anderson|   FL|     850.2|
|SALE20241760|2024-01-01|  CUST11760|      BMW| 3 Series|    2024|  Luxury| Black|     35|  47624.88|       Finance|North Point Cars|     Mary Jackson|   IL|   1012.44|
|SALE20241772|2024-01-01|  CUST11772|    Mazda|     CX-5|    2024|     SUV| White|      8|  31947.14|       Finance| Downtown Motors|      Emily Davis|   P

In [41]:
# Get the sales for a given state
# The typed value is compared through a Column expression instead of being
# spliced into a SQL string: with string formatting, a quote character in the
# input would break (or change the meaning of) the filter expression.
# Surrounding whitespace is stripped so that " FL " still matches "FL".
state = input("Enter the state: ").strip()
state_sales = car_sales.filter(car_sales["state"] == state)

In [42]:
# Number of sales recorded for the selected state.
state_sales.count()

538

In [43]:
# Preview the sales for the selected state (at most 20 rows by default).
state_sales.show()

+------------+----------+-----------+---------+---------+--------+--------+-----+-------+----------+--------------+----------------+-----------------+-----+----------+
|     sale_id| sale_date|customer_id| car_make|car_model|car_year|category|color|mileage|sale_price|payment_method|      dealership|      salesperson|state|commission|
+------------+----------+-----------+---------+---------+--------+--------+-----+-------+----------+--------------+----------------+-----------------+-----+----------+
|SALE20244500|2024-01-01|  CUST14500|    Honda|   Accord|    2023|   Sedan|Black|     16|  29518.87|         Lease|   Westside Auto|   David Anderson|   FL|     850.2|
|SALE20243303|2024-01-01|  CUST13303|    Tesla|  Model 3|    2023|Electric|Black|     10|  43115.02|          Cash|North Point Cars|Jennifer Martinez|   FL|   1546.21|
|SALE20240803|2024-01-06|  CUST10803|    Honda|    Civic|    2024|   Sedan|Green|     35|  24368.51|       Finance|North Point Cars|    Sarah Johnson|   FL|   1

In [49]:
from pyspark.sql.functions import sum, count, round, cast, col

In [50]:
# Get sales by state
# Column.cast() is what actually converts the column type in Spark. The
# earlier `cast('float', col('sale_price'))` call did not: there is no Spark
# function of that shape in `pyspark.sql.functions`, and the `cast` name
# importable from it is (in PySpark 3.x) `typing.cast`, which returns its
# second argument unchanged -- so the conversion was silently skipped.
# 'double' is used instead of 'float' to keep 64-bit precision when summing
# currency amounts.
# NOTE: `sum`, `round` and `count` here are the pyspark.sql.functions versions
# imported earlier in the notebook, not the Python builtins.
sales_by_state = (
    car_sales
    .withColumn('sale_price', col('sale_price').cast('double'))
    .groupBy('state')
    .agg(
        round(sum('sale_price'), 2).alias('total_sales'),
        count('sale_price').alias('sales_count'),
    )
    .orderBy(col('total_sales').desc())  # highest-grossing state first
)

In [51]:
# Number of distinct states present in the aggregated result.
sales_by_state.count()

10

In [53]:
# Convert to Pandas for better formatting.
# toPandas() collects the whole result to the driver; that is fine here
# because the aggregate holds only one row per state.
sales_by_state.toPandas()

Unnamed: 0,state,total_sales,sales_count
0,FL,20220854.18,538
1,IL,20205300.48,512
2,OH,19920681.73,524
3,NY,19062197.61,508
4,GA,18921434.95,505
5,CA,18727709.11,509
6,TX,18309337.33,489
7,MI,18264746.87,498
8,PA,17866692.35,480
9,NC,16101036.37,437


In [55]:
# Show the per-state totals as a Pandas table, largest total first.
# sales_by_state is already ordered where it is defined; the explicit sort
# is kept so this cell's ordering does not depend on that.
(
    sales_by_state
    .orderBy('total_sales', ascending=False)
    .toPandas()
)

Unnamed: 0,state,total_sales,sales_count
0,FL,20220854.18,538
1,IL,20205300.48,512
2,OH,19920681.73,524
3,NY,19062197.61,508
4,GA,18921434.95,505
5,CA,18727709.11,509
6,TX,18309337.33,489
7,MI,18264746.87,498
8,PA,17866692.35,480
9,NC,16101036.37,437
