# Ex-2070 - Columns and expressions


In [1]:
import requests

! curl -L -o data.zip  https://www.kaggle.com/api/v1/datasets/download/zahidmughal2343/amazon-sales-2025
! unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3617  100  3617    0     0   8272      0 --:--:-- --:--:-- --:--:-- 46974
Archive:  data.zip
replace amazon_sales_data 2025.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: amazon_sales_data 2025.csv  


In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# 2. Read the CSV: the first row provides the column names and Spark infers each column's type
df = spark.read.csv(
    "amazon_sales_data 2025.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# 3. Check the structure of the DataFrame: print every column's name,
#    inferred data type and nullability as a tree
df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Total Sales: integer (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Customer Location: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Status: string (nullable = true)



In [5]:
# 2a. Order-related columns via `selectExpr`. Each argument is parsed as a SQL
#     expression, so column names containing spaces are wrapped in backticks.
df_orders = df.selectExpr(
    "`Order ID`",
    "Date",
    "`Total Sales`",
    "`Payment Method`",
    "Status",
)
df_orders.show()

+--------+--------+-----------+--------------+---------+
|Order ID|    Date|Total Sales|Payment Method|   Status|
+--------+--------+-----------+--------------+---------+
| ORD0001|14-03-25|        180|    Debit Card|Cancelled|
| ORD0002|20-03-25|        400|    Debit Card|  Pending|
| ORD0003|15-02-25|        120|    Amazon Pay|Cancelled|
| ORD0004|19-02-25|        180|   Credit Card|  Pending|
| ORD0005|10-03-25|        450|    Debit Card|  Pending|
| ORD0006|14-03-25|         20|   Credit Card|  Pending|
| ORD0007|18-03-25|        600|        PayPal|Completed|
| ORD0008|02-03-25|        500|        PayPal|Completed|
| ORD0009|08-03-25|         60|        PayPal|Completed|
| ORD0010|12-03-25|        500|   Credit Card|Cancelled|
| ORD0011|17-02-25|         30|    Amazon Pay|  Pending|
| ORD0012|13-03-25|        160|   Credit Card|Completed|
| ORD0013|01-03-25|       1600|     Gift Card|  Pending|
| ORD0014|04-03-25|       1800|   Credit Card|Cancelled|
| ORD0015|20-02-25|        600|

In [6]:
# 2b. Product-related columns via `select`, passing plain column-name strings
df_products = df.select(["Category", "Product", "Price"])
df_products.show()

+---------------+---------------+-----+
|       Category|        Product|Price|
+---------------+---------------+-----+
|       Footwear|  Running Shoes|   60|
|    Electronics|     Headphones|  100|
|       Footwear|  Running Shoes|   60|
|       Footwear|  Running Shoes|   60|
|    Electronics|     Smartwatch|  150|
|       Clothing|        T-Shirt|   20|
|    Electronics|     Smartwatch|  150|
|    Electronics|     Smartphone|  500|
|       Clothing|        T-Shirt|   20|
|    Electronics|     Smartphone|  500|
|          Books|           Book|   15|
|       Clothing|          Jeans|   40|
|    Electronics|         Laptop|  800|
|Home Appliances|Washing Machine|  600|
|    Electronics|     Smartwatch|  150|
|Home Appliances|   Refrigerator| 1200|
|       Clothing|        T-Shirt|   20|
|    Electronics|     Smartphone|  500|
|       Footwear|  Running Shoes|   60|
|    Electronics|     Headphones|  100|
+---------------+---------------+-----+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import col

# 2c. Select only customer-related columns (different method: passing
#     Column objects built with `col()` instead of plain strings)
df_customers = df.select(
    col("Customer Name"),
    col("Customer Location"),
)
df_customers.show()

+-------------+-----------------+
|Customer Name|Customer Location|
+-------------+-----------------+
|   Emma Clark|         New York|
|Emily Johnson|    San Francisco|
|     John Doe|           Denver|
|Olivia Wilson|           Dallas|
|   Emma Clark|         New York|
|     John Doe|           Dallas|
|   Emma Clark|          Houston|
|Sophia Miller|            Miami|
|Sophia Miller|           Boston|
|Emily Johnson|    San Francisco|
|    David Lee|           Boston|
|Michael Brown|           Dallas|
|Daniel Harris|    San Francisco|
|Michael Brown|            Miami|
|     John Doe|          Seattle|
|     John Doe|           Boston|
|   Emma Clark|         New York|
|Michael Brown|      Los Angeles|
|Olivia Wilson|          Houston|
|Olivia Wilson|          Seattle|
+-------------+-----------------+
only showing top 20 rows



In [8]:
from pyspark.sql.functions import expr

# 3. Derive `NewPrice` with a SQL expression: every `Price` multiplied by 1.01
#    (the inflation adjustment)
df = df.withColumn(
    "NewPrice",
    expr("Price * 1.01"),
)
df.show()

+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------+
|Order ID|    Date|        Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|NewPrice|
+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------+
| ORD0001|14-03-25|  Running Shoes|       Footwear|   60|       3|        180|   Emma Clark|         New York|    Debit Card|Cancelled|   60.60|
| ORD0002|20-03-25|     Headphones|    Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|  101.00|
| ORD0003|15-02-25|  Running Shoes|       Footwear|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|   60.60|
| ORD0004|19-02-25|  Running Shoes|       Footwear|   60|       3|        180|Olivia Wilson|           Dallas|   Credit Card|  Pen

In [9]:
# 4. Derive `NewTotalSales`: the adjusted unit price times the ordered quantity
df = df.withColumn(
    "NewTotalSales",
    expr("NewPrice * Quantity"),
)
df.show()

+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------+-------------+
|Order ID|    Date|        Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|NewPrice|NewTotalSales|
+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------+-------------+
| ORD0001|14-03-25|  Running Shoes|       Footwear|   60|       3|        180|   Emma Clark|         New York|    Debit Card|Cancelled|   60.60|       181.80|
| ORD0002|20-03-25|     Headphones|    Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|  101.00|       404.00|
| ORD0003|15-02-25|  Running Shoes|       Footwear|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|   60.60|       121.20|
| ORD0004|19-02-25|  Running Shoes|       Foot

In [10]:
# 5. Replace the column name `Order ID` with `OrderNumber`
df = df.withColumnRenamed(existing="Order ID", new="OrderNumber")
df.show()

+-----------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------+-------------+
|OrderNumber|    Date|        Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|NewPrice|NewTotalSales|
+-----------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------+-------------+
|    ORD0001|14-03-25|  Running Shoes|       Footwear|   60|       3|        180|   Emma Clark|         New York|    Debit Card|Cancelled|   60.60|       181.80|
|    ORD0002|20-03-25|     Headphones|    Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|  101.00|       404.00|
|    ORD0003|15-02-25|  Running Shoes|       Footwear|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|   60.60|       121.20|
|    ORD0004|19-02-25|  Runn