# Ex-2080 - Text expressions


In [None]:
import requests

! curl -L -o data.zip  https://www.kaggle.com/api/v1/datasets/download/zahidmughal2343/amazon-sales-2025
! unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3617  100  3617    0     0   4849      0 --:--:-- --:--:-- --:--:-- 22748
Archive:  data.zip
replace amazon_sales_data 2025.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: amazon_sales_data 2025.csv  


In [None]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook: getOrCreate() hands back an already
# running session when there is one, otherwise it starts a new one with the
# "local[*]" master and the application name below.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("myApp")
    .getOrCreate()
)

In [None]:
# 2. Read the sales CSV: the first row is treated as the header and Spark
#    infers each column's data type from the values.
csv_reader = spark.read.options(header="true", inferSchema="true")
df = csv_reader.csv("amazon_sales_data 2025.csv")

In [None]:
# 3. Check the structure of the DataFrame: print every column with the type
#    Spark inferred for it and whether it is nullable.
df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Total Sales: integer (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Customer Location: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Status: string (nullable = true)



In [None]:
from pyspark.sql.functions import expr, upper, when, concat, substring

# 2. Build `product_symbol` from the first 2 characters of `Category` followed
#    by the first 5 characters of `Product`
#    (e.g. "Footwear" + "Running Shoes" -> "FoRunni").
category_prefix = df["Category"].substr(1, 2)
product_prefix = df["Product"].substr(1, 5)
df = df.withColumn("product_symbol", concat(category_prefix, product_prefix))
df.show()

+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------------+
|Order ID|    Date|        Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|product_symbol|
+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------------+
| ORD0001|14-03-25|  Running Shoes|       Footwear|   60|       3|        180|   Emma Clark|         New York|    Debit Card|Cancelled|       FoRunni|
| ORD0002|20-03-25|     Headphones|    Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|       ElHeadp|
| ORD0003|15-02-25|  Running Shoes|       Footwear|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|       FoRunni|
| ORD0004|19-02-25|  Running Shoes|       Footwear|   60|       3|        180|Olivia Wilson|  

In [None]:
# 3. Derive `PriceClass` from `Price`. Branches are evaluated top to bottom:
#    above 100 -> "expensive"; above 50 (up to and including 100) -> "standard";
#    anything that matches neither branch -> "cheap".
price = df["Price"]
price_class = (
    when(price > 100, "expensive")
    .when(price > 50, "standard")
    .otherwise("cheap")
)
df = df.withColumn("PriceClass", price_class)
df.show()

+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------------+----------+
|Order ID|    Date|        Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|product_symbol|PriceClass|
+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------------+----------+
| ORD0001|14-03-25|  Running Shoes|       Footwear|   60|       3|        180|   Emma Clark|         New York|    Debit Card|Cancelled|       FoRunni|  standard|
| ORD0002|20-03-25|     Headphones|    Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|       ElHeadp|  standard|
| ORD0003|15-02-25|  Running Shoes|       Footwear|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|       FoRunni|  standard|
| ORD0004|19-02-25|  Running

In [None]:
# 4. Overwrite `Category` with its upper-cased text (same column name, so the
#    column is replaced rather than added).
category_upper = upper(df["Category"])
df = df.withColumn("Category", category_upper)
df.show()

+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------------+----------+
|Order ID|    Date|        Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|product_symbol|PriceClass|
+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------------+----------+
| ORD0001|14-03-25|  Running Shoes|       FOOTWEAR|   60|       3|        180|   Emma Clark|         New York|    Debit Card|Cancelled|       FoRunni|  standard|
| ORD0002|20-03-25|     Headphones|    ELECTRONICS|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|       ElHeadp|  standard|
| ORD0003|15-02-25|  Running Shoes|       FOOTWEAR|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|       FoRunni|  standard|
| ORD0004|19-02-25|  Running

In [None]:
# 5. Collapse card payments into a single label: "Credit Card" and "Debit Card"
#    become "Bank"; every other payment method keeps its original value.
payment_method = df["Payment Method"]
paid_by_card = payment_method.isin("Credit Card", "Debit Card")
df = df.withColumn("Payment Method", when(paid_by_card, "Bank").otherwise(payment_method))
df.show()

+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------------+----------+
|Order ID|    Date|        Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|product_symbol|PriceClass|
+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+--------------+----------+
| ORD0001|14-03-25|  Running Shoes|       FOOTWEAR|   60|       3|        180|   Emma Clark|         New York|          Bank|Cancelled|       FoRunni|  standard|
| ORD0002|20-03-25|     Headphones|    ELECTRONICS|  100|       4|        400|Emily Johnson|    San Francisco|          Bank|  Pending|       ElHeadp|  standard|
| ORD0003|15-02-25|  Running Shoes|       FOOTWEAR|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|       FoRunni|  standard|
| ORD0004|19-02-25|  Running

In [None]:
# Keep only the columns that were added or rewritten in the steps above, for a
# compact view of the results.
modified_columns = ["product_symbol", "PriceClass", "Category", "Payment Method"]
df_modified = df.select(*modified_columns)
df_modified.show()

+--------------+----------+---------------+--------------+
|product_symbol|PriceClass|       Category|Payment Method|
+--------------+----------+---------------+--------------+
|       FoRunni|  standard|       FOOTWEAR|          Bank|
|       ElHeadp|  standard|    ELECTRONICS|          Bank|
|       FoRunni|  standard|       FOOTWEAR|    Amazon Pay|
|       FoRunni|  standard|       FOOTWEAR|          Bank|
|       ElSmart| expensive|    ELECTRONICS|          Bank|
|       ClT-Shi|     cheap|       CLOTHING|          Bank|
|       ElSmart| expensive|    ELECTRONICS|        PayPal|
|       ElSmart| expensive|    ELECTRONICS|        PayPal|
|       ClT-Shi|     cheap|       CLOTHING|        PayPal|
|       ElSmart| expensive|    ELECTRONICS|          Bank|
|        BoBook|     cheap|          BOOKS|    Amazon Pay|
|       ClJeans|     cheap|       CLOTHING|          Bank|
|       ElLapto| expensive|    ELECTRONICS|     Gift Card|
|       HoWashi| expensive|HOME APPLIANCES|          Ban