### First Code

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook, running locally on all cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Day1-Simple Hands-on")
    .getOrCreate()
)

# Raise driver log verbosity and print where the Spark web UI is served.
spark.sparkContext.setLogLevel("INFO")
print("Spark UI URL:", spark.sparkContext.uiWebUrl)


Spark UI URL: http://DESKTOP-KRET721:4040


In [118]:
# Location of the Online Retail CSV (raw string so the backslashes stay literal).
csv_path = r"E:\pyspark-training\data\small\online_retail.csv"

# First line is the header; column types are inferred from the data.
df = spark.read.csv(csv_path, header=True, inferSchema=True)

# Quick sanity check: size, a small sample, and the inferred schema.
print("Row count:", df.count())
df.show(5)
df.printSchema()


Row count: 541909
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showi

In [None]:
# Names used by this cell: column references, CASE-like expressions, string length, integer cast.
from pyspark.sql.functions import col, when, length
from pyspark.sql.types import IntegerType

# To know the count.
# Only the last expression of a cell is echoed, so a bare `df.count()` in the
# middle of the cell is evaluated and thrown away; print it explicitly.
print("Row count:", df.count())

# To see the first 10 rows
df.show(10)

# Print specific columns
df.select("InvoiceNo", "StockCode").show()

# Rename columns one call at a time
(
    df.select("InvoiceNo", "StockCode", "Quantity")
    .withColumnRenamed("InvoiceNo", "InvoiceNumber")
    .withColumnRenamed("Quantity", "Count")
    .show(5)
)

# Rename columns from an {old_name: new_name} mapping
df.select("InvoiceNo", "StockCode", "Quantity").withColumnsRenamed({"InvoiceNo": "InvoiceNumber"}).show(3)

# CASE-like bucketing, string length and cast in one go
(
    df.select("InvoiceNo", "StockCode", "Quantity", "Description", "CustomerID")
    .withColumn(
        "Number",
        # Branches are tested top to bottom. A Quantity of exactly 5 matches none
        # of them and therefore falls through to "NA".
        when((col("Quantity") > 5) & (col("Quantity") <= 7), 'Large')
        .when(col("Quantity") > 7, "Very Large")
        .when(col("Quantity") < 5, "Less")
        .otherwise("NA"),
    )
    .withColumn("LengthDescription", length(col("Description")))
    # Replace CustomerID in place with its integer-cast version.
    .withColumn("CustomerID", col("CustomerID").cast(IntegerType()))
    .withColumnsRenamed({"InvoiceNo": "InvoiceNumber"})
    .show(15)
)





In [None]:
# AND condition: both predicates must hold. Column predicates are combined with
# `&` (AND) or `|` (OR), and each comparison needs its own parentheses.
df.filter((col("InvoiceNo") == "536365") & (col("Quantity") == 6)).show()


from pyspark.sql.functions import lit

# Project a few columns (UnitPrice is aliased), add a constant column, then keep
# a single invoice. The chain is wrapped in parentheses instead of using
# backslash continuations, so no dangling `\` can glue it to a following line.
(
    df.select(
        col("InvoiceNo"),
        col("StockCode"),
        col("Description"),
        col("UnitPrice").alias("CHAHA"),
    )
    .withColumn("MyNewColumn", lit("KAKA"))
    .filter(col("InvoiceNo") == "536365")
    .show(20)
)



In [None]:
from pyspark.sql.functions import sum, avg, min, max

# -- Group BY / Aggregation
# One result row per (Country, CustomerID) pair. `sum`, `avg`, `min` and `max`
# here are the pyspark.sql.functions versions imported in this cell, not the
# Python builtins.
(
    df.groupBy("Country", "CustomerID")
    .agg(
        sum(col("Quantity")).alias("Total_Quantity"),
        avg(col("UnitPrice")).alias("Average_UnitPrice"),
        min(col("UnitPrice")).alias("Min_UnitPrice"),
        max(col("UnitPrice")).alias("Max_UnitPrice"),
    )
    .show()
)


In [None]:


# -- Distinct (Country, CustomerID) pairs, followed by the distinct country count
df.select(col("Country"), col("CustomerID")).distinct().show()

# Only the last expression of a cell is echoed, so a bare `.count()` in the
# middle of the cell would be silently discarded. Compute it once, print it,
# and reuse the value when building the summary DataFrame below.
distinct_country_count = df.select(col("Country")).distinct().count()
print("Distinct countries:", distinct_country_count)


# -- Creating data frame from count and Union / UnionALL
# Plain ints get their own names; the `*_dataframe` names are only ever bound
# to DataFrames.
distinct_country_cust_count = df.select(col("Country"), col("CustomerID")).distinct().count()
distinct_country_cust_count_dataframe = spark.createDataFrame(
    [('Dual', distinct_country_cust_count)], ['Metric', 'Count']
)

distinct_country_count_dataframe = spark.createDataFrame(
    [('Country', distinct_country_count)], ['Metric', 'Count']
)

# Stack the two single-row frames into one two-row summary.
final_df = distinct_country_cust_count_dataframe.unionAll(distinct_country_count_dataframe)
final_df.show()






In [None]:
# Hands on Task 1: Count total rows, distinct invoices, distinct customers
# Each figure is printed explicitly: a bare expression that is not the last line
# of the cell is not echoed, and `.distinct().show()` lists only the first 20
# values instead of counting them.

# Count total rows
print("Total rows:", df.count())

# Distinct invoices
print("Distinct invoices:", df.select(col("InvoiceNo")).distinct().count())

# Distinct customers; rows with a missing CustomerID are dropped first so that
# null is not counted as one more customer.
print("Distinct customers:", df.select(col("CustomerID")).dropna().distinct().count())

In [None]:
# Task 2 : Filter high-value orders (TotalPrice > 100000)
from pyspark.sql.functions import desc

# Revenue is aggregated per customer, then only customers above the threshold
# are kept. The threshold is a numeric literal so that the comparison does not
# depend on an implicit string-to-number cast.
(
    df.groupBy(col("CustomerID"))
    .agg(sum(col("UnitPrice") * col("Quantity")).alias("TotalPrice"))
    .filter((col("TotalPrice") > 100000) & (col("CustomerID").isNotNull()))
    .sort(desc("TotalPrice"))
    .show()
)


In [None]:
# Task 3 : Group the data by Country and compute total quantity sold and average unit price per country.
from pyspark.sql.functions import avg, col, sum

# One row per country: summed Quantity and mean UnitPrice.
(
    df.groupBy("Country")
    .agg(
        sum("Quantity").alias("CountryWiseQuantityCount"),
        avg("UnitPrice").alias("CountryWiseAvgUnitPrice"),
    )
    .show()
)

In [None]:
# Task 4: Add a new column TotalPrice = UnitPrice * Quantity to the DataFrame.
# All existing columns are kept; TotalPrice is appended as the last column.
(
    df
    .withColumn("TotalPrice", col("UnitPrice") * col("Quantity"))
    .show(5)
)

In [None]:
# Task 5 : Create a new column HighValue using a when condition:
#          HighValue = "YES" if TotalPrice > 20, otherwise HighValue = "NO"

# TotalPrice is computed inline as UnitPrice * Quantity. The threshold is the
# number 20 (not the string "20") so the comparison is numeric without relying
# on an implicit cast.
(
    df.withColumn(
        "HighValue",
        when((col("UnitPrice") * col("Quantity")) > 20, "YES").otherwise("NO"),
    )
    .show(5)
)

In [None]:
# Task 6 : Sort the DataFrame by Quantity in descending order and show top 10 rows
quantity_descending = col("Quantity").desc()
df.orderBy(quantity_descending).show(10)

In [None]:
# Task 7 : Convert InvoiceDate to timestamp and extract day of month, month and year
from pyspark.sql.functions import to_timestamp, dayofmonth, month, year

# InvoiceDateTS holds the converted timestamp; the three date parts are derived
# from that new column.
(
    df
    .withColumn("InvoiceDateTS", to_timestamp(col("InvoiceDate")))
    .withColumn("DAY_OF_MONTH", dayofmonth(col("InvoiceDateTS")))
    .withColumn("MONTH", month(col("InvoiceDateTS")))
    .withColumn("YEAR", year(col("InvoiceDateTS")))
    .show()
)


In [122]:
## Task 8: Write a single PySpark query that does the following:

# 1. Add a column TotalPrice = UnitPrice * Quantity
# 2. Add a column HighValue = "YES" if TotalPrice > 20, else "NO"
# 3. Convert InvoiceDate to timestamp and extract Year, Month, Day
# 5. Filter only rows where Quantity > 5
# 6. Group by Country and HighValue flag:
# 7. Compute sum(TotalPrice) as TotalRevenue
# 8. Compute avg(UnitPrice) as AvgUnitPrice
# 9. Sort the final result by TotalRevenue descending

# Note: this `round` is the Spark column function and shadows Python's builtin.
from pyspark.sql.functions import round

final_df = (
    df
    .withColumn("TotalPrice", col("UnitPrice") * col("Quantity"))
    .withColumn("HighValue", when(col("TotalPrice") > 20, "YES").otherwise("NO"))
    .withColumn("DAY", dayofmonth(to_timestamp(col("InvoiceDate"))))
    .withColumn("MONTH", month(to_timestamp(col("InvoiceDate"))))
    .withColumn("YEAR", year(to_timestamp(col("InvoiceDate"))))
    .where(col("Quantity") > 5)
    .groupBy("Country", "HighValue")
    .agg(
        # Each row's TotalPrice is rounded to 2 decimals before being summed.
        sum(round(col("TotalPrice"), 2)).alias("TotalRevenue"),
        avg(col("UnitPrice")).alias("AvgUnitPrice"),
    )
    .orderBy(col("TotalRevenue").desc())
)

# Show the result rows.
final_df.show()

# Print the query plans in extended form.
final_df.explain(True)

+--------------+---------+------------------+------------------+
|       Country|HighValue|      TotalRevenue|      AvgUnitPrice|
+--------------+---------+------------------+------------------+
|United Kingdom|      YES| 5086314.069999987|2.9566691711599886|
|United Kingdom|       NO|1381472.0400000298|1.2539407911132832|
|   Netherlands|      YES| 278939.5899999998|2.1741276595744683|
|          EIRE|      YES|186201.80999999994| 3.172618006993011|
|     Australia|      YES|132982.75999999998| 2.575443349753696|
|       Germany|      YES|118887.29000000027|3.2335112847222267|
|        France|      YES|105018.14000000001|2.8237074057246745|
|       Germany|       NO| 64332.64000000005|1.3796503496503552|
|        France|       NO| 55515.54000000013|1.3886212745335613|
|          EIRE|       NO| 47580.98000000002|1.3661680092059876|
|         Japan|      YES|36984.840000000004|2.2230705394190875|
|   Switzerland|      YES|35760.659999999996|3.1687823439878238|
|        Sweden|      YES

In [2]:
# Stop Spark Session
# NOTE(review): in file order this cell comes before cells that still use
# `spark` and `final_df`; run it last, otherwise a top-to-bottom run reaches
# those cells after the session has been stopped.
spark.stop()


In [121]:
# List every (key, value) pair of the running SparkContext's configuration.
spark.sparkContext.getConf().getAll()


[('spark.app.submitTime', '1766661581283'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.driver.port', '49650'),
 ('spark.executor.id', 'dr

In [120]:
# Default parallelism reported by the SparkContext for this session.
spark.sparkContext.defaultParallelism


4

In [119]:
# Number of partitions of the RDD underlying `final_df`.
final_df.rdd.getNumPartitions()


1