In [1]:
val sparkSession=new SparkSession.Builder().appName("cachingSession").getOrCreate()

org.apache.spark.sql.SparkSession@2f1fc088

In [2]:
//generate sales 1 million sales data
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._
import scala.util.Random

val generateSalesData:()=>Unit=()=>{
    
    val numRecords = 1000000

    // Defining my product names 
    val products = List("Laptop", "Smartphone", "Headphones", "Tablet", "Smartwatch", "Camera", "Monitor", "Keyboard", "Mouse", "Charger")

    // Function to generate a random amount between 100 and 5000
    def generateAmount(): Double = {
      Random.nextDouble() * (5000 - 100) + 100
    }

    // Generate the sales data
    val salesData = (1 to numRecords).map { i =>
      val product = products(Random.nextInt(products.length))  // Random product from list
      val amount = generateAmount()  // Random amount between 100 and 5000
      Row(product, amount)
    }

    // Defining the schema for the DataFrame
    val schema = StructType(Array(
      StructField("product", StringType, nullable = false),
      StructField("amount", DoubleType, nullable = false)
    ))

    // Create DataFrame from the generated data
    val rdd = spark.sparkContext.parallelize(salesData)
    val salesDf = spark.createDataFrame(rdd, schema)

    // Show a few records to verify
    salesDf.show(10)

    // Set the GCS path to save the data
    val gcsPath = "gs://second_task/sales_data.csv"

    // Save the DataFrame to GCS as a CSV file
    salesDf.write
      .option("header", "true")
      .mode("overwrite")
      .csv(gcsPath)

    println(s"Sales data with $numRecords records saved to GCS at: $gcsPath")

}

generateSalesData = > Unit = $Lambda$3639/0x00000001016e3840@35e1f525


> Unit = $Lambda$3639/0x00000001016e3840@35e1f525

In [4]:
generateSalesData()

Waiting for a Spark session to start...

+----------+------------------+
|   product|            amount|
+----------+------------------+
|Smartwatch| 1372.398573126282|
|    Camera| 4374.367246266835|
|Smartphone|1860.9097116876646|
|Smartwatch|1417.0745114967203|
|    Camera| 2218.338553853016|
|  Keyboard| 333.2453527960917|
|   Monitor| 203.3578444663059|
|   Monitor|2019.4350518977963|
|    Tablet|1163.3974702137764|
|    Camera| 4101.909263286847|
+----------+------------------+
only showing top 10 rows

Sales data with 1000000 records saved to GCS at: gs://second_task/sales_data.csv


In [5]:
//performing the task
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import System.nanoTime

val salesData = sparkSession.read.option("header", "true").option("inferSchema", "true")
  .csv("gs://second_task/sales_data.csv")  // Replace with your actual dataset path

// Example transformation: filtering and aggregation
def processSalesData(df: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
  df.filter(col("amount") > 1000)  // Filter high-value sales
    .groupBy("product")
    .agg(sum("amount").alias("totalAmount"))
    .orderBy(desc("totalAmount"))
}

// Measure time without caching
val startTimeWithoutCaching = nanoTime()
val resultWithoutCaching = processSalesData(salesData)
resultWithoutCaching.show(10)  // Show the result (or use .collect() for triggering computation)
val endTimeWithoutCaching = nanoTime()

val durationWithoutCaching = (endTimeWithoutCaching - startTimeWithoutCaching) / 1e9
println(s"Execution time without caching: $durationWithoutCaching seconds")

// Cache the DataFrame to speed up the subsequent transformations
val cachedSalesData = salesData.cache()

// Measure time with caching
val startTimeWithCaching = nanoTime()
val resultWithCaching = processSalesData(cachedSalesData)
resultWithCaching.show(10)  // Show the result
val endTimeWithCaching = nanoTime()

val durationWithCaching = (endTimeWithCaching - startTimeWithCaching) / 1e9
println(s"Execution time with caching: $durationWithCaching seconds")


+----------+--------------------+
|   product|         totalAmount|
+----------+--------------------+
|     Mouse|2.4634770598579547E8|
|    Laptop| 2.459179666256859E8|
|  Keyboard| 2.454247193088084E8|
|Smartwatch|2.4539874304433495E8|
|Smartphone|2.4523561869283563E8|
|   Charger|2.4504842921766156E8|
|    Camera|2.4460501392812604E8|
|    Tablet|2.4420538858668232E8|
|Headphones|2.4384700520827448E8|
|   Monitor| 2.438245565751474E8|
+----------+--------------------+

Execution time without caching: 8.560165248 seconds
+----------+--------------------+
|   product|         totalAmount|
+----------+--------------------+
|     Mouse|2.4634770598579547E8|
|    Laptop| 2.459179666256859E8|
|  Keyboard| 2.454247193088084E8|
|Smartwatch|2.4539874304433495E8|
|Smartphone|2.4523561869283563E8|
|   Charger|2.4504842921766156E8|
|    Camera|2.4460501392812604E8|
|    Tablet|2.4420538858668232E8|
|Headphones|2.4384700520827448E8|
|   Monitor| 2.438245565751474E8|
+----------+-----------------

salesData = [product: string, amount: double]
startTimeWithoutCaching = 4657110144643
resultWithoutCaching = [product: string, totalAmount: double]
endTimeWithoutCaching = 4665670309891
durationWithoutCaching = 8.560165248
cachedSalesData = [product: string, amount: double]
startTimeWithCaching = 4665721311104
resultWithCaching = [product: string, totalAmount: double]
endTimeWithCaching = 4668749917519
durationWithCaching = 3.028606415


processSalesData: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


3.028606415

In [6]:
sparkSession.stop()