In [1]:
// Session shared by every cell below; getOrCreate() hands back an already
// running session when there is one instead of building a second one.
val sparkSession: SparkSession =
  SparkSession.builder()
    .appName("broadCastingSession")
    .getOrCreate()

sparkSession = org.apache.spark.sql.SparkSession@48e4e607


org.apache.spark.sql.SparkSession@48e4e607

In [15]:

import org.apache.spark.sql.{SparkSession, Row}          // For Spark Session and Row
import scala.util.{Random}
import org.apache.spark.sql.types._
/**
 * Generates the synthetic user dataset and stores it on GCS.
 *
 * Builds 10,000 rows of (userId = 1..10000, Name = "User-<userId>"), prints the
 * first 10 rows, then writes the data as CSV with a header row to
 * `gs://second_task/user_data.csv`, overwriting any previous output.
 *
 * Uses the notebook-level `sparkSession`; the session is intentionally left
 * running because later cells reuse it.
 */
val generateUserData:()=>Unit=()=>{
    val numRecords = 10000

    // Rows are built on the driver and then parallelized; userId is simply the
    // running index, so no random source is needed here.
    val data = (1 to numRecords).map(i => Row(i, s"User-${i}"))

    val schema = StructType(Array(
      StructField("userId", IntegerType, nullable = false),
      StructField("Name", StringType, nullable = false)
    ))
    val df = sparkSession.createDataFrame(sparkSession.sparkContext.parallelize(data), schema)

    // Show a few records
    df.show(10)

    // Set GCS bucket path
    val gcsPath = "gs://second_task/user_data.csv"  // Change this to your GCS path

    // Save DataFrame to GCS as CSV (you can also save as Parquet, JSON, etc.)
    df.write
      .format("csv")
      .option("header", "true")
      .mode("overwrite")  // Options: "overwrite", "append"
      .save(gcsPath)

    println(s"Dataset saved to GCS path: $gcsPath")
}

generateUserData = > Unit = $Lambda$5007/0x0000000101dff040@35cca140


> Unit = $Lambda$5007/0x0000000101dff040@35cca140

In [16]:
// Run the generator: previews the first 10 user rows and writes the CSV to GCS.
generateUserData()

+------+-------+
|userId|   Name|
+------+-------+
|     1| User-1|
|     2| User-2|
|     3| User-3|
|     4| User-4|
|     5| User-5|
|     6| User-6|
|     7| User-7|
|     8| User-8|
|     9| User-9|
|    10|User-10|
+------+-------+
only showing top 10 rows

Dataset saved to GCS path: gs://second_task/user_data.csv


In [18]:
/**
 * Generates the synthetic transaction dataset and stores it on GCS.
 *
 * Produces 10,000,000 rows of (transactionId = 1..N, userId = random in
 * 1..10000, transaction = "transaction-<transactionId>") across 100 partitions,
 * prints the first 10 rows, then writes the data as CSV with a header row to
 * `gs://second_task/transaction_data.csv`, overwriting any previous output.
 *
 * Uses the notebook-level `sparkSession`.
 */
val generateUserTransactionData:()=>Unit=()=>{
    val numRecords = 10000000
    val numUsers = 10000       // userId is drawn from 1..numUsers
    val numPartitions = 100

    // Drawn once on the driver. Each partition derives its own generator from
    // this value, so a recomputation of the RDD (show() followed by write)
    // reproduces the same rows.
    val baseSeed = new Random().nextLong()

    val schema = StructType(Array(
      StructField("transactionId", IntegerType, nullable = false),
      StructField("userId", IntegerType, nullable = false),
      StructField("transaction", StringType, nullable = false)
    ))

    // Rows are generated on the executors rather than materialized on the driver.
    // A single Random created on the driver and captured by the closure would be
    // serialized into every task with identical state, making all partitions
    // replay the same userId sequence; hence one generator per partition.
    val dataRDD = sparkSession.sparkContext
      .parallelize(1 to numRecords, numSlices = numPartitions)
      .mapPartitionsWithIndex { (partitionIndex, ids) =>
        // Spread the per-partition seeds apart instead of using adjacent values.
        val random = new Random(baseSeed + partitionIndex * 1000003L)
        ids.map { i =>
          val userId = random.nextInt(numUsers) + 1  // Random userId between 1 and numUsers
          Row(i, userId, s"transaction-${i}")
        }
      }

    val df = sparkSession.createDataFrame(dataRDD, schema)

    // Show a few records
    df.show(10)

    // Set GCS bucket path
    val gcsPath = "gs://second_task/transaction_data.csv"  // Change this to your GCS path

    // Save DataFrame to GCS as CSV (you can also save as Parquet, JSON, etc.)
    df.write
      .format("csv")
      .option("header", "true")
      .mode("overwrite")  // Options: "overwrite", "append"
      .save(gcsPath)

    println(s"Dataset saved to GCS path: $gcsPath")
}

generateUserTransactionData = > Unit = $Lambda$5277/0x0000000100288840@6dae486b


> Unit = $Lambda$5277/0x0000000100288840@6dae486b

In [19]:
// Run the generator: previews the first 10 transaction rows and writes the CSV to GCS.
generateUserTransactionData()

+-------------+------+--------------+
|transactionId|userId|   transaction|
+-------------+------+--------------+
|            1|  5871| transaction-1|
|            2|  7933| transaction-2|
|            3|   719| transaction-3|
|            4|  6511| transaction-4|
|            5|  6172| transaction-5|
|            6|  8141| transaction-6|
|            7|  4128| transaction-7|
|            8|  3657| transaction-8|
|            9|  1008| transaction-9|
|           10|  5235|transaction-10|
+-------------+------+--------------+
only showing top 10 rows

Dataset saved to GCS path: gs://second_task/transaction_data.csv


In [20]:
// Location of the user dataset written by generateUserData.
val userDataCSVPath="gs://second_task/user_data.csv"
// The column types are known, so declare them instead of using inferSchema,
// which needs an extra pass over the file just to guess the types.
// Names are back-quoted where they could clash with SQL keywords.
val userDataDf=sparkSession.read
  .option("header","true")
  .schema("userId INT, `Name` STRING")
  .csv(userDataCSVPath)

userDataCSVPath = gs://second_task/user_data.csv
userDataDf = [userId: int, Name: string]


[userId: int, Name: string]

In [21]:
// Location of the transaction dataset written by generateUserTransactionData.
val transactionDataCSVPath="gs://second_task/transaction_data.csv"
// The column types are known, so declare them instead of using inferSchema,
// which needs an extra pass over the 10M-row file just to guess the types.
// Names are back-quoted where they could clash with SQL keywords.
val transactionDataDf=sparkSession.read
  .option("header","true")
  .schema("transactionId INT, userId INT, `transaction` STRING")
  .csv(transactionDataCSVPath)

transactionDataCSVPath = gs://second_task/transaction_data.csv
transactionDataDf = [transactionId: int, userId: int ... 1 more field]


[transactionId: int, userId: int ... 1 more field]

In [23]:
// Alternative to the explicit broadcast() hint below: raise the automatic
// broadcast-join threshold (here to 50MB) so Spark broadcasts small tables on its own.
//sparkSession.conf.set("spark.sql.autoBroadcastJoinThreshold", "50MB")
import org.apache.spark.sql.functions._  // Importing the broadcast function
// Hint Spark to broadcast the small user table rather than shuffle both sides.
val broadcastedUserDataDf = broadcast(userDataDf)

// Inner equi-join on the shared userId column; the key appears once in the result.
val joinedDf = transactionDataDf.join(broadcastedUserDataDf, Seq("userId"))

// Destination for the joined result; existing output at this path is replaced.
val gcsJoinPath = "gs://second_task/joined_transaction_data_broadcasted.csv"
joinedDf.write
  .option("header", "true")
  .mode("overwrite")
  .csv(gcsJoinPath)

gs://second_task/joined_transaction_data_broadcasted.csv

broadcastedUserDataDf = [userId: int, Name: string]
joinedDf = [userId: int, transactionId: int ... 2 more fields]
gcsJoinPath = gs://second_task/joined_transaction_data_broadcasted.csv


### Benefits of Broadcasting in This Case

- **Improved Performance:**
  - Broadcasting allows Spark to avoid shuffling the larger DataFrame (`transactionDataDf` in this case) during the join operation.
  - This reduces the amount of data exchanged between worker nodes.
  - The reduction in data transfer speeds up the join process significantly.

- **Cost-Effective:**
  - A broadcast DataFrame is collected once on the driver and shipped to each executor only once, so it must be small enough to fit in memory there.
  - This makes the process more efficient in terms of:
    - **Memory usage**: Each executor keeps a single in-memory copy of the small DataFrame, shared by all tasks running on it.
    - **Computation time**: Tasks look up the small DataFrame locally, avoiding repeated reads of it for every partition of the large DataFrame.
