# **KDDCup Data Analytics with PySpark RDD: A structured case study**

## YouTube channel: Code with Kristi
## Tutor: Dr Sachin Saxena (PhD, MTech, BTech)

### data source: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [0]:
# `sc` (the SparkContext) is created automatically in Databricks notebooks.
# Outside Databricks, uncomment the two lines below to create it yourself:
# from pyspark import SparkContext
# sc = SparkContext.getOrCreate()

# DBFS file path, addressed with the "dbfs:/" scheme
# (the "/dbfs" prefix is not needed inside Spark code).
file_path = "dbfs:/FileStore/kddcup_data.gz"

# Create an RDD from the text file using the SparkContext.
rdd = sc.textFile(file_path)

# Sanity check: fetch the first five records to the driver and print them.
print(rdd.take(5))  # each record is one comma-separated line ending in its label
print("RDD successfully loaded from DBFS!")

['0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal.', '0,tcp,http,SF,162,4528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.00,0.00,0.00,0.00,1.00,0.00,0.00,1,1,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,normal.', '0,tcp,http,SF,236,1228,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,2,2,1.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,normal.', '0,tcp,http,SF,233,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.00,0.00,0.00,0.00,1.00,0.00,0.00,3,3,1.00,0.00,0.33,0.00,0.00,0.00,0.00,0.00,normal.', '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,0.00,0.00,0.00,0.00,1.00,0.00,0.00,4,4,1.00,0.00,0.25,0.00,0.00,0.00,0.00,0.00,normal.']
RDD successfully loaded from DBFS!


# Repartition and Cache Data:

In [0]:
# How many partitions does the RDD have?
# For sc.textFile the partition count comes from how the input file is split,
# not directly from the number of CPU cores. A gzip-compressed file cannot be
# split, so the whole file is read into a single partition (see the output
# below), even though sc.defaultParallelism is larger on this cluster.
rdd.getNumPartitions()

1

In [0]:
# Confirm that sc.textFile returned an RDD object.
type(rdd)

pyspark.core.rdd.RDD

In [0]:
# Count the number of records held in each partition:
#   glom()    - turns every partition into a single list of its elements
#   map(len)  - replaces each of those lists with its length
#   collect() - returns the per-partition counts to the driver as a Python list
rdd.glom().map(len).collect()

[4898431]

In [0]:
# To inspect the full contents of the RDD.
# Kept commented out: collect() would bring all 4,898,431 records back to the driver.
# print(rdd.collect())

In [0]:
# Redistribute the data across 10 partitions to change the level of parallelism.
# repartition() can increase or decrease the partition count and internally
# uses a shuffle to redistribute the data. When only decreasing the number of
# partitions, consider coalesce() instead, which can avoid performing a shuffle.
rdd = rdd.repartition(10)


In [0]:
# Records per partition after repartitioning into 10 partitions.
rdd.glom().map(len).collect()

[489850,
 489850,
 489841,
 489840,
 489840,
 489840,
 489840,
 489840,
 489840,
 489850]

In [0]:
# Compare the context's default parallelism with the RDD's partition count.
# With the values printed below (2 and 10) that is roughly 5 partitions per
# core, assuming defaultParallelism reflects the number of cores.
print(sc.defaultParallelism)
print(rdd.getNumPartitions())

# Mark the RDD so its values are kept after the first time it is computed;
# later operations can then reuse it.
rdd.persist()

2
10


MapPartitionsRDD[12] at coalesce at NativeMethodAccessorImpl.java:0

# Custom dataset

In [0]:
# Small in-memory sample used to demonstrate sc.parallelize:
# a plain Python list of (name, integer) tuples.
data = [
    ("Siva", 30),
    ("Sachin", 25),
    ("Manish", 41),
    ("Lavya", 47),
    ("Varun", 72),
]



In [0]:
# Confirm `data` is a plain Python list before handing it to Spark.
type(data)

list

In [0]:
# Convert the Python list into an RDD. No partition count is passed, so Spark
# chooses its default. Note: this rebinds `rdd`, which until now referred to
# the KDD Cup RDD.
rdd = sc.parallelize(data)

In [0]:
# Bring every element back to the driver and print it
# (fine here: the RDD holds only five tuples).
print(rdd.collect())

[('Siva', 30), ('Sachin', 25), ('Manish', 41), ('Lavya', 47), ('Varun', 72)]


In [0]:
# Check the type of the object returned by sc.parallelize.
type(rdd)

In [0]:
# The result has one entry per partition; each entry is the number of
# elements stored in that partition.
# For example, [2, 3] means the five tuples sit in 2 partitions, e.g.:
#
# Partition 1: [('Siva', 30), ('Sachin', 25)]
# Partition 2: [('Manish', 41), ('Lavya', 47), ('Varun', 72)]
#
# (run rdd.glom().collect() to see the actual contents of each partition)
rdd.glom().map(len).collect()

[2, 3]

In [0]:
# Recreate the RDD, this time explicitly requesting 5 partitions through the
# second argument of parallelize().
rdd = sc.parallelize(data, 5)



In [0]:
# Check the number of partitions again, now on the re-created RDD,
# and keep the value in a variable.
num_partitions = rdd.getNumPartitions()

In [0]:
# Display the stored partition count.
num_partitions

5

In [0]:
# Elements per partition: five tuples spread over five partitions.
rdd.glom().map(len).collect()

[1, 1, 1, 1, 1]

In [0]:
# Default parallelism is a property of the SparkContext, not of this RDD.
print(sc.defaultParallelism)

2


In [0]:
# Partition count of the RDD created with sc.parallelize(data, 5).
print(rdd.getNumPartitions())

5


In [0]:
# Set this RDD's storage level so that its values are kept across operations
# after the first time it is computed. persist() can only assign a new
# storage level if the RDD does not have a storage level set yet.
rdd.persist()

ParallelCollectionRDD[8] at readRDDFromFile at PythonRDD.scala:289