<a href="https://colab.research.google.com/github/kareemullah123456789/bigdatafoundation-july8-new/blob/main/RDD_Sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install pyspark
!pip install pyspark --quiet

In [None]:
#jdk
import os
os.environ["JAVA_HOME"] = '/lib/jvm/java-11-openjdk-amd64'

In [1]:
# Install PySpark and required dependencies
!pip install pyspark findspark

# Import necessary libraries
import findspark
findspark.init()

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [2]:
from pyspark.sql import SparkSession

In [12]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().set('spark.ui.port','4050').setAppName('films').setMaster('local[2]')
sc = SparkContext.getOrCreate(conf=conf)

In [13]:
movie_list = ['RRR','Pushpa','Linga','Brahmayugam','love AajKal']
type(movie_list)

list

In [14]:
movie_rdd = sc.parallelize(movie_list)
type(movie_rdd)

In [15]:
movie_rdd

ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:289

In [None]:
movie_rdd.collect()

['RRR', 'Pushpa', 'Linga', 'Brahmayugam', 'love AajKal']

In [None]:
type(movie_rdd.collect())

list

In [None]:
movie_rdd.glom().collect()   #glom() transforms each partition of your RDD into a list.
#It groups the elements within each partition into sublists, so you can inspect how data is distributed.

[['RRR', 'Pushpa'], ['Linga', 'Brahmayugam', 'love AajKal']]

In [None]:
conf = SparkConf().set('spark.ui.port','4051').setAppName('test1').setMaster('local[8]')
sc = SparkContext.getOrCreate(conf=conf)
#rdd1 = sc.parallelize([2,3,4,5,6,7,11])
rdd1 = sc.parallelize([2,3,4,5,6,7,11],8)
rdd1.glom().collect()

[[], [2], [3], [4], [5], [6], [7], [11]]

In [None]:
rdd1.stats()

(count: 7, mean: 5.428571428571429, stdev: 2.770102775666474, max: 11.0, min: 2.0)

In [None]:
rdd1.getNumPartitions()

8

In [None]:
rdd1.coalesce(2).glom().collect()    #coalesce(n) is used to reduce the number of partitions in an RDD or DataFrame.
#No shuffling involved
#Considered a narrow transformation. Each input partition contributes to only one output partition. Quick, efficient

[[2, 3, 4], [5, 6, 7, 11]]

In [None]:
#Repartition is used when partitions need to be increased or decreased. Here shuffling is involved.
#Wide transformation. Each input partition contributes to multiple output partitions.
rdd1.repartition(12).glom().collect()

[[4], [], [5, 6], [7], [2], [3, 11], [], [], [], [], [], []]

In [None]:
movie_rdd.take(2)

['RRR', 'Pushpa']

In [None]:
movie_list[0].title()

'Rrr'

In [None]:
movie_rdd.collect()[0].title()

'Rrr'

In [None]:
movie_rdd.collect()[-3:]

['Linga', 'Brahmayugam', 'love AajKal']

In [None]:
transform = lambda i:i.title()
movie_title_rdd = movie_rdd.map(transform)
movie_title_rdd.collect()

['Rrr', 'Pushpa', 'Linga', 'Brahmayugam', 'Love Aajkal']

In [None]:
transform1 = lambda i: i[0]=='l'
movie_rdd.filter(transform1).collect()

['love AajKal']

In [None]:
# Actions
# Count the number of elements in the RDD
num_movies = movie_rdd.count()
print(f"Number of movies: {num_movies}")
#print('Number of movies:', num_movies)

# Find the first element in the RDD
first_movie = movie_rdd.first()
print(f"First movie: {first_movie}")

# Reduce the RDD to a single value (e.g., concatenate all movie titles)
concatenated_titles = movie_rdd.reduce(lambda x, y: x + ", " + y)
print(f"Concatenated movie titles: {concatenated_titles}")


# Transformations
# Map each movie title to its length
movie_lengths = movie_rdd.map(lambda x: len(x))
print("Movie lengths:", movie_lengths.collect())


# Filter movies with titles longer than 5 characters
long_movie_titles = movie_rdd.filter(lambda x: len(x) > 5)
print("Long movie titles:", long_movie_titles.collect())

# FlatMap: Explode a list into individual elements
movie_chars_rdd = movie_rdd.flatMap(lambda x : list(x))
print("Characters:",movie_chars_rdd.collect())

# Distinct elements
print("Distinct movie characters:",movie_chars_rdd.distinct().collect())

# GroupByKey: Group the elements based on a key
# We need key-value pairs for this transformation.
# Let's create a new RDD
movie_with_length = movie_rdd.map(lambda x: (x, len(x)))
grouped_movies = movie_with_length.groupByKey().mapValues(list).collect()
print("Movies grouped by length:", grouped_movies)

# SortByKey
sorted_movies = movie_with_length.sortByKey().collect()
print("Movies sorted by Title:", sorted_movies)

Number of movies: 5
First movie: RRR
Concatenated movie titles: RRR, Pushpa, Linga, Brahmayugam, love AajKal
Movie lengths: [3, 6, 5, 11, 11]
Long movie titles: ['Pushpa', 'Brahmayugam', 'love AajKal']
Characters: ['R', 'R', 'R', 'P', 'u', 's', 'h', 'p', 'a', 'L', 'i', 'n', 'g', 'a', 'B', 'r', 'a', 'h', 'm', 'a', 'y', 'u', 'g', 'a', 'm', 'l', 'o', 'v', 'e', ' ', 'A', 'a', 'j', 'K', 'a', 'l']
Distinct movie characters: ['R', 'p', 'L', 'g', 'l', 'o', ' ', 'A', 'j', 'K', 'P', 'u', 's', 'h', 'a', 'i', 'n', 'B', 'r', 'm', 'y', 'v', 'e']
Movies grouped by length: [('Pushpa', [6]), ('Linga', [5]), ('RRR', [3]), ('Brahmayugam', [11]), ('love AajKal', [11])]
Movies sorted by Title: [('Brahmayugam', 11), ('Linga', 5), ('Pushpa', 6), ('RRR', 3), ('love AajKal', 11)]


In [None]:
rdd1 = sc.parallelize([2,3,4,5,6,7,11])

In [None]:
rdd2 = rdd1.map(lambda x: x * 2)
rdd2.collect()

[4, 6, 8, 10, 12, 14, 22]

In [None]:
rdd2 = rdd1.filter(lambda x: x > 10)
rdd2.collect()

[11]

In [None]:
rdd3 = sc.parallelize([[2,3,4],[5,6,7],[11,12,13]])
rdd2 = rdd3.map(lambda iter: [x * 2 for x in iter])
rdd2.collect()

[[4, 6, 8], [10, 12, 14], [22, 24, 26]]

In [None]:
rdd3 = sc.parallelize([[2,3,4],[5,6,7],[11,12,13]])
rdd2 = rdd3.mapPartitions(lambda iter: [x * 2 for x in iter])
rdd2.collect()


[[2, 3, 4, 2, 3, 4], [5, 6, 7, 5, 6, 7], [11, 12, 13, 11, 12, 13]]

In [None]:
rdd2 = rdd1.union(rdd3)
rdd2.collect()

[2, 3, 4, 5, 6, 7, 11, [2, 3, 4], [5, 6, 7], [11, 12, 13]]

In [None]:
rdd3 = sc.parallelize([7,11,12,13])
rdd2 = rdd1.intersection(rdd3)
rdd2.collect()

[7, 11]

In [None]:
rdd3 = sc.parallelize([7,7,11,12,13])
rdd2 = rdd3.distinct()
rdd2.collect()

[12, 7, 11, 13]

In [None]:
rdd2 = rdd1.groupBy(lambda x: x % 2)
rdd2.collect()

[(0, <pyspark.resultiterable.ResultIterable at 0x7c45a03fcb90>),
 (1, <pyspark.resultiterable.ResultIterable at 0x7c45a15d4510>)]

In [None]:
rdd1.groupBy(lambda x: x % 2).mapValues(list).collect()

[(0, [2, 4, 6]), (1, [3, 5, 7, 11])]

In [None]:
rdd1.collect()

[2, 3, 4, 5, 6, 7, 11]

In [None]:
result = rdd1.count()
result

7

In [None]:
result = rdd1.top(3)
result

[11, 7, 6]