<a href="https://colab.research.google.com/github/saktiworkstation/road-to-ai-developer/blob/main/05_pair_rdd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pair RDDs in PySpark

In [9]:
!pip install pyspark



In [10]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# The RDD API used throughout this notebook is reached via the SparkContext.
sc = spark.sparkContext

### ReduceByKey and collect

In [22]:

# Build a pair RDD from a list of (key, value) tuples.
Rdd = sc.parallelize([(1, 20), (3, 4), (3, 6), (4, 5)])

# Combine the values that share a key by adding them together.
Rdd_Reduced = Rdd.reduceByKey(lambda left, right: left + right)

# Collect the reduced pairs and print one line per key.
for key, summed in Rdd_Reduced.collect():
    print("Key {} has {} Counts".format(key, summed))

Key 4 has 5 Counts
Key 1 has 20 Counts
Key 3 has 10 Counts


In [12]:
# Collect the reduced pairs; as the last expression of the cell, the
# resulting list is displayed as the cell output.
Rdd_Reduced.collect()

[(4, 5), (1, 20), (3, 10)]

### SortByKey and Collect

In [13]:
# Sort the reduced pair RDD by key in ascending order.
Rdd_Reduced_Sort = Rdd_Reduced.sortByKey(ascending=True)

# Print the pairs in sorted key order.
for key, summed in Rdd_Reduced_Sort.collect():
    print("Key {} has {} Counts".format(key, summed))

Key 1 has 20 Counts
Key 3 has 10 Counts
Key 4 has 5 Counts


### Counting by keys

In [14]:
# Count how many elements the pair RDD holds for each key.
total = Rdd.countByKey()

print("The type of total is", type(total))

# Report the number of elements found for every key.
for key in total:
    print("key", key, "has", total[key], "counts")

The type of total is <class 'collections.defaultdict'>
key 1 has 1 counts
key 3 has 2 counts
key 4 has 1 counts


### Create a base RDD and transform it

In [15]:
# Path of the plain-text corpus that is loaded with sc.textFile below.
# NOTE(review): /content looks like the Colab runtime directory, so the file
# presumably has to be uploaded there before this notebook runs — confirm.
file_path = '/content/Complete_Shakespeare.txt'

In [16]:
# Read the text file into an RDD of lines.
baseRDD = sc.textFile(file_path)

# Split every line on whitespace and flatten the pieces into one RDD of words.
splitRDD = baseRDD.flatMap(lambda line: line.split())

# Count the words and report the total.
word_total = splitRDD.count()
print("Total number of words in splitRDD:", word_total)

Total number of words in splitRDD: 128576


### Remove stop words and reduce the dataset

In [18]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus so that it is available locally
# before it is read on the next statement.
nltk.download('stopwords')

# English stop words, as provided by the NLTK corpus reader.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
# Build a hashed lookup once: membership tests against the stop-word list
# would otherwise be a linear scan for every single word in the RDD.
stop_words_lookup = frozenset(stop_words)

# Drop stop words. Lower-casing is applied only for the comparison; the
# words that survive the filter keep their original casing.
splitRDD_no_stop = splitRDD.filter(lambda x: x.lower() not in stop_words_lookup)

# Pair every remaining word with an initial count of 1.
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda w: (w, 1))

# Add up the counts per word to obtain (word, frequency) pairs.
resultRDD = splitRDD_no_stop_words.reduceByKey(lambda x, y: x + y)

### Print word frequencies

In [20]:
# Preview ten (word, count) pairs from the result.
for pair in resultRDD.take(10):
    print(pair)

# Swap key and value so that the count becomes the key: (count, word).
resultRDD_swap = resultRDD.map(lambda pair: (pair[1], pair[0]))

# Sort by count, highest first.
resultRDD_swap_sort = resultRDD_swap.sortByKey(ascending=False)

('Project', 9)
('EBook', 1)
('Shakespeare', 12)
('use', 38)
('anyone', 1)
('anywhere', 1)
('restrictions', 1)
('whatsoever.', 1)
('may', 162)
('it,', 74)


In [21]:
# Print the first ten (count, word) pairs of the sorted RDD as "word,count".
for count, term in resultRDD_swap_sort.take(10):
    print("{},{}".format(term, count))

thou,650
thy,574
shall,393
would,311
good,295
thee,286
love,273
Enter,269
th',254
make,225
