## Install Spark and create SparkSession and SparkContext instances

In [1]:
# Install a headless OpenJDK 8 (quiet mode, output discarded), then download
# and unpack the Spark 3.5.0 / Hadoop 3 binary distribution into the current
# working directory.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

import os
# Point the environment at the JDK and at the extracted Spark directory.
# NOTE(review): SPARK_HOME assumes the tarball was extracted under /content
# (i.e. the notebook's working directory is /content) - confirm when running
# outside that environment.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"


!pip install -q findspark
import findspark

# Must run before the pyspark import below; presumably it uses SPARK_HOME to
# make the pyspark package importable - see the findspark documentation.
findspark.init()
from pyspark.sql import SparkSession

# Build the session (or reuse one that already exists) and keep a handle to
# its SparkContext, which the RDD cells below use as `sc`.
spark = SparkSession.builder \
    .appName("PySpark DataFrame Example") \
    .getOrCreate()
sc = spark.sparkContext
# Bare last expression: displays the SparkContext as the cell output.
sc

# RDD

In [17]:
import requests, os
# Base URL of the sample CSV files; each file name below is appended to it.
path = "https://raw.githubusercontent.com/pankajcloudthat/deloitte-pyspark/refs/heads/main/pyspark/dataset/rdd_create/"


# Create the whole local directory tree in one call. exist_ok=True keeps the
# cell re-runnable and also covers the case where 'pd_dataset' already exists
# but 'pd_dataset/rdd_create' does not.
os.makedirs("pd_dataset/rdd_create", exist_ok=True)


file_list = ['text01.csv','text02.csv','text03.csv','text04.csv']
for fname in file_list:
  # A timeout prevents the cell from hanging forever on a stalled connection.
  req = requests.get(path + fname, timeout=30)
  # Fail loudly on an HTTP error instead of saving the error page as a CSV.
  req.raise_for_status()
  csv_file_name = 'pd_dataset/rdd_create/' + fname
  # Write the raw response bytes so the file is stored exactly as served.
  with open(csv_file_name, 'wb') as cfile:
    cfile.write(req.content)

In [18]:
import numpy as np

In [19]:
# NumPy array of the integers 1 through 12 (the stop value 13 is excluded).
data = np.arange(1, 13)
# Distribute the array as an RDD, asking Spark for 3 partitions.
distData = sc.parallelize(data, 3)

In [20]:
# Bring every element of the RDD back to the driver as a single Python list.
distData.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [21]:
# glom() groups the elements of each partition into one list, so the collected
# result shows how the data was split across the 3 partitions.
distData.glom().collect()

[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]

In [22]:
# Read one downloaded CSV as an RDD of text lines (each element is one line).
# NOTE(review): the absolute /content/... path assumes the download cell ran
# with /content as the working directory - confirm outside that environment.
fileData = sc.textFile("/content/pd_dataset/rdd_create/text01.csv")

In [23]:
# Peek at the first line of the file, which is the CSV header row.
fileData.first()

'Name,email'

In [24]:
# Split every line on commas, turning each text line into a list of fields.
# The header line is still part of the data at this point.
rdd1 = fileData.map(lambda r : r.split(','))
rdd1.first()

['Name', 'email']

In [25]:
# Show all parsed rows, header included.
rdd1.collect()

[['Name', 'email'],
 ['Ravi Goyal', 'ravi.goyal@contoso.com'],
 ['Ankit Ved', 'ankitv@gmail.com']]

In [26]:
# Keep the raw header line; a later cell compares each line against it to
# drop the header rows.
header = fileData.first()
header

'Name,email'

In [27]:
# The wildcard loads every CSV in the directory into one RDD of lines.
# Each file contributes its own header line, so the header appears repeatedly.
filesData = sc.textFile("/content/pd_dataset/rdd_create/*.csv")
filesData.collect()

['Name,email',
 'Ravi Goyal,ravi.goyal@contoso.com',
 'Ankit Ved,ankitv@gmail.com',
 'Name,email',
 'Mohan Mahori,mohan34@gmail.com',
 'Atul Malhotra,atul.m@yahoo.com',
 'Ramesh Sharma,ramesh.s3@hotmail.com',
 'Name,email',
 'Mohit Panchal,mohit.p@gmail.com',
 'Name,email',
 'Rishab Sahu,rishabshau@gmail.com',
 'Yukti Badoniya,byukti@gmail.com',
 'Nikhil S,nikhils1@hotmail.com']

In [28]:
# Drop every line equal to the header captured earlier (one per source file),
# then split the remaining lines into [name, email] field lists.
rdd2 = filesData.filter(lambda r: r != header).map(lambda r: r.split(","))
rdd2.collect()

[['Ravi Goyal', 'ravi.goyal@contoso.com'],
 ['Ankit Ved', 'ankitv@gmail.com'],
 ['Mohan Mahori', 'mohan34@gmail.com'],
 ['Atul Malhotra', 'atul.m@yahoo.com'],
 ['Ramesh Sharma', 'ramesh.s3@hotmail.com'],
 ['Mohit Panchal', 'mohit.p@gmail.com'],
 ['Rishab Sahu', 'rishabshau@gmail.com'],
 ['Yukti Badoniya', 'byukti@gmail.com'],
 ['Nikhil S', 'nikhils1@hotmail.com']]

In [29]:
# Keep only the records whose email field (index 1) contains "@gmail.com".
gmail_rdd = rdd2.filter(lambda x: "@gmail.com" in x[1])

In [30]:
# Materialise the filtered records on the driver to inspect them.
gmail_rdd.collect()

[['Ankit Ved', 'ankitv@gmail.com'],
 ['Mohan Mahori', 'mohan34@gmail.com'],
 ['Mohit Panchal', 'mohit.p@gmail.com'],
 ['Rishab Sahu', 'rishabshau@gmail.com'],
 ['Yukti Badoniya', 'byukti@gmail.com']]

In [31]:
import os, shutil
import csv
import uuid

def save_rdd_as_csv(rdd, output_dir, header=('Name', 'Email')):
    """Write all records of ``rdd`` to one CSV file inside ``output_dir``.

    The records are collected to the driver and written by the local Python
    process, so ``output_dir`` is a path on the driver's filesystem and the
    data must fit in driver memory.

    Any existing ``output_dir`` is deleted and recreated, so the directory
    ends up containing only the newly written file.

    Args:
        rdd: Object exposing ``collect()`` that returns an iterable of rows,
            each row being an iterable of field values.
        output_dir: Directory that receives the CSV file. It is wiped first.
        header: Column names written as the first CSV row. Defaults to
            ``('Name', 'Email')``.

    Returns:
        None. A confirmation message is printed instead.
    """
    # Collect before touching the filesystem: if the Spark job fails, the
    # previous contents of output_dir are left intact instead of being deleted.
    records = rdd.collect()

    # Start from an empty output directory (existing contents are removed).
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # A random suffix gives the file a unique name on every call.
    filename = os.path.join(output_dir, f"partition_{uuid.uuid4()}.csv")
    # newline='' lets the csv module control line endings; an explicit UTF-8
    # encoding keeps non-ASCII values from depending on the platform locale.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)  # Header row
        writer.writerows(records)  # One CSV row per record

    print(f"Data saved to CSV files in {output_dir}")

In [32]:
# Write the Gmail-only records to a CSV file under this directory.
# save_rdd_as_csv deletes and recreates the directory on every call.
output_dir = "/content/data/gmail_rdd_output/"
save_rdd_as_csv(gmail_rdd, output_dir)

Data saved to CSV files in /content/data/gmail_rdd_output/


In [33]:
# Keep only the records whose email field (index 1) contains "@outlook.com".
# NOTE(review): none of the addresses shown in the filesData output above
# contain "@outlook.com", so this presumably produces an empty RDD and a
# header-only CSV - confirm whether "@hotmail.com" was intended.
ol_rdd = rdd2.filter(lambda x: "@outlook.com" in x[1])
# Reuses (overwrites) the output_dir variable set by the previous save cell.
output_dir = "/content/data/ol_rdd_output/"
save_rdd_as_csv(ol_rdd, output_dir)

Data saved to CSV files in /content/data/ol_rdd_output/
