In [0]:
# `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by the notebook runtime — confirm.
sc

## Parallelized Collections


In [0]:
import numpy as np

In [0]:
# Build the integers 1..12 on the driver, then distribute them as an RDD
# split across 3 partitions.
data = np.arange(1, 13)
distData = sc.parallelize(data, 3)

In [0]:
# Bring every element back to the driver as one flat list.
distData.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [0]:
# glom() groups the elements of each partition into its own list, so the
# result shows how the data was split across the 3 partitions.
distData.glom().collect()

[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]

## External Source

In [0]:
# Load the file as an RDD with one string element per line; nothing is
# parsed into columns at this point.
fileData = sc.textFile("/FileStore/rdd/basic/text01.csv")

In [0]:
# Peek at the first line; for this file it is the CSV header.
fileData.first()

'Name,email'

In [0]:
# Turn each raw line into a list of fields by splitting on commas.
# The header line is still present and is split like any other row.
rdd1 = fileData.map(lambda line: line.split(','))
rdd1.collect()

[['Name', 'email'],
 ['Ravi Goyal', 'ravi.goyal@contoso.com'],
 ['Ankit Ved', 'ankitv@gmail.com']]

### Read All CSV Files from a Directory

In [0]:
# Remember the header line of the single file so that repeated header lines
# can be filtered out once several files are read together.
header = fileData.first()
header

'Name,email'

In [0]:
# The glob pattern loads every .csv file in the directory into one RDD of
# lines. Each file contributes its own header line, as the output shows.
filesData = sc.textFile("/FileStore/rdd/basic/*.csv")
filesData.collect()

['Name,email',
 'Ravi Goyal,ravi.goyal@contoso.com',
 'Ankit Ved,ankitv@gmail.com',
 'Name,email',
 'Mohan Mahori,mohan34@gmail.com',
 'Atul Malhotra,atul.m@yahoo.com',
 'Ramesh Sharma,ramesh.s3@hotmail.com',
 'Name,email',
 'Mohit Panchal,mohit.p@gmail.com',
 'Name,email',
 'Rishab Sahu,rishabshau@gmail.com',
 'Yukti Badoniya,byukti@gmail.com',
 'Nikhil S,nikhils1@hotmail.com']

In [0]:
# Discard every line equal to the header (one per source file), then split
# the remaining lines on commas into field lists.
rdd2 = (
    filesData
    .filter(lambda line: line != header)
    .map(lambda line: line.split(","))
)
rdd2.collect()

[['Ravi Goyal', 'ravi.goyal@contoso.com'],
 ['Ankit Ved', 'ankitv@gmail.com'],
 ['Mohan Mahori', 'mohan34@gmail.com'],
 ['Atul Malhotra', 'atul.m@yahoo.com'],
 ['Ramesh Sharma', 'ramesh.s3@hotmail.com'],
 ['Mohit Panchal', 'mohit.p@gmail.com'],
 ['Rishab Sahu', 'rishabshau@gmail.com'],
 ['Yukti Badoniya', 'byukti@gmail.com'],
 ['Nikhil S', 'nikhils1@hotmail.com']]

### Filter and write the data

In [0]:
# Keep only rows whose email address (second field) is on the gmail.com domain.
# The address is matched at its end, case-insensitively and ignoring
# surrounding whitespace, because a plain substring test would also accept
# addresses such as "user@gmail.com.example.org".
gmail_rdd = rdd2.filter(lambda x: x[1].strip().lower().endswith("@gmail.com"))

In [0]:
# Inspect the rows that passed the Gmail filter.
gmail_rdd.collect()

[['Ankit Ved', 'ankitv@gmail.com'],
 ['Mohan Mahori', 'mohan34@gmail.com'],
 ['Mohit Panchal', 'mohit.p@gmail.com'],
 ['Rishab Sahu', 'rishabshau@gmail.com'],
 ['Yukti Badoniya', 'byukti@gmail.com']]

In [0]:
import os, shutil
import csv
import uuid

def save_rdd_as_csv(rdd, output_dir, header=('Name', 'Email')):
    """Write the contents of an RDD to a single CSV file in a fresh directory.

    The rows are collected on the driver and written with the standard
    ``csv`` module, so this is only suitable for data that fits in driver
    memory.

    Args:
        rdd: Object exposing ``collect()`` that returns an iterable of rows,
            each row being an iterable of field values.
        output_dir: Local directory to write into. If it already exists it
            is deleted together with everything in it, then recreated.
        header: Column names written as the first CSV row. Defaults to
            ``('Name', 'Email')``.

    Side effects:
        Creates ``partition_<uuid4>.csv`` inside ``output_dir`` and prints a
        confirmation message. Returns ``None``.
    """
    # Materialise the rows before touching the filesystem: if the job fails,
    # the previous output is still intact instead of already deleted.
    rows = rdd.collect()

    # Start from an empty directory so files left by earlier runs never mix
    # with this run's output.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    filename = os.path.join(output_dir, f"partition_{uuid.uuid4()}.csv")
    # Explicit UTF-8 rather than the platform default, because Spark's
    # textFile expects UTF-8 when the file is read back. newline='' lets the
    # csv module control line endings.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)  # Header row first
        writer.writerows(rows)   # Then one CSV line per collected row

    print(f"Data saved to CSV files in {output_dir}")


In [0]:
# Local filesystem path that save_rdd_as_csv passes to os/shutil/open.
# NOTE(review): the /dbfs prefix is presumably the driver's local mount of
# DBFS, i.e. the same location Spark sees as dbfs:/FileStore/... — confirm.
output_dir = "/dbfs/FileStore/rdd/basic/gmail_rdd_output/"
save_rdd_as_csv(gmail_rdd, output_dir)

Data saved to CSV files in /dbfs/FileStore/rdd/basic/gmail_rdd_output/


In [0]:
# Verify the write by reading the saved CSV back into an RDD.
# save_rdd_as_csv wrote into the local directory
# /dbfs/FileStore/rdd/basic/gmail_rdd_output/, which Spark addresses as
# dbfs:/FileStore/rdd/basic/gmail_rdd_output/. Reading the directory picks up
# the single partition_<uuid>.csv file inside it. (The previous path,
# gmail_rdd_output.csv, is only created by the final saveAsTextFile cell, so
# this cell failed on a fresh Run All.)
rdd3 = sc.textFile("dbfs:/FileStore/rdd/basic/gmail_rdd_output/")
rdd3.collect()

['Name,Email',
 'Ankit Ved,ankitv@gmail.com',
 'Mohan Mahori,mohan34@gmail.com',
 'Mohit Panchal,mohit.p@gmail.com',
 'Rishab Sahu,rishabshau@gmail.com',
 'Yukti Badoniya,byukti@gmail.com']

In [0]:
# Drop the header line, then draw 3 records without replacement.
# NOTE(review): despite the variable name, takeSample returns a *random*
# sample, not the last three records, and no seed is passed, so the result
# can differ between runs — confirm which behaviour is intended.
new_header = rdd3.first()
last_3_records = rdd3.filter(lambda r: r != new_header).takeSample(False, 3)
last_3_records

['Ankit Ved,ankitv@gmail.com',
 'Mohit Panchal,mohit.p@gmail.com',
 'Yukti Badoniya,byukti@gmail.com']

## Convert RDD into DataFrame

**Method 1**

In [0]:
# Recap of the parsed rows (header lines already removed) used for the conversion.
rdd2.collect()

[['Ravi Goyal', 'ravi.goyal@contoso.com'],
 ['Ankit Ved', 'ankitv@gmail.com'],
 ['Mohan Mahori', 'mohan34@gmail.com'],
 ['Atul Malhotra', 'atul.m@yahoo.com'],
 ['Ramesh Sharma', 'ramesh.s3@hotmail.com'],
 ['Mohit Panchal', 'mohit.p@gmail.com'],
 ['Rishab Sahu', 'rishabshau@gmail.com'],
 ['Yukti Badoniya', 'byukti@gmail.com'],
 ['Nikhil S', 'nikhils1@hotmail.com']]

In [0]:
# toDF() turns the RDD of [name, email] lists into a DataFrame with the given
# column names; show() prints it as a table (long values are truncated).
rdd2.toDF(['Name', 'Email']).show()

+--------------+--------------------+
|          Name|               Email|
+--------------+--------------------+
|    Ravi Goyal|ravi.goyal@contos...|
|     Ankit Ved|    ankitv@gmail.com|
|  Mohan Mahori|   mohan34@gmail.com|
| Atul Malhotra|    atul.m@yahoo.com|
| Ramesh Sharma|ramesh.s3@hotmail...|
| Mohit Panchal|   mohit.p@gmail.com|
|   Rishab Sahu|rishabshau@gmail.com|
|Yukti Badoniya|    byukti@gmail.com|
|      Nikhil S|nikhils1@hotmail.com|
+--------------+--------------------+



**Method 2**

In [0]:
# Drop the header line from rdd3 and split the remaining lines into field lists.
# NOTE(review): this rebinds `data`, which earlier held the NumPy array, and
# relies on `new_header` from the sampling cell above.
data = rdd3.filter(lambda r: r != new_header).map(lambda r: r.split(","))

In [0]:
# Alternative conversion: hand the RDD and the column names to createDataFrame.
# `spark` is not defined in this notebook; presumably the SparkSession
# provided by the runtime — confirm.
spark.createDataFrame(data, ['Name', 'Email']).show()

+--------------+--------------------+
|          Name|               Email|
+--------------+--------------------+
|     Ankit Ved|    ankitv@gmail.com|
|  Mohan Mahori|   mohan34@gmail.com|
| Mohit Panchal|   mohit.p@gmail.com|
|   Rishab Sahu|rishabshau@gmail.com|
|Yukti Badoniya|    byukti@gmail.com|
+--------------+--------------------+



### Write RDD to CSV file

In [0]:
# Write rdd3 (header line included) out as text.
# NOTE(review): the target is the same gmail_rdd_output.csv path that rdd3 is
# read from earlier in this notebook. saveAsTextFile creates a directory of
# part files and presumably refuses to overwrite an existing path, so this
# cell likely succeeds only on a first run — confirm and choose a distinct
# output path if so.
rdd3.saveAsTextFile('/FileStore/rdd/basic/gmail_rdd_output.csv')

### Clean up data

In [0]:
# import os, shutil

# output_dir = "/dbfs/FileStore/rdd/basic/gmail_rdd_output"

# if os.path.exists(output_dir):
#     shutil.rmtree(output_dir)
#     print("Deleted")
# else:
#     print("The directory does not exist")