# Resilient Distributed Dataset (RDD)

In [1]:
from pyspark.sql import SparkSession

# Entry point for the rest of the notebook: get the session registered
# under this application name, creating it if none exists yet.
spark = (
    SparkSession.builder
    .appName("ShobhitApp")
    .getOrCreate()
)

In [2]:
# Show the class of the object the builder returned.
type(spark)

pyspark.sql.session.SparkSession

#### Create RDD using sparkContext.parallelize()

In [3]:
# Small in-memory sample: a tuple of three 3-element tuples.
data = (
    (1, 2, 3),
    (4, 5, 6),
    (7, 8, 9),
)

# Build an RDD from the local Python collection.
rdd = spark.sparkContext.parallelize(data)

In [4]:
# Show the class of the object parallelize() returned.
type(rdd)

pyspark.rdd.RDD

#### Create RDD using sparkContext.textFile()
This method reads text into an RDD. It accepts a single file (compressed or uncompressed), a directory, or a glob pattern (a file pattern with wildcards).
Syntax: `sc.textFile(name, minPartitions=None, use_unicode=True)`

In [5]:
# NOTE: this rebinds `rdd`; the RDD built with parallelize() earlier is no
# longer reachable under this name.
rdd = spark.sparkContext.textFile("/user/itv000197/tab_data.txt")

In [6]:
# Show the class of the object textFile() returned.
type(rdd)

pyspark.rdd.RDD

#### Create RDD using sparkContext.wholeTextFiles()
This method allows you to read a directory containing multiple files. Each file becomes a single record: a (key, value) pair whose key is the full path of the file and whose value is the entire contents of the file.

In [7]:
# Read a whole directory; `rdd` is rebound once more, now to the
# wholeTextFiles() result.
rdd = spark.sparkContext.wholeTextFiles("/user/itv000197/inputfile/")

In [8]:
# Return every record as a Python list of (path, contents) pairs.
# Acceptable here because the directory holds only two small files.
rdd.collect()

[('hdfs://m01.itversity.com:9000/user/itv000197/inputfile/part-00000',
  'Salman Rushide,Grimus:Shame:Fury\nThomas Otway,Don Carlos:The Orphan\n'),
 ('hdfs://m01.itversity.com:9000/user/itv000197/inputfile/part-00001',
  'Ben Jonson,Volpone:Epicene\nJohn Milton,Arcades:Comus\n')]

#### Get Partition Details

In [9]:
# `rdd` is still the wholeTextFiles() RDD created earlier, so this is the
# partition count of that directory read.
rdd.getNumPartitions()

2

#### Actions:

| Syntax      | Description | Return Type    |
| :---        |    :----   |          ---: |
| .collect()  | Returns a list of all the elements of the RDD | List |
| .count()    | Returns the number of elements in the RDD | int |
| .first()    | Returns the first element of the RDD | A single element |
| .take(n)    | Returns the first n elements of the RDD | List |
| .saveAsTextFile() | Saves the RDD as text files in the given directory | None |

In [10]:
# Rebinds `rdd` again: the action examples that follow all run against
# this file.
rdd = spark.sparkContext.textFile("/user/itv000197/authors.csv")

In [11]:
# Every line of the file, returned as a list with one string per line.
rdd.collect()

['Salman Rushide,Grimus:Shame:Fury',
 'Thomas Otway,Don Carlos:The Orphan',
 'Ben Jonson,Volpone:Epicene',
 'John Milton,Arcades:Comus']

In [12]:
# take(n) returns a list, even when n is 1 (compare with first() below).
rdd.take(1)

['Salman Rushide,Grimus:Shame:Fury']

In [13]:
# first() returns the element itself rather than a one-element list.
rdd.first()

'Salman Rushide,Grimus:Shame:Fury'

In [14]:
# Number of elements in the RDD, i.e. the number of lines read from the file.
rdd.count()

4

In [15]:
# Writes the RDD out as a directory; the listing in the next cell shows a
# _SUCCESS marker plus part-NNNNN files.
# NOTE(review): this appears to raise if the target directory already
# exists, so re-running the notebook would need the old output removed
# first — confirm.
rdd.saveAsTextFile("/user/itv000197/output")

In [16]:
 %%sh
# Relative HDFS path: presumably resolved against the current user's HDFS
# home directory (/user/itv000197 here), i.e. the directory written by
# saveAsTextFile above — confirm when running as a different user.
hdfs dfs -ls output/

Found 3 items
-rw-r--r--   3 itv000197 supergroup          0 2022-02-26 07:32 output/_SUCCESS
-rw-r--r--   3 itv000197 supergroup         68 2022-02-26 07:32 output/part-00000
-rw-r--r--   3 itv000197 supergroup         53 2022-02-26 07:32 output/part-00001


#### Create Empty RDDs

In [17]:
# An RDD that contains no elements.
emptyRdd = spark.sparkContext.emptyRDD()

In [18]:
# Show the class of the object emptyRDD() returned.
type(emptyRdd)

pyspark.rdd.RDD

In [19]:
# An empty RDD has nothing to count.
emptyRdd.count()

0