# RDD Operations

## Transformations

### filter

In [2]:
# Build an RDD holding the integers 1 through 9.
a = sc.parallelize(range(1, 10))

In [3]:
# Keep only the even values.
b = a.filter(lambda n: n % 2 == 0)

In [4]:
# Materialize the filtered RDD as a Python list for display.
b.collect()

[2, 4, 6, 8]

### map

In [6]:
# Pair every word with its length: map computes the lengths,
# zip lines them back up with the original words.
a = sc.parallelize(["dog", "salmon", "salmon", "rat", "elephant"])
b = a.map(len)
c = a.zip(b)
c.collect()

[('dog', 3), ('salmon', 6), ('salmon', 6), ('rat', 3), ('elephant', 8)]

In [8]:
### distinct

In [10]:
# "Gnu" and "Rat" each appear twice in the input; distinct() drops the repeats.
# The second argument to parallelize requests 2 partitions.
c = sc.parallelize(["Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"], 2)
c.distinct().collect()

['Rat', 'Gnu', 'Dog', 'Cat']

In [11]:
### cartesian

In [12]:
# Form every (x, y) pair between the two five-element RDDs (5 * 5 = 25 pairs).
x = sc.parallelize([1, 2, 3, 4, 5])
y = sc.parallelize([6, 7, 8, 9, 10])
x.cartesian(y).collect()

[(1, 6),
 (1, 7),
 (2, 6),
 (2, 7),
 (1, 8),
 (1, 9),
 (2, 8),
 (2, 9),
 (1, 10),
 (2, 10),
 (3, 6),
 (3, 7),
 (4, 6),
 (4, 7),
 (5, 6),
 (5, 7),
 (3, 8),
 (3, 9),
 (4, 8),
 (4, 9),
 (3, 10),
 (4, 10),
 (5, 8),
 (5, 9),
 (5, 10)]

In [None]:
### coalesce
- Decreases the number of partitions in the RDD to `numPartitions`.
- Useful for running operations more efficiently after filtering down a large dataset.

In [16]:
# Start with the numbers 1..9 spread over 10 partitions, then shrink to 2
# partitions without a shuffle.
y = sc.parallelize(range(1, 10), 10)
z = y.coalesce(2, shuffle=False)
# glom() groups each partition's elements into a list; the value is computed
# here but not displayed because it is not the cell's last expression.
z.glom().collect()
z.getNumPartitions()

2

In [18]:
### flatMap

In [19]:
# Each element `upper` expands to 1 .. upper-1 and the pieces are flattened
# into one RDD, so the element 1 contributes nothing.
a = sc.parallelize(range(1, 10), 5)
a.flatMap(lambda upper: range(1, upper)).collect()

[1,
 1,
 2,
 1,
 2,
 3,
 1,
 2,
 3,
 4,
 1,
 2,
 3,
 4,
 5,
 1,
 2,
 3,
 4,
 5,
 6,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8]

In [21]:
# Emit every element three times; flatMap flattens the per-element lists.
b = (
    sc.parallelize([1, 2, 3], 2)
    .flatMap(lambda value: [value] * 3)
)
b.collect()

[1, 1, 1, 2, 2, 2, 3, 3, 3]

In [22]:
# Split each line on single spaces and flatten the pieces into one RDD of words.
lines = sc.parallelize(["hello world", "hi"])
words = lines.flatMap(lambda text: text.split(" "))
# first() returns just the leading element.
words.first()

'hello'

In [None]:
### groupBy

In [23]:
# Group the numbers by parity (remainder modulo 2).
rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
result = rdd.groupBy(lambda n: n % 2).collect()
# Sort the keys and each group's members so the displayed value is stable.
sorted((key, sorted(members)) for key, members in result)

[(0, [2, 8]), (1, [1, 1, 3, 5])]

In [24]:
### keys

In [25]:
# Turn each word into a (length, word) pair, then pull out only the keys
# (the lengths).
a = sc.parallelize(["dog", "tiger", "lion", "cat", "panther", "eagle"], 2)
b = a.map(lambda word: (len(word), word))
b.keys().collect()

[3, 5, 4, 3, 7, 5]

In [26]:
### union

In [27]:
# union() concatenates the two RDDs without removing duplicates:
# 5..9 occur in both inputs and therefore twice in the result.
seta = sc.parallelize(range(1, 10))
setb = sc.parallelize(range(5, 15))
seta.union(setb).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [None]:
### zip

In [28]:
# Pair the two RDDs element by element. Both hold 99 values and are created
# with 3 partitions each.
a = sc.parallelize(range(1, 100), 3)
b = sc.parallelize(range(101, 200), 3)
a.zip(b).collect()

[(1, 101),
 (2, 102),
 (3, 103),
 (4, 104),
 (5, 105),
 (6, 106),
 (7, 107),
 (8, 108),
 (9, 109),
 (10, 110),
 (11, 111),
 (12, 112),
 (13, 113),
 (14, 114),
 (15, 115),
 (16, 116),
 (17, 117),
 (18, 118),
 (19, 119),
 (20, 120),
 (21, 121),
 (22, 122),
 (23, 123),
 (24, 124),
 (25, 125),
 (26, 126),
 (27, 127),
 (28, 128),
 (29, 129),
 (30, 130),
 (31, 131),
 (32, 132),
 (33, 133),
 (34, 134),
 (35, 135),
 (36, 136),
 (37, 137),
 (38, 138),
 (39, 139),
 (40, 140),
 (41, 141),
 (42, 142),
 (43, 143),
 (44, 144),
 (45, 145),
 (46, 146),
 (47, 147),
 (48, 148),
 (49, 149),
 (50, 150),
 (51, 151),
 (52, 152),
 (53, 153),
 (54, 154),
 (55, 155),
 (56, 156),
 (57, 157),
 (58, 158),
 (59, 159),
 (60, 160),
 (61, 161),
 (62, 162),
 (63, 163),
 (64, 164),
 (65, 165),
 (66, 166),
 (67, 167),
 (68, 168),
 (69, 169),
 (70, 170),
 (71, 171),
 (72, 172),
 (73, 173),
 (74, 174),
 (75, 175),
 (76, 176),
 (77, 177),
 (78, 178),
 (79, 179),
 (80, 180),
 (81, 181),
 (82, 182),
 (83, 183),
 (84, 184),
 