In [1]:
# Import SparkSession
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a local Spark session named 'Pyspark_Primer'.
# 'local[1]' runs Spark in-process with a single worker thread.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('Pyspark_Primer')
    .getOrCreate()
)

In [4]:
# Materialise the integers 0 through 1,000,000 (inclusive) as a Python list.
nums = list(range(1_000_001))
len(nums)

1000001

In [11]:
# Distribute the local list as an RDD via the session's SparkContext.
nums_rdd = (
    spark
    .sparkContext
    .parallelize(nums)
)
nums_rdd

ParallelCollectionRDD[2] at readRDDFromFile at PythonRDD.scala:247

In [12]:
nums_rdd.take(5)

[0, 1, 2, 3, 4]

In [13]:
# Square each element of the RDD.
squared_nums_rdd = nums_rdd.map(lambda n: n * n)
squared_nums_rdd.take(5)

[0, 1, 4, 9, 16]

In [14]:
# Pair every square with the length of its decimal string, i.e. its digit count.
pairs = squared_nums_rdd.map(lambda sq: (sq, len(str(sq))))
pairs.take(25)

[(0, 1),
 (1, 1),
 (4, 1),
 (9, 1),
 (16, 2),
 (25, 2),
 (36, 2),
 (49, 2),
 (64, 2),
 (81, 2),
 (100, 3),
 (121, 3),
 (144, 3),
 (169, 3),
 (196, 3),
 (225, 3),
 (256, 3),
 (289, 3),
 (324, 3),
 (361, 3),
 (400, 3),
 (441, 3),
 (484, 3),
 (529, 3),
 (576, 3)]

In [17]:
# Keep only the (square, digit_count) pairs whose digit count is even.
even_digit_pairs = pairs.filter(lambda pair: pair[1] % 2 == 0)
even_digit_pairs.take(25)

[(16, 2),
 (25, 2),
 (36, 2),
 (49, 2),
 (64, 2),
 (81, 2),
 (1024, 4),
 (1089, 4),
 (1156, 4),
 (1225, 4),
 (1296, 4),
 (1369, 4),
 (1444, 4),
 (1521, 4),
 (1600, 4),
 (1681, 4),
 (1764, 4),
 (1849, 4),
 (1936, 4),
 (2025, 4),
 (2116, 4),
 (2209, 4),
 (2304, 4),
 (2401, 4),
 (2500, 4)]

In [19]:
# Swap each 2-tuple to (digit_count, square) so the digit count becomes the key.
flipped_pairs = even_digit_pairs.map(lambda pair: pair[::-1])
flipped_pairs.take(25)

[(2, 16),
 (2, 25),
 (2, 36),
 (2, 49),
 (2, 64),
 (2, 81),
 (4, 1024),
 (4, 1089),
 (4, 1156),
 (4, 1225),
 (4, 1296),
 (4, 1369),
 (4, 1444),
 (4, 1521),
 (4, 1600),
 (4, 1681),
 (4, 1764),
 (4, 1849),
 (4, 1936),
 (4, 2025),
 (4, 2116),
 (4, 2209),
 (4, 2304),
 (4, 2401),
 (4, 2500)]

In [25]:
# Collect all squares that share the same digit count under one key.
# Each value comes back as a lazy ResultIterable rather than a list.
grouped = (
    flipped_pairs
    .groupByKey()
)
grouped.take(25)

[(2, <pyspark.resultiterable.ResultIterable at 0x23bd812b7c0>),
 (4, <pyspark.resultiterable.ResultIterable at 0x23bd812bcd0>),
 (6, <pyspark.resultiterable.ResultIterable at 0x23bd812b520>),
 (8, <pyspark.resultiterable.ResultIterable at 0x23bd812b910>),
 (10, <pyspark.resultiterable.ResultIterable at 0x23bd8528ac0>),
 (12, <pyspark.resultiterable.ResultIterable at 0x23bd85281f0>)]

In [26]:
# Materialise each key's values as a plain list so they display readably.
# Built directly from flipped_pairs (rather than re-mapping `grouped` onto
# itself) so the cell does not depend on the prior state of `grouped` and
# yields the same RDD however many times it is run. mapValues transforms
# only the values, leaving keys and the partitioning from groupByKey intact.
grouped = flipped_pairs.groupByKey().mapValues(list)
grouped.take(2)

[(2, [16, 25, 36, 49, 64, 81]),
 (4,
  [1024,
   1089,
   1156,
   1225,
   1296,
   1369,
   1444,
   1521,
   1600,
   1681,
   1764,
   1849,
   1936,
   2025,
   2116,
   2209,
   2304,
   2401,
   2500,
   2601,
   2704,
   2809,
   2916,
   3025,
   3136,
   3249,
   3364,
   3481,
   3600,
   3721,
   3844,
   3969,
   4096,
   4225,
   4356,
   4489,
   4624,
   4761,
   4900,
   5041,
   5184,
   5329,
   5476,
   5625,
   5776,
   5929,
   6084,
   6241,
   6400,
   6561,
   6724,
   6889,
   7056,
   7225,
   7396,
   7569,
   7744,
   7921,
   8100,
   8281,
   8464,
   8649,
   8836,
   9025,
   9216,
   9409,
   9604,
   9801])]

In [30]:
# For every digit-count key, compute the arithmetic mean of its squares.
averaged = grouped.map(lambda kv: (kv[0], sum(kv[1]) / len(kv[1])))
averaged.collect()

[(2, 45.166666666666664),
 (4, 4675.5),
 (6, 471838.0),
 (8, 47204941.666666664),
 (10, 4720705565.0),
 (12, 472075391214.1667)]