In [1]:
from pyspark import SparkConf, SparkContext

# COMMAND ----------

# Spark configuration for this notebook; only the application name is set.
conf = SparkConf()
conf.setAppName("Read File")


In [15]:
# Reuse an already-running SparkContext if there is one; otherwise start one
# using the `conf` object.
sc = SparkContext.getOrCreate(conf)

# Load the input file as an RDD of text lines.
text = sc.textFile("FirstNumberFile.txt")

Print by collecting - avoid this when the data set is large, because collect() brings every record back to the driver


In [16]:
# Print the collected lines with blank padding before and after so they stand
# out in the cell output. collect() returns the whole RDD as a Python list.
print('\n\n\n')
collected_lines = text.collect()
print(collected_lines)
print('\n\n\n')





['1 2 3 4 5', '6 7 8 9 10', '11 12 13 14 15', '16 17 18 19 20']






Split the data

In [17]:
# Turn every line into its own list of space-separated tokens (one list per line).
rdd2 = text.map(lambda line: line.split(' '))
print(rdd2.collect())

[['1', '2', '3', '4', '5'], ['6', '7', '8', '9', '10'], ['11', '12', '13', '14', '15'], ['16', '17', '18', '19', '20']]


Append text to each line


In [18]:
# Add a fixed suffix to every line.
rdd3 = text.map(lambda line: line + ' Test')

In [19]:
# Bring the suffixed lines back to the driver and show them.
suffixed_lines = rdd3.collect()
print(suffixed_lines)

['1 2 3 4 5 Test', '6 7 8 9 10 Test', '11 12 13 14 15 Test', '16 17 18 19 20 Test']


Next step - use flatMap to split every line and combine all the values into one flat list


In [20]:
# flatMap applies the same split as the map above, but the per-line token
# lists are flattened into a single RDD of tokens.
flatMap_values = text.flatMap(lambda line: line.split(' '))
print(flatMap_values.collect())

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']


First use of a named function (instead of a lambda)

In [21]:
def split(input):
    """Split one line of text on single spaces.

    Args:
        input: the line to split (a string).

    Returns:
        The list of pieces between spaces. Consecutive spaces produce empty
        strings, and an empty line produces [''].
    """
    return input.split(' ')

# Pass the named function to map instead of an inline lambda.
rddFunc = text.map(split)
split_lines = rddFunc.collect()
print(split_lines)

[['1', '2', '3', '4', '5'], ['6', '7', '8', '9', '10'], ['11', '12', '13', '14', '15'], ['16', '17', '18', '19', '20']]


Use a function to add a number to every value

In [22]:
def splitAndAdd(input, increment=3):
    """Parse the whitespace-separated integers in a line and add a constant to each.

    Args:
        input: one line of text containing integers separated by whitespace.
        increment: value added to every parsed integer. Defaults to 3.

    Returns:
        A list with one int per token, each increased by ``increment``.
        A blank line yields an empty list.

    Raises:
        ValueError: if a token is not a valid integer literal.
    """
    # split() with no separator collapses runs of whitespace and drops
    # leading/trailing whitespace, so blank lines and doubled or trailing
    # spaces no longer produce '' tokens that int() would reject.
    return [int(token) + increment for token in input.split()]

# Apply the parse-and-add function to every line and show the result.
rddFuncAdd = text.map(splitAndAdd)
incremented_lines = rddFuncAdd.collect()
print(incremented_lines)

[[4, 5, 6, 7, 8], [9, 10, 11, 12, 13], [14, 15, 16, 17, 18], [19, 20, 21, 22, 23]]


Calculate the length of each word in a text file and output the lengths

In [23]:
# Load StringInput.txt as an RDD of text lines.
stringInput = sc.textFile("StringInput.txt")

def countLength(input):
    """Return the length of every space-separated piece of a line.

    Args:
        input: one line of text (a string).

    Returns:
        A list of ints, one per piece produced by splitting on a single
        space. Empty pieces (from consecutive spaces or an empty line) count
        as length 0.
    """
    return list(map(len, input.split(' ')))

# Compute the per-word lengths for every line, then show them.
rddStringInput = stringInput.map(countLength)

word_lengths = rddStringInput.collect()
print(word_lengths)

[[2, 3, 3, 4], [4, 3, 3, 5, 5], [6]]


Do the same using a lambda

In [24]:
# Same per-word length calculation, written inline as a lambda.
rddStringLambda = stringInput.map(lambda line: list(map(len, line.split(' '))))
print(rddStringLambda.collect())

[[2, 3, 3, 4], [4, 3, 3, 5, 5], [6]]


Let's try to filter data

In [36]:
# Build the integers 0..999 on the driver and distribute them as an RDD.
num = list(range(1000))
rddFilter = sc.parallelize(num)

# Peek at the first 20 elements only, rather than collecting everything.
print(rddFilter.take(20))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [39]:
# Square every number.
numSq = rddFilter.map(lambda value: value * value)
print(numSq.take(20))

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361]


In [44]:
# Pair each square with the number of characters in its decimal string form.
numSqCount = numSq.map(lambda square: (square, len(str(square))))
print(numSqCount.take(20))

[(0, 1), (1, 1), (4, 1), (9, 1), (16, 2), (25, 2), (36, 2), (49, 2), (64, 2), (81, 2), (100, 3), (121, 3), (144, 3), (169, 3), (196, 3), (225, 3), (256, 3), (289, 3), (324, 3), (361, 3)]


In [53]:
# Swap each (square, digit_count) pair so the digit count comes first and can
# act as the key.
numSqCountFlip = numSqCount.map(lambda pair: (pair[1], pair[0]))
print(numSqCountFlip.take(20))

[(1, 0), (1, 1), (1, 4), (1, 9), (2, 16), (2, 25), (2, 36), (2, 49), (2, 64), (2, 81), (3, 100), (3, 121), (3, 144), (3, 169), (3, 196), (3, 225), (3, 256), (3, 289), (3, 324), (3, 361)]


1. Group by key
2. Map each group's values to a list
3. Keep only the keys that are not divisible by 2 (the odd keys)
4. Print the first 3 elements

In [63]:
# Group the squares by digit count, turn each group into a plain list, and
# keep only the groups whose key (the digit count) is odd.
numSqCountGroup = (
    numSqCountFlip
    .groupByKey()
    # mapValues changes only the value of each pair and leaves the key (and
    # therefore the partitioning) untouched.
    .mapValues(list)
    .filter(lambda kv: kv[0] % 2 == 1)
)
# NOTE(review): filtering on the key before groupByKey() would avoid shuffling
# the even-keyed records that are discarded here - consider reordering.
print(numSqCountGroup.take(3))


[(1, [0, 1, 4, 9]), (3, [100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961]), (5, [10000, 10201, 10404, 10609, 10816, 11025, 11236, 11449, 11664, 11881, 12100, 12321, 12544, 12769, 12996, 13225, 13456, 13689, 13924, 14161, 14400, 14641, 14884, 15129, 15376, 15625, 15876, 16129, 16384, 16641, 16900, 17161, 17424, 17689, 17956, 18225, 18496, 18769, 19044, 19321, 19600, 19881, 20164, 20449, 20736, 21025, 21316, 21609, 21904, 22201, 22500, 22801, 23104, 23409, 23716, 24025, 24336, 24649, 24964, 25281, 25600, 25921, 26244, 26569, 26896, 27225, 27556, 27889, 28224, 28561, 28900, 29241, 29584, 29929, 30276, 30625, 30976, 31329, 31684, 32041, 32400, 32761, 33124, 33489, 33856, 34225, 34596, 34969, 35344, 35721, 36100, 36481, 36864, 37249, 37636, 38025, 38416, 38809, 39204, 39601, 40000, 40401, 40804, 41209, 41616, 42025, 42436, 42849, 43264, 43681, 44100, 44521, 44944, 45369, 45796, 46225, 46656, 47089, 47524, 47961, 48400, 48841, 4928