In [2]:
from pyspark import SparkContext, SparkConf

In [3]:
conf = SparkConf().setMaster('spark://spark-master:7077').setAppName('Actions')
sc = SparkContext(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/22 19:30:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Create An RDD

In [4]:

file = sc.textFile('/opt/bitnami/spark/data/u.data')


# For numerical lists we can use .parrallelize()¶

# ie. sc.parallelize([1,2,3,4,5])


### Apply actions to RDD

#### First 	
get first element

In [5]:
file.first()

                                                                                

'196\t242\t3\t881250949'

#### Take 	
get top n elements

In [6]:
file.take(3)

['196\t242\t3\t881250949', '186\t302\t3\t891717742', '22\t377\t1\t878887116']

#### Collect
Return all the elements of the dataset as an array at the driver program. 

This is usually useful after a filter or other operation that returns a sufficiently small subset of the data.

In [7]:
file.collect()

['196\t242\t3\t881250949',
 '186\t302\t3\t891717742',
 '22\t377\t1\t878887116',
 '244\t51\t2\t880606923',
 '166\t346\t1\t886397596',
 '298\t474\t4\t884182806',
 '115\t265\t2\t881171488',
 '253\t465\t5\t891628467',
 '305\t451\t3\t886324817',
 '6\t86\t3\t883603013',
 '62\t257\t2\t879372434',
 '286\t1014\t5\t879781125',
 '200\t222\t5\t876042340',
 '210\t40\t3\t891035994',
 '224\t29\t3\t888104457',
 '303\t785\t3\t879485318',
 '122\t387\t5\t879270459',
 '194\t274\t2\t879539794',
 '291\t1042\t4\t874834944',
 '234\t1184\t2\t892079237',
 '119\t392\t4\t886176814',
 '167\t486\t4\t892738452',
 '299\t144\t4\t877881320',
 '291\t118\t2\t874833878',
 '308\t1\t4\t887736532',
 '95\t546\t2\t879196566',
 '38\t95\t5\t892430094',
 '102\t768\t2\t883748450',
 '63\t277\t4\t875747401',
 '160\t234\t5\t876861185',
 '50\t246\t3\t877052329',
 '301\t98\t4\t882075827',
 '225\t193\t4\t879539727',
 '290\t88\t4\t880731963',
 '97\t194\t3\t884238860',
 '157\t274\t4\t886890835',
 '181\t1081\t1\t878962623',
 '278\t603\t5\t

#### Count
Return the number of elements in the dataset.

In [8]:
file.count()

100000

#### sum

In [9]:
ratings_int = file.map(lambda s: int(s.split()[2]))
ratings_int.sum()

352986

#### mean

In [10]:
ratings_int.mean()

                                                                                

3.5298600000000024

#### Reduce
Aggregate the elements of the dataset using a function func (which takes two arguments and returns one). 

The function should be commutative and associative so that it can be computed correctly in parallel. 

In [11]:
ratings = file.map(lambda s: s.split()[2])
ratings.reduce(lambda a,b: a+b)

                                                                                

'331214253325533352424442425245344434151444235555534245524554314345155443134243355353545344444525434435434545543554444334434552553333433355441422242522433434434434535232444432441543143244344545334433243444342325523444355254445341454535143222444514243445333443333552245432414534245441143341553451535442352133544533353455234554255533413445344233133345325441345325443434355343533342423442333343552344454353535321143233443553442523344432435524545534244144352353443345213242255244441544342434343433455345454335442434425334344232443223113144243434113553344423341434452433352433243233443524313444452433314545535223433141544115544543114255444435545544143444433352523435243432125455413445425454544534345445544442453443343452534455432553432324444352335534545331441234533335345543433555552144252414314443153551543344514424425114425445234553244354431434355534435554543444553531254143534312324314545435542544253553235532414331343531444415544352324225533344324453534543334445355441323554311445344331241343423343534

#### TakeSample
Return an array with a random sample of num elements of the dataset, with or without replacement, optionally pre-specifying a random number generator seed.

In [12]:
file.takeSample(withReplacement=True, num=10)

                                                                                

['390\t989\t5\t879693677',
 '643\t195\t5\t891447063',
 '15\t248\t1\t879455871',
 '263\t527\t5\t891299148',
 '393\t40\t1\t889729185',
 '392\t99\t5\t891038433',
 '715\t227\t3\t875964272',
 '875\t527\t4\t876465230',
 '7\t86\t4\t891350810',
 '354\t199\t4\t891217130']

#### TakeOrdered
Return the first n elements of the RDD using either their natural order or a custom comparator.

In [13]:
file.takeOrdered(5)

['1\t1\t5\t874965758',
 '1\t10\t3\t875693118',
 '1\t100\t5\t878543541',
 '1\t101\t2\t878542845',
 '1\t102\t2\t889751736']

#### saveAsTextFile
Write the elements of the dataset as a text file (or set of text files) in a given directory in the local filesystem, HDFS or any other Hadoop-supported file system. 

Spark will call toString on each element to convert it to a line of text in the file. 

In [15]:
ratings.saveAsTextFile('hello1.csv')

                                                                                

#### CountByKey
Only available on RDDs of type (K, V). Returns a hashmap of (K, Int) pairs with the count of each key. 

In [16]:
ratings_key = ratings.map(lambda x: (x,1))
ratings_key.countByKey()

                                                                                

defaultdict(int, {'3': 27145, '1': 6110, '2': 11370, '4': 34174, '5': 21201})

#### foreach(func) 
Run a function func on each element of the dataset. 

This is usually done for side effects such as updating an Accumulator or interacting with external storage systems.

Note: modifying variables other than Accumulators outside of the foreach() may result in undefined behavior. See Understanding closures for more details.