In [0]:
%python
#https://sparkbyexamples.com/apache-spark-rdd/spark-rdd-transformations/
#https://sparkbyexamples.com/pyspark/pyspark-rdd-transformations/

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Rdd_trans').getOrCreate()
rdd = spark.sparkContext.textFile('dbfs:/FileStore/shared_uploads/AKSHATAKHEDEKAR01032001@rjcollege.edu.in/text-3.txt')

In [0]:
for element in rdd.collect():
    print(element)

Spark revolves around the concept of a resilient distributed dataset (RDD), 
which is a fault-tolerant collection of elements that can be operated on in parallel. 
There are two ways to create RDDs: parallelizing an existing collection in your driver program, 
or referencing a dataset in an external storage system, such as a shared filesystem, HDFS, HBase, 
or any data source offering a Hadoop InputFormat.


In [0]:
#Flatmap
rdd2 = rdd.flatMap(lambda x: x.split(" "))
for element in rdd2.collect():
    print(element)


Spark
revolves
around
the
concept
of
a
resilient
distributed
dataset
(RDD),

which
is
a
fault-tolerant
collection
of
elements
that
can
be
operated
on
in
parallel.

There
are
two
ways
to
create
RDDs:
parallelizing
an
existing
collection
in
your
driver
program,

or
referencing
a
dataset
in
an
external
storage
system,
such
as
a
shared
filesystem,
HDFS,
HBase,

or
any
data
source
offering
a
Hadoop
InputFormat.


In [0]:
#Map
rdd3 = rdd2.map(lambda x: (x,1))
for element in rdd3.collect():
    print(element)

('Spark', 1)
('revolves', 1)
('around', 1)
('the', 1)
('concept', 1)
('of', 1)
('a', 1)
('resilient', 1)
('distributed', 1)
('dataset', 1)
('(RDD),', 1)
('', 1)
('which', 1)
('is', 1)
('a', 1)
('fault-tolerant', 1)
('collection', 1)
('of', 1)
('elements', 1)
('that', 1)
('can', 1)
('be', 1)
('operated', 1)
('on', 1)
('in', 1)
('parallel.', 1)
('', 1)
('There', 1)
('are', 1)
('two', 1)
('ways', 1)
('to', 1)
('create', 1)
('RDDs:', 1)
('parallelizing', 1)
('an', 1)
('existing', 1)
('collection', 1)
('in', 1)
('your', 1)
('driver', 1)
('program,', 1)
('', 1)
('or', 1)
('referencing', 1)
('a', 1)
('dataset', 1)
('in', 1)
('an', 1)
('external', 1)
('storage', 1)
('system,', 1)
('such', 1)
('as', 1)
('a', 1)
('shared', 1)
('filesystem,', 1)
('HDFS,', 1)
('HBase,', 1)
('', 1)
('or', 1)
('any', 1)
('data', 1)
('source', 1)
('offering', 1)
('a', 1)
('Hadoop', 1)
('InputFormat.', 1)


In [0]:
#reduceByKey
rdd4=rdd3.reduceByKey(lambda a,b: a+b)
for element in rdd4.collect():
    print(element)

('Spark', 1)
('around', 1)
('of', 2)
('', 4)
('is', 1)
('fault-tolerant', 1)
('collection', 2)
('operated', 1)
('in', 3)
('are', 1)
('two', 1)
('an', 2)
('driver', 1)
('program,', 1)
('external', 1)
('storage', 1)
('system,', 1)
('as', 1)
('filesystem,', 1)
('HBase,', 1)
('source', 1)
('offering', 1)
('InputFormat.', 1)
('revolves', 1)
('the', 1)
('concept', 1)
('a', 5)
('resilient', 1)
('distributed', 1)
('dataset', 2)
('(RDD),', 1)
('which', 1)
('elements', 1)
('that', 1)
('can', 1)
('be', 1)
('on', 1)
('parallel.', 1)
('There', 1)
('ways', 1)
('to', 1)
('create', 1)
('RDDs:', 1)
('parallelizing', 1)
('existing', 1)
('your', 1)
('or', 2)
('referencing', 1)
('such', 1)
('shared', 1)
('HDFS,', 1)
('any', 1)
('data', 1)
('Hadoop', 1)


In [0]:
#sortByKey
rdd5 = rdd4.map(lambda x: (x[1],x[0])).sortByKey()
for element in rdd5.collect():
    print(element)

(1, 'Spark')
(1, 'around')
(1, 'is')
(1, 'fault-tolerant')
(1, 'operated')
(1, 'are')
(1, 'two')
(1, 'driver')
(1, 'program,')
(1, 'external')
(1, 'storage')
(1, 'system,')
(1, 'as')
(1, 'filesystem,')
(1, 'HBase,')
(1, 'source')
(1, 'offering')
(1, 'InputFormat.')
(1, 'revolves')
(1, 'the')
(1, 'concept')
(1, 'resilient')
(1, 'distributed')
(1, '(RDD),')
(1, 'which')
(1, 'elements')
(1, 'that')
(1, 'can')
(1, 'be')
(1, 'on')
(1, 'parallel.')
(1, 'There')
(1, 'ways')
(1, 'to')
(1, 'create')
(1, 'RDDs:')
(1, 'parallelizing')
(1, 'existing')
(1, 'your')
(1, 'referencing')
(1, 'such')
(1, 'shared')
(1, 'HDFS,')
(1, 'any')
(1, 'data')
(1, 'Hadoop')
(2, 'of')
(2, 'collection')
(2, 'an')
(2, 'dataset')
(2, 'or')
(3, 'in')
(4, '')
(5, 'a')


In [0]:
#filter
rdd6 = rdd5.filter(lambda x : 's' == x[1])  #in
for element in rdd6.collect():
    print(element)

(5, 'a')


In [0]:
stwords = ['a','an','the']
rdd3 = rdd2.filter(lambda x: x not in stwords)

for element in rdd3.collect():
    print(element)

Spark
revolves
around
concept
resilient
distributed
dataset
(RDD),

which
is
fault-tolerant
collection
elements
that
can
be
operated
on
in
parallel.

There
are
two
ways
to
create
RDDs:
parallelizing
existing
collection
in
your
driver
program,

or
referencing
dataset
in
external
storage
system,
such
as
shared
filesystem,
HDFS,
HBase,

or
any
data
source
offering
Hadoop
InputFormat.


In [0]:
rdd4 = rdd3.filter(lambda x: x in stwords)
for element in rdd4.collect():
    print(element)

In [0]:
ex = spark.sparkContext.textFile('dbfs:/FileStore/shared_uploads/AKSHATAKHEDEKAR01032001@rjcollege.edu.in/example_2-1.txt')
ex.collect()

Out[37]: ['IT_Sumit',
 'IT_Amit',
 'DSai_Suraj',
 'DSAI_Saurabh',
 'CS_Chaitnya',
 'CS_Namita']

In [0]:
#Get the list of students of DSAI
from pyspark.sql.functions import upper
ex_1 = ex.filter(lambda x: 'DSAI' in x.upper())
ex_1.collect()

Out[59]: ['DSai_Suraj', 'DSAI_Saurabh']

In [0]:
#Count the number of students of IT
ex_2 = ex.filter(lambda x: 'IT' in x)
ex_2.count()

Out[57]: 2

In [0]:
rdd = sc.parallelize(['IT_Sumit',
 'IT_Amit',
 'DSai_Suraj',
 'DSAI_Saurabh',
 'CS_Chaitnya',
 'CS_Namita'])

In [0]:
#Get the 10% sample of rdd data
rdd_sample1 = rdd.sample(False, .3, 2)

#Get the 20% sample of rdd data
rdd_sample2 = rdd.sample(False, .5, 3)


In [0]:

#Combine the data of 10% sample rdd and 20% sample rdd into the rdd named 'union rdd'
union_samples = rdd_sample1.union(rdd_sample2)
union_samples.collect()

Out[65]: ['DSAI_Saurabh', 'IT_Amit', 'DSAI_Saurabh']

In [0]:
#Get only distinct values from the union RDD into the rdd named 'distinctRDD'
distinctRDD = rdd.distinct()
distinctRDD.collect()

Out[55]: ['CS_Chaitnya',
 'DSai_Suraj',
 'CS_Namita',
 'IT_Sumit',
 'IT_Amit',
 'DSAI_Saurabh']

In [0]:
from pyspark.sql import Row
names = Row("Year","First_Name","Country","Sex","Count")
name1 = names(2012,"DOMINIC","CAGYUGA","M",6)
name2 = names(2012,"ADDISON","ONODAGA","F",14)
name3 = names(2012,"ADDISON","ONONDAGA","F",14)
name4 = names(2013,"JULIA","ONONDAGA","F",15)
name5 = names(2014,"MULIA","DAGA","M",10)

namesall = Row(names=[name1,name2,name3,name4,name5])

In [0]:
#Q. Get the list of first names from the above RDD using map function.
# for i in range(0, len(namesall.names)):
#     print(namesall.names[i].First_Name)
    
rdd354 = map(lambda x : x.names.First_Name,rdd);rdd354.collect()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-3227342258921575>[0m in [0;36m<module>[0;34m[0m
[1;32m      3[0m [0;31m#     print(namesall.names[i].First_Name)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      4[0m [0;34m[0m[0m
[0;32m----> 5[0;31m [0mrdd354[0m [0;34m=[0m [0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m [0;34m:[0m [0mx[0m[0;34m.[0m[0mnames[0m[0;34m.[0m[0mFirst_Name[0m[0;34m,[0m[0mrdd[0m[0;34m)[0m[0;34m;[0m[0mrdd354[0m[0;34m.[0m[0mcollect[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;31mTypeError[0m: 'RDD' object is not iterable

In [0]:
type(namesall)

Out[89]: pyspark.sql.types.Row

In [0]:
rdd= sc.parallelize(namesall)
rdd[0].collect()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-268490988253062>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0mrdd[0m[0;34m=[0m [0msc[0m[0;34m.[0m[0mparallelize[0m[0;34m([0m[0mnamesall[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mrdd[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m.[0m[0mcollect[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;31mTypeError[0m: 'RDD' object is not subscriptable

In [0]:
#Q. Count the total number of female candidates.
rdd1 = rdd.map(lambda a:(a[1][1]))
rdd1.collect()

Out[114]: ['ADDISON']

In [0]:
#Q. Get the of distinct counties.


In [0]:
#Q. Get the distinct list of year data.