In [1]:
from pyspark import SparkConf, SparkContext

# Build the Spark configuration step by step, then attach to an existing
# SparkContext if one is already running (otherwise a new one is created).
conf = SparkConf()
conf.setAppName("Read File")
sc = SparkContext.getOrCreate(conf=conf)

## Map

In [2]:
# Load the quiz text file as an RDD with one element per line of the file.
rdd = sc.textFile("data/spark 03 - filter quizz.txt")

In [3]:
def count_vals(x):
    """Return the length of every space-separated token in ``x``.

    The string is split on a single space character, so consecutive spaces
    (or an empty string) produce empty tokens, which are reported as 0.

    Args:
        x: The line of text to measure.

    Returns:
        A list of integers, one per token, in their original order.
    """
    tokens = x.split(' ')
    return list(map(len, tokens))

# map() produces exactly one output element per input line, so every line
# becomes its own list of word lengths (a list of lists overall).
rdd2 = rdd.map(count_vals)

In [4]:
# Bring the mapped elements back to the driver to inspect the nested result.
rdd2.collect()

[[4, 5, 7, 6], [3, 3, 3, 3, 6], [5, 6, 6, 2, 7, 5], [6, 3, 5, 3]]

## FlatMap

In [5]:
# flatMap() applies the same function as the map() example, but flattens the
# per-line lists into a single RDD of word lengths.
rdd3 = rdd.flatMap(count_vals)
# Collect the flattened lengths to the driver for display.
rdd3.collect()

[4, 5, 7, 6, 3, 3, 3, 3, 6, 5, 6, 6, 2, 7, 5, 6, 3, 5, 3]

## Filter

In [6]:
# filter() keeps the elements whose predicate result is truthy; using the
# element's own truthiness keeps every non-empty line.
rdd_filter = rdd.filter(bool)

## Quiz - Filter

In [7]:
# Read the quiz file and explode every line into its space-separated words,
# leaving `rdd` as a flat RDD of individual words.
rdd = (
    sc.textFile("data/spark 03 - filter quizz.txt")
    .flatMap(lambda line: line.split(' '))
)

In [8]:
def filter_words(text):
    """Decide whether a word should be kept by the filter.

    A word is kept when it is non-empty and does not begin with "a" or "c".

    Args:
        text: A single word (string).

    Returns:
        bool: ``True`` if the word should be kept, ``False`` if it starts
        with "a" or "c" or is the empty string.
    """
    # Only the first character decides the outcome, so test the prefix
    # directly; str.startswith accepts a tuple of alternative prefixes.
    # Empty strings are rejected explicitly so the predicate always returns
    # a real bool rather than falling through to an implicit None.
    return bool(text) and not text.startswith(("a", "c"))

In [9]:
# Keep only the words for which filter_words returns a truthy value.
rdd_filtered = rdd.filter(filter_words)

In [10]:
# Collect the words that passed the filter to the driver for display.
rdd_filtered.collect()

['this', 'mango', 'dog', 'mic', 'laptop', 'switch', 'mobile']

## RDD Distinct

In [11]:
# Load the sample file as an RDD with one element per line.
rdd = sc.textFile("data/sample_file.txt")

In [12]:
# Split every line on single spaces and flatten the pieces into one RDD of tokens.
rdd2 = rdd.flatMap(lambda x: x.split(' '))

In [13]:
# Remove duplicate tokens so each distinct value appears exactly once.
rdd3 = rdd2.distinct()

In [14]:
# Collect the distinct tokens for display. An empty string can show up here
# whenever the split yields empty tokens (e.g. repeated spaces or blank lines).
rdd3.collect()

['0',
 '1',
 '98',
 '',
 '8',
 '12',
 '9',
 '86',
 '786',
 '56',
 '66',
 '872',
 '27',
 '11',
 '5',
 '6',
 '567',
 '87',
 '678',
 '5675']

## RDD Functions - groupByKey

In [15]:
# Load the quiz file and break it into individual words.
rdd = sc.textFile("data/spark 03 - filter quizz.txt")
rdd2 = rdd.flatMap(lambda line: line.split(' '))
# Key every word by its length: (length, word).
rdd3 = rdd2.map(lambda word: (len(word), word))
# Group the words sharing a length; mapValues(list) turns each grouped
# iterable into a plain list so the collected result is readable.
(
    rdd3
    .groupByKey()
    .mapValues(list)
    .collect()
)

[(4, ['this']),
 (6, ['animal', 'laptop', 'switch', 'mobile', 'amanda']),
 (2, ['am']),
 (5, ['mango', 'chair', 'cover', 'alarm']),
 (7, ['company', 'charger']),
 (3, ['cat', 'dog', 'ant', 'mic', 'any', 'ant'])]

## RDD Functions - reduceByKey

In [16]:
# Load the quiz file and break it into individual words.
rdd = sc.textFile("data/spark 03 - filter quizz.txt")
rdd2 = rdd.flatMap(lambda line: line.split(' '))
# Key every word by its length: (length, word).
rdd3 = rdd2.map(lambda word: (len(word), word))

# Merge all words that share a length into one space-separated string.
(
    rdd3
    .reduceByKey(lambda left, right: left + " " + right)
    .collect()
)

[(4, 'this'),
 (6, 'animal laptop switch mobile amanda'),
 (2, 'am'),
 (5, 'mango chair cover alarm'),
 (7, 'company charger'),
 (3, 'cat dog ant mic any ant')]

In [17]:
# Load the sample file, split it into tokens and pair every token with a 1.
rdd = sc.textFile("data/sample_file.txt")
rdd2 = (
    rdd
    .flatMap(lambda line: line.split(' '))
    .map(lambda token: (token, 1))
)

# Sum the 1s per token to get the number of occurrences of each token.
# Empty tokens are not filtered out here, so '' is counted like any other key.
(
    rdd2
    .reduceByKey(lambda total, one: total + one)
    .collect()
)

[('0', 1),
 ('1', 1),
 ('98', 1),
 ('', 5),
 ('8', 3),
 ('12', 1),
 ('9', 2),
 ('86', 1),
 ('786', 1),
 ('56', 2),
 ('66', 1),
 ('872', 1),
 ('27', 1),
 ('11', 1),
 ('5', 3),
 ('6', 2),
 ('567', 1),
 ('87', 1),
 ('678', 1),
 ('5675', 1)]

## Quiz - Word count

In [18]:
# Load the quiz file and break it into individual words.
rdd = sc.textFile("data/spark 03 - filter quizz.txt")
rdd = rdd.flatMap(lambda line: line.split(' '))
# Drop empty words BEFORE pairing them with a count. Filtering after the
# map() would measure the (word, 1) tuple, whose length is always 2, and so
# would never remove anything.
(
    rdd
    .filter(lambda word: len(word) != 0)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
    .collect()
)

[('this', 1),
 ('mango', 1),
 ('cat', 1),
 ('ant', 2),
 ('laptop', 1),
 ('chair', 1),
 ('switch', 1),
 ('mobile', 1),
 ('am', 1),
 ('company', 1),
 ('animal', 1),
 ('dog', 1),
 ('mic', 1),
 ('charger', 1),
 ('cover', 1),
 ('amanda', 1),
 ('any', 1),
 ('alarm', 1)]

## Actions - count()
Counts the number of elements in the RDD.

In [19]:
# Reload the lines, explode them into words and count the words in total.
rdd = sc.textFile("data/spark 03 - filter quizz.txt")
(
    rdd
    .flatMap(lambda line: line.split(' '))
    .count()
)

19

## Actions - countByValue()

In [20]:
# countByValue() is an action: it returns a dict-like mapping of
# value -> number of occurrences on the driver, not an RDD. Store it under
# its own name so `rdd` keeps referring to the lines RDD and this cell can be
# re-run without failing on a dict that has no flatMap().
word_counts = rdd.flatMap(lambda line: line.split(' ')).countByValue()

## saveAsTextFile()

In [21]:
# Load the quiz file as an RDD of lines.
rdd = sc.textFile("data/spark 03 - filter quizz.txt")
# print() is required here because this is not the last expression of the cell.
print(rdd.getNumPartitions())
# Write the individual words out as text under output/saveAsText.
# NOTE(review): Spark normally refuses to write into an output directory that
# already exists, so re-running this cell may fail unless output/saveAsText
# is removed first - confirm for the target filesystem.
(
    rdd
    .flatMap(lambda line: line.split(' '))
    .saveAsTextFile("output/saveAsText")
)

2


## coalesce() and repartition()

In [22]:
# Load the quiz file and redistribute its lines across 5 partitions.
rdd = sc.textFile("data/spark 03 - filter quizz.txt").repartition(5)
# Show the resulting partition count.
rdd.getNumPartitions()

5