In [1]:
from pyspark.sql import SparkSession

import getpass
# Resolve the OS-level user so the warehouse path and the app name are per-user.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
# The warehouse directory is placed under the current user's HDFS home.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Spark Metastore')
    .master('yarn')
    .getOrCreate()
)

### Accumulator

In [2]:
# Sample log lines in the form '<timestamp>|<level>|<message>'.
# The level field appears in mixed case (Error / error / ERROR / Info / info).
last_years_logs = [
    '2015-09-01 10:00:01|Error|Ac #3211001 ATW 10000 INR',
    '2015-09-02 10:00:07|Info|Ac #3281001 ATW 11000 INR',
    '2015-10-01 10:00:09|error|Ac #3311001 AWT 10500 INR',
    '2015-11-01 10:00:01|error|Ac #3211001 AWT 10000 INR',
    '2016-09-01 10:00:01|info|Ac #3211001 AWT 5000 INR',
    '2016-09-02 10:00:01|ERROR|Ac #3211001 AWT 10000 INR',
    '2016-10-01 10:00:01|error|Ac #3211001 AWT 8000 INR',
    '2016-11-01 10:00:01|error|Ac #3211001 AWT 10000 INR',
    '2016-12-01 10:00:01|Error|Ac #8211001 AWT 80000 INR',
    '2016-12-02 10:00:01|error|Ac #9211001 AWT 90000 INR',
    '2016-12-10 10:00:01|error|Ac #3811001 AWT 15000 INR',
    '2016-12-01 10:00:01|info|Ac #3219001 AWT 16000 INR',
]

In [3]:
# Distribute the driver-side list of log lines as an RDD of strings.
last_years_logs_rdd = spark.sparkContext.parallelize(last_years_logs)

In [4]:
# Bring every element back to the driver to inspect it (the sample has only 12 lines).
last_years_logs_rdd.collect()

['2015-09-01 10:00:01|Error|Ac #3211001 ATW 10000 INR',
 '2015-09-02 10:00:07|Info|Ac #3281001 ATW 11000 INR',
 '2015-10-01 10:00:09|error|Ac #3311001 AWT 10500 INR',
 '2015-11-01 10:00:01|error|Ac #3211001 AWT 10000 INR',
 '2016-09-01 10:00:01|info|Ac #3211001 AWT 5000 INR',
 '2016-09-02 10:00:01|ERROR|Ac #3211001 AWT 10000 INR',
 '2016-10-01 10:00:01|error|Ac #3211001 AWT 8000 INR',
 '2016-11-01 10:00:01|error|Ac #3211001 AWT 10000 INR',
 '2016-12-01 10:00:01|Error|Ac #8211001 AWT 80000 INR',
 '2016-12-02 10:00:01|error|Ac #9211001 AWT 90000 INR',
 '2016-12-10 10:00:01|error|Ac #3811001 AWT 15000 INR',
 '2016-12-01 10:00:01|info|Ac #3219001 AWT 16000 INR']

___Find the error logs___

In [5]:
# Demonstration of what does NOT work: counting with a plain Python int.
error_log_count = 0
# The lambda only evaluates `error_log_count+1` and throws the result away; it never
# rebinds the variable. Even if it did, the function runs inside Spark tasks on a
# serialized copy of the closure, so the driver's variable would stay unchanged.
last_years_logs_rdd. \
    foreach(lambda x: (error_log_count+1) if 'error' in x.lower() else error_log_count)
print(error_log_count)

# The output is 0: `error_log_count` is an ordinary driver-local variable, not a distributed
# counter, so print() shows the value it was initialised with. Use an accumulator instead.

0


In [6]:
# Accumulator starting at 0: tasks can only add to it, and the driver reads the total via `.value`.
error_log_count = spark.sparkContext.accumulator(0)

In [7]:
def _count_if_error(log_line):
    """Add 1 to the `error_log_count` accumulator when the line's level is 'error'.

    Log lines look like '<timestamp>|<level>|<message>'. Only the level field is
    inspected (case-insensitively), so the word "error" inside a message does not
    inflate the count. Lines without a level field are ignored.
    """
    fields = log_line.split('|')
    if len(fields) > 1 and fields[1].strip().lower() == 'error':
        error_log_count.add(1)


# NOTE: the accumulator keeps its total between runs. Re-running this cell without
# first re-running the cell that creates `error_log_count` adds to the old value.
last_years_logs_rdd.foreach(_count_if_error)
error_log_count.value

9

___How many errors were logged in 2016?___

In [8]:
def _is_error_in_year(log_line, year):
    """Return True when a '<timestamp>|<level>|<message>' line is an error logged in `year`.

    The year is checked against the start of the timestamp field and the level
    against the second field, so an account number or amount containing the
    digits of the year (or a message containing "error") cannot cause a false match.
    """
    fields = log_line.split('|')
    if len(fields) < 2:
        return False  # malformed line: no level field to inspect
    timestamp, level = fields[0], fields[1]
    return timestamp.startswith(f'{year}-') and level.strip().lower() == 'error'


error_cnt_2016 = (
    last_years_logs_rdd
    .filter(lambda log_line: _is_error_in_year(log_line, 2016))
    .count()
)
print(error_cnt_2016)

6


### Word count problem

In [9]:
# Load the text file as an RDD with one element per line.
# NOTE(review): the path is hard-coded to one user's HDFS home directory, while the session
# setup derives its path from `username` - consider doing the same here; confirm the file location.
wordsRdd = spark.sparkContext.textFile('/user/itv736079/WordCount.txt')

In [10]:
# Inspect the lines that were read (the file is small enough to collect in full).
wordsRdd.collect()

['I am learning Spark',
 'Spark uses RDDs',
 'RDDs are crude',
 'Spark is great',
 'spark is fun']

In [11]:
# Word count: tokenise each line, normalise case, then sum a 1 per occurrence.
# str.split() with no argument splits on any run of whitespace and never yields
# empty tokens, so repeated, leading or trailing blanks are not counted as the word ''.
updRdd = (
    wordsRdd
    .flatMap(lambda line: line.split())
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda left, right: left + right)
)

In [12]:
# Materialise the (word, count) pairs on the driver.
updRdd.collect()

[('i', 1),
 ('am', 1),
 ('learning', 1),
 ('uses', 1),
 ('are', 1),
 ('is', 2),
 ('spark', 4),
 ('great', 1),
 ('fun', 1),
 ('rdds', 2),
 ('crude', 1)]

### map vs flatMap

In [13]:
# map() emits exactly one output element per input line, so every sentence
# becomes its own list of words and the collected result is a list of lists.
# Despite its name, `mappedRdd` holds the collected Python list, not an RDD.
mappedRdd = (
    wordsRdd
    .map(lambda sentence: sentence.split(" "))
    .collect()
)

mappedRdd

[['I', 'am', 'learning', 'Spark'],
 ['Spark', 'uses', 'RDDs'],
 ['RDDs', 'are', 'crude'],
 ['Spark', 'is', 'great'],
 ['spark', 'is', 'fun']]

In [14]:
# flatMap() flattens the per-line word lists into a single sequence of words.
# Despite its name, `flatMappedRdd` holds the collected Python list, not an RDD.
flatMappedRdd = (
    wordsRdd
    .flatMap(lambda sentence: sentence.split(" "))
    .collect()
)

flatMappedRdd

['I',
 'am',
 'learning',
 'Spark',
 'Spark',
 'uses',
 'RDDs',
 'RDDs',
 'are',
 'crude',
 'Spark',
 'is',
 'great',
 'spark',
 'is',
 'fun']

### Find the sentence with the maximum number of words

In [15]:
# Pair every sentence (as a word list) with its word count and keep the `topK`
# longest. takeOrdered sorts ascending by key, so the count is negated to put
# the largest counts first.
topK = 1
(
    wordsRdd
    .map(lambda sentence: sentence.split(" "))
    .map(lambda words: (words, len(words)))
    .takeOrdered(topK, key=lambda pair: -pair[1])
)

[(['I', 'am', 'learning', 'Spark'], 4)]

In [16]:
# Small in-memory set of sentences with word counts 3, 4, 2 and 5 for the min/max examples.
newWordsRdd = spark.sparkContext.parallelize(["This is me", "This is a test", "Holy cow", "This sentence has most words"])

In [17]:
# Longest sentence: order by the negated word count so the largest comes first.
(
    newWordsRdd
    .map(lambda sentence: sentence.split(" "))
    .map(lambda words: (words, len(words)))
    .takeOrdered(1, lambda pair: -pair[1])
)

[(['This', 'sentence', 'has', 'most', 'words'], 5)]

### Find the sentence with the fewest words

In [18]:
# Shortest sentence: order by the word count itself, so the smallest comes first.
(
    newWordsRdd
    .map(lambda sentence: sentence.split(" "))
    .map(lambda words: (words, len(words)))
    .takeOrdered(1, lambda pair: pair[1])
)

[(['Holy', 'cow'], 2)]

In [19]:
# Default minimum number of partitions used for Hadoop-backed RDDs (e.g. textFile) when none is given.
spark.sparkContext.defaultMinPartitions

2

In [20]:
# Default level of parallelism used when the caller does not specify one (e.g. for parallelize).
spark.sparkContext.defaultParallelism

2

### textFile vs wholeTextFiles

In [21]:
# textFile: one RDD element per line of the file.
spark.sparkContext.textFile('/user/itv736079/WordCount.txt').collect()

['I am learning Spark',
 'Spark uses RDDs',
 'RDDs are crude',
 'Spark is great',
 'spark is fun']

In [22]:
# wholeTextFiles: one (file path, entire file content) pair per file; line breaks stay inside the content string.
spark.sparkContext.wholeTextFiles('/user/itv736079/WordCount.txt').collect()

[('hdfs://m01.itversity.com:9000/user/itv736079/WordCount.txt',
  'I am learning Spark\r\nSpark uses RDDs\r\nRDDs are crude\r\nSpark is great\r\nspark is fun')]

In [23]:
# range(0, 0) contains no numbers, so this creates an RDD with no elements.
blank_rdd = spark.sparkContext.range(0, 0)

In [24]:
# Check the Python type of the object returned by range().
type(blank_rdd)

pyspark.rdd.RDD

In [25]:
# No partition count is passed, so parallelize falls back to the default.
num_rdd = spark.sparkContext.parallelize(range(1, 11))  # RDD of int
num_rdd.getNumPartitions() # int

2

In [26]:
# collect() returns the elements as one flat list on the driver.
num_rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [27]:
# Use glom() to see which elements live in which partition.
num_rdd_parts = num_rdd.glom()
num_rdd_parts.collect()

# glom returns an RDD created by coalescing all elements within each partition into a list.
# num_rdd was created without an explicit partition count, so it has the default 2 partitions and glom yields 2 lists.

[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]

In [28]:
# Rebinds `num_rdd` to a new RDD over the same numbers, this time with an explicit partition count.
num_rdd = spark.sparkContext.parallelize(range(1, 11), 8)  # 8 partitions
num_rdd.getNumPartitions() # int

8

In [29]:
# The collected elements are the same as before; only their distribution over partitions changed.
num_rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [30]:
# With 10 elements spread over 8 partitions the split is uneven: glom() shows one list per partition.
num_rdd_parts = num_rdd.glom()
num_rdd_parts.collect()

[[1], [2], [3], [4, 5], [6], [7], [8], [9, 10]]