In [1]:
%%writefile mapper_wiki_parser.py

import sys
import re


def eprint(*args, **kwargs):
    """Write a message to standard error.

    Accepts the same positional and keyword arguments as the built-in
    ``print`` and forwards them unchanged, except that the output stream
    is always ``sys.stderr``. Passing ``file=...`` explicitly is therefore
    an error (duplicate keyword argument).
    """
    # Resolve the stream at call time so a redirected sys.stderr is honoured.
    err_stream = sys.stderr
    print(*args, file=err_stream, **kwargs)

# Word-count mapper: reads "<article_id>\t<text>" records from stdin and
# writes one "<lowercased word>\t1" record per word to stdout.
#
# The patterns are raw strings so that "\W" and "\s" reach the regex engine
# verbatim instead of being treated as (invalid) string escape sequences.
# They are compiled once, outside the per-line loop.
EDGE_NON_WORD = re.compile(r"^\W+|\W+$", flags=re.UNICODE)
WORD_SEPARATOR = re.compile(r"\W*\s+\W*", flags=re.UNICODE)

for line in sys.stdin:
    try:
        # Only the first tab separates the id from the text; any further
        # tabs belong to the article body.
        article_id, text = line.strip().split('\t', 1)
    except ValueError:
        # No tab in the line (or nothing after it): not a usable record.
        continue

    # Trim leading/trailing punctuation, then split on whitespace together
    # with any non-word characters attached to it.
    text = EDGE_NON_WORD.sub("", text)

    # Splitting an empty string yields [''], so drop empty tokens; otherwise
    # a punctuation-only article would emit a record with an empty key and
    # inflate the word counter.
    words = [word for word in WORD_SEPARATOR.split(text) if word]
    if not words:
        continue

    # Hadoop Streaming counter update via stderr. One update per input line
    # adds up to the same total as one update per word, with far fewer writes.
    eprint("reporter:counter:Wiki stats,Total words,%d" % len(words))
    for word in words:
        print("%s\t%d" % (word.lower(), 1))

Overwriting mapper_wiki_parser.py


In [2]:
%%writefile reducer_wiki_parser.py

import sys

# Summing reducer (also usable as a combiner): reads "<key>\t<count>" records
# from stdin and prints "<key>\t<total>" each time the key changes. The totals
# are only correct if equal keys arrive on consecutive lines.
active_key = None
running_total = 0

for raw_line in sys.stdin:
    # Split on the first tab only; a line without a tab is skipped.
    key, tab, amount_text = raw_line.strip().partition('\t')
    if not tab:
        continue
    try:
        amount = int(amount_text)
    except ValueError:
        # Non-numeric count: ignore the record.
        continue

    if key != active_key:
        # Key boundary: flush the finished group (if any) and start a new one.
        if active_key:
            print("%s\t%d" % (active_key, running_total))
        active_key, running_total = key, 0
    running_total += amount

# Flush the last group, which has no following key change to trigger it.
if active_key:
    print("%s\t%d" % (active_key, running_total))

Overwriting reducer_wiki_parser.py


In [3]:
%%writefile mapper_wiki_rater.py

import sys


# Rating mapper: turns each "<word>\t<count>" record from stdin into
# "<count>\t<word>", so that the count becomes the first (key) field.
for raw_line in sys.stdin:
    word, tab, count = raw_line.strip().partition('\t')
    if not tab:
        # No tab separator: not a "<word>\t<count>" record, skip it.
        continue
    print("%s\t%s" % (count, word))

Overwriting mapper_wiki_rater.py


In [4]:
%%writefile reducer_wiki_rater.py

import sys


# Rating reducer: restores the "<word>\t<count>" column order for every
# "<count>\t<word>" record read from stdin, preserving the input order.
for record in sys.stdin:
    fields = record.strip().split('\t', 1)
    if len(fields) != 2:
        # Line without a tab separator: nothing to swap, skip it.
        continue
    count, word = fields
    print("%s\t%s" % (word, count))

Overwriting reducer_wiki_rater.py


In [5]:
%%bash

# Stop at the first failing command (and on unset variables) so the rating
# job never runs against a missing or partial word-count output.
set -eu

OUT_DIR_1="coursera_mr_task1_1"
OUT_DIR_2="coursera_mr_task1_2"
PARSER_NUM_REDUCERS=8
RATER_NUM_REDUCERS=1
STREAMING_JAR="/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar"

# Remove the outputs of previous runs. The patterns are quoted so that HDFS,
# not the local shell, expands the globs; -f keeps a fresh run (nothing to
# delete yet) from being reported as an error.
hdfs dfs -rm -r -f -skipTrash "${OUT_DIR_1}*" > /dev/null
hdfs dfs -rm -r -f -skipTrash "${OUT_DIR_2}*" > /dev/null

# Job 1: word count. The reducer doubles as the combiner.
yarn jar "${STREAMING_JAR}" \
    -D mapreduce.job.name="Streaming Word Count" \
    -D mapreduce.job.reduces=${PARSER_NUM_REDUCERS} \
    -files mapper_wiki_parser.py,reducer_wiki_parser.py \
    -mapper "python3 mapper_wiki_parser.py" \
    -combiner "python3 reducer_wiki_parser.py" \
    -reducer "python3 reducer_wiki_parser.py" \
    -input /data/wiki/en_articles_part \
    -output "${OUT_DIR_1}" > /dev/null

# Job 2: order words by count. The mapper puts the count first and the key
# comparator sorts it numerically in reverse (-nr); a single reducer is
# configured, so the ranking ends up in one output file.
yarn jar "${STREAMING_JAR}" \
    -D mapreduce.job.name="Streaming Word Rating" \
    -D mapreduce.job.reduces=${RATER_NUM_REDUCERS} \
    -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \
    -D mapreduce.partition.keycomparator.options="-nr" \
    -files mapper_wiki_rater.py,reducer_wiki_rater.py \
    -mapper "python3 mapper_wiki_rater.py" \
    -reducer "python3 reducer_wiki_rater.py" \
    -input "${OUT_DIR_1}/*" \
    -output "${OUT_DIR_2}" > /dev/null

# Code for obtaining the results: print only the 7th line of the ranking.
hdfs dfs -cat "${OUT_DIR_2}/part-00000" | sed -n '7p;8q'

is	126420


21/10/26 22:20:37 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
21/10/26 22:20:37 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
21/10/26 22:20:38 INFO mapred.FileInputFormat: Total input files to process : 1
21/10/26 22:20:38 INFO mapreduce.JobSubmitter: number of splits:2
21/10/26 22:20:38 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
21/10/26 22:20:38 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1635274654089_0036
21/10/26 22:20:39 INFO conf.Configuration: resource-types.xml not found
21/10/26 22:20:39 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
21/10/26 22:20:39 INFO resource.ResourceUtils: Adding resource type - name = memory-mb, units = Mi, type = COUNTABLE
21/10/26 22:20:39 INFO resource.ResourceUtils: Adding resource type - name = vcores, units = , type = COUNTABLE
21/10/26 22:20:39 INFO impl.Ya