In [1]:
pip install mrjob

Note: you may need to restart the kernel to use updated packages.


In [7]:
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRCountSongs(MRJob):
    
    def steps(self):
        return [
            MRStep(mapper=self.mapper),
            MRStep(reducer=self.reducer)
        ]
    
    # Each line in will be read as a key, value
    # Each line has no key, so we ignore it with `_`
    def mapper(self, _, song):
        # Each line is a tuple: (song_names, 1) 
        yield (song, 1)

    # Combine all tuples with the same key
    def reducer(self, key, values):
        # Key is the song name
        # Sum up values in the tuple to get total song plays
        yield (key, sum(values))
        
# Runs this if I call it via the terminal        
#if __name__ == "__main__":
#  MRCountSongs.run()

In [11]:
! head romeoandjuliet.txt


ï»¿***The Project Gutenberg's Etext of Shakespeare's First Folio***
****************The Tragedie of Romeo and Juliet****************

This is our 3rd edition of most of these plays.  See the index.


Copyright laws are changing all over the world, be sure to check
the copyright laws for your country before posting these files!!



In [13]:
! python song_count.py romeoandjuliet.txt

"          (~), asterisk (*) and underline (_) characters may"	1
"          OR"	1
"          author, and additional characters may be used to"	1
"          be used to convey punctuation intended by the"	1
"          does *not* contain characters other than those"	1
"          etext in its original plain ASCII form (or in EBCDIC"	1
"          form by the program that displays the etext (as is"	1
"          indicate hypertext links; OR"	1
"          intended by the author of the work, although tilde"	1
"          no additional cost, fee or expense, a copy of the"	1
"          no expense into plain ASCII, EBCDIC or equivalent"	1
"          or other equivalent proprietary form)."	1
"          the case, for instance, with most word processors);"	1
"     *EITHER*:"	1
"     University\" within the 60 days following each"	1
"     [*]  The etext may be readily converted by the reader at"	1
"     [*]  The etext, when displayed, is clearly readable, and"	1
"     [*]  You provide, or agree to also

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\12146\AppData\Local\Temp\song_count.12146.20250607.011633.171090
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\12146\AppData\Local\Temp\song_count.12146.20250607.011633.171090\output
Streaming final output from C:\Users\12146\AppData\Local\Temp\song_count.12146.20250607.011633.171090\output...
Removing temp directory C:\Users\12146\AppData\Local\Temp\song_count.12146.20250607.011633.171090...


In [15]:
! python song_count.py romeoandjuliet.txt --mapper

""	1
"\ufeff***The Project Gutenberg's Etext of Shakespeare's First Folio***"	1
"****************The Tragedie of Romeo and Juliet****************"	1
""	1
"This is our 3rd edition of most of these plays.  See the index."	1
""	1
""	1
"Copyright laws are changing all over the world, be sure to check"	1
"the copyright laws for your country before posting these files!!"	1
""	1
"Please take a look at the important information in this header."	1
"We encourage you to keep this file on your own disk, keeping an"	1
"electronic path open for the next readers.  Do not remove this."	1
""	1
""	1
"**Welcome To The World of Free Plain Vanilla Electronic Texts**"	1
""	1
"**Etexts Readable By Both Humans and By Computers, Since 1971**"	1
""	1
"*These Etexts Prepared By Hundreds of Volunteers and Donations*"	1
""	1
"Information on contacting Project Gutenberg to get Etexts, and"	1
"further information is included below.  We need your donations."	1
""	1
""	1
"The Tragedie of Romeo and Juliet"	1
""	1
"by W

In [17]:
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRmyjob(MRJob):
	def mapper(self, _, line):
		wordlist = line.split()
		for word in wordlist:
			yield word,1

	def reducer(self, key, list_of_values):
		yield None, (sum(list_of_values),key)

	def reducer2(self, _, list_of_values):
		yield max(list_of_values)

	def steps(self):
		return [MRStep(mapper=self.mapper, reducer=self.reducer), MRStep( reducer=self.reducer2)]


In [21]:
! python wordcountmax.py romeoandjuliet.txt

685	"the"


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\12146\AppData\Local\Temp\wordcountmax.12146.20250607.012059.719924
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\12146\AppData\Local\Temp\wordcountmax.12146.20250607.012059.719924\output
Streaming final output from C:\Users\12146\AppData\Local\Temp\wordcountmax.12146.20250607.012059.719924\output...
Removing temp directory C:\Users\12146\AppData\Local\Temp\wordcountmax.12146.20250607.012059.719924...


In [23]:
from mrjob.job import MRJob
import string

class MRWordFreqCount(MRJob):

    def mapper(self, _, line):
        line = line.strip()
        # Remove punctuations
        for s in string.punctuation:
            line = line.replace(s, '')
        words = line.split()
        for word in words:
            yield (word.lower(), 1)

    def reducer(self, word, counts):
        yield (word, sum(counts))


In [25]:
! python wordcount.py romeoandjuliet.txt

"1"	8
"10"	1
"10000"	1
"100000000"	1
"1971"	1
"1999"	1
"2"	7
"20"	1
"200"	1
"2000"	2
"2001"	2
"2261"	1
"2261txt"	1
"2261zip"	1
"2782"	1
"2m"	2
"2wife"	1
"3"	3
"30"	2
"31"	1
"3333"	1
"3mu"	1
"3rd"	1
"3wat"	1
"432"	1
"5"	1
"60"	1
"61825"	1
"7"	1
"90"	1
"\ufeffthe"	1
"a"	516
"abate"	1
"abbey"	1
"abbreviations"	1
"abhorred"	1
"abhors"	1
"able"	2
"aboard"	1
"aboue"	7
"aboundst"	1
"about"	19
"abouts"	1
"above"	3
"abra"	5
"abraham"	1
"abroach"	1
"abroad"	4
"absolud"	1
"abusd"	1
"abuse"	1
"abuses"	1
"accent"	1
"accept"	1
"accepts"	1
"access"	1
"accesse"	1
"accident"	1
"accidents"	1
"accorded"	1
"according"	2
"account"	3
"accurst"	1
"accustomd"	1
"acknowledge"	1
"acquaint"	1
"acquaintance"	1
"across"	1
"act"	2
"acted"	1
"acting"	1
"action"	1
"acts"	1
"actually"	1
"actus"	1
"adde"	1
"added"	2
"adding"	1
"addition"	2
"additional"	2
"addle"	1
"address"	1
"adiacent"	1
"admired"	1
"adoe"	1
"aduance"	1
"aduanced"	1
"aduanst"	2
"adue"	4
"aduenture"	2
"aduersarie"	1
"aduersities"	1
"aduise"	1
"advance"

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\12146\AppData\Local\Temp\wordcount.12146.20250607.012213.038186
Running step 1 of 1...
job output is in C:\Users\12146\AppData\Local\Temp\wordcount.12146.20250607.012213.038186\output
Streaming final output from C:\Users\12146\AppData\Local\Temp\wordcount.12146.20250607.012213.038186\output...
Removing temp directory C:\Users\12146\AppData\Local\Temp\wordcount.12146.20250607.012213.038186...


In [27]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import string

class MRWordFreqCount(MRJob):
    def steps(self):
        return [
            MRStep(mapper=self.mapper_get_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_top_words)
        ]

    def mapper_get_words(self, _, line):
        # Updated set of words to exclude
        exclude_words = set(['the', 'a', 'an', 'and', 'or', 'i', 'to', 'of', 'my', 'that'])

        line = line.strip().lower()
        # Remove punctuations
        for s in string.punctuation:
            line = line.replace(s, '')
        words = line.split()
        for word in words:
            if word not in exclude_words:
                yield (word, 1)

    def reducer_count_words(self, word, counts):
        yield None, (sum(counts), word)

    def reducer_find_top_words(self, _, word_counts):
        # Sort the word_counts in descending order and take the top 5
        top_words = sorted(word_counts, reverse=True)[:5]
        for count, word in top_words:
            yield (word, count)

In [29]:
! python top5.py romeoandjuliet.txt

"is"	371
"you"	355
"in"	341
"thou"	277
"not"	274


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\12146\AppData\Local\Temp\top5.12146.20250607.012424.390717
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\12146\AppData\Local\Temp\top5.12146.20250607.012424.390717\output
Streaming final output from C:\Users\12146\AppData\Local\Temp\top5.12146.20250607.012424.390717\output...
Removing temp directory C:\Users\12146\AppData\Local\Temp\top5.12146.20250607.012424.390717...


In [31]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import string

class MRWordFreqCount(MRJob):
    def steps(self):
        return [
            MRStep(mapper=self.mapper_get_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_top_words)
        ]

    def mapper_get_words(self, _, line):
        # Updated set of words to exclude
        exclude_words = set(['the', 'a', 'an', 'and', 'or', 'i', 'to', 'of', 'my', 'that'])

        line = line.strip().lower()
        # Remove punctuations
        for s in string.punctuation:
            line = line.replace(s, '')
        words = line.split()
        for word in words:
            if word not in exclude_words:
                yield (word, 1)

    def reducer_count_words(self, word, counts):
        yield None, (sum(counts), word)

    def reducer_find_top_words(self, _, word_counts):
        # Sort the word_counts in descending order and take the top 10
        top_words = sorted(word_counts, reverse=True)[:10]
        for count, word in top_words:
            yield (word, count)

In [33]:
! python top10.py romeoandjuliet.txt

"is"	371
"you"	355
"in"	341
"thou"	277
"not"	274
"this"	270
"me"	266
"with"	255
"it"	254
"for"	251


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\12146\AppData\Local\Temp\top10.12146.20250607.012656.301600
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\12146\AppData\Local\Temp\top10.12146.20250607.012656.301600\output
Streaming final output from C:\Users\12146\AppData\Local\Temp\top10.12146.20250607.012656.301600\output...
Removing temp directory C:\Users\12146\AppData\Local\Temp\top10.12146.20250607.012656.301600...


In [35]:
from mrjob.job import MRJob
import string

class MRRomeoJulietCount(MRJob):
    def mapper(self, _, line):
        line = line.strip()
        # Remove punctuations
        for s in string.punctuation:
            line = line.replace(s, '')
        words = line.split()
        for word in words:
            word = word.lower()
            if word in ['romeo', 'iuliet']:
                yield (word, 1)

    def reducer(self, word, counts):
        yield (word, sum(counts))

In [37]:
! python countrj.py romeoandjuliet.txt

"iuliet"	54
"romeo"	143


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\12146\AppData\Local\Temp\countrj.12146.20250607.012845.832633
Running step 1 of 1...
job output is in C:\Users\12146\AppData\Local\Temp\countrj.12146.20250607.012845.832633\output
Streaming final output from C:\Users\12146\AppData\Local\Temp\countrj.12146.20250607.012845.832633\output...
Removing temp directory C:\Users\12146\AppData\Local\Temp\countrj.12146.20250607.012845.832633...


In [39]:
! tail romeoandjuliet.txt

Some shall be pardon'd, and some punished.
For neuer was a Storie of more Wo,
Then this of Iuliet, and her Romeo.

Exeunt. omnes

FINIS. THE TRAGEDIE OF ROMEO and IVLIET
romeoandjuliet.txt

Displaying romeoandjuliet.txt.
