In [1]:
# I am using this file to extract the required pages based on my DOB
from google.colab import files
uploaded = files.upload()

Saving harrypotter.pdf to harrypotter.pdf


In [2]:
#Installing required packages
# PyPDF2 :to read the PDF
# mrjob :to implement MapReduce
# pyspellchecker :to detect non-English words
!pip install PyPDF2 mrjob pyspellchecker

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting mrjob
  Downloading mrjob-0.7.4-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.4-py3-none-any.whl.metadata (9.4 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading mrjob-0.7.4-py2.py3-none-any.whl (439 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.6/439.6 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyspellchecker-0.8.4-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker, PyPDF2, mrjob
Successfully installed PyPDF2-3.0.1 mrjob-0.7.4 pyspellchecker-0.8.4


In [3]:
# Reading the PDF and extracting required pages

# DOB: 19-05-2003

from PyPDF2 import PdfReader

pdf = PdfReader("harrypotter.pdf")

# extracting a range of pages and save as a text file
def extract_pages(start, end, output_file):
    text = ""

    for i in range(start - 1, end):   # -1 because page index starts from 0
        text += pdf.pages[i].extract_text()

    # Saving extracted text into a file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)

# Birth date = 19 → file1 → pages 19 to 28
extract_pages(19, 28, "file1.txt")

# Birth year = 2003 → use 103 → file2 → pages 103 to 112
extract_pages(103, 112, "file2.txt")

print("Text files created successfully")

Text files created successfully


In [10]:
# Display first few characters to verify extraction

print(open("file1.txt").read(500))

“My	dear	Professor,	I’ve	never	seen	a	cat	sit	so	stiffly.”
“You’d	be	stiff	if	you’d	been	sitting	on	a	brick	wall	all	day,”	said	Professor
McGonagall.
“All	day?	When	you	could	have	been	celebrating?	I	must	have	passed	a
dozen	feasts	and	parties	on	my	way	here.”
Professor	McGonagall	sniffed	angrily.
“Oh	yes,	everyone’s	celebrating,	all	right,”	she	said	impatiently.	“You’d
think	they’d	be	a	bit	more	careful,	but	no	—	even	the	Muggles	have	noticed
something’s	going	on.	It	was	on	their	news.”	She	jer


In [4]:
%%file wordcount_mrjob.py

# This MapReduce job counts how many times each word appears in file1.txt

from mrjob.job import MRJob

class MRWordCount(MRJob):

    # Mapper: reads each line and emits (word, 1)
    def mapper(self, _, line):
        for word in line.split():
            yield word.lower(), 1

    # Reducer: adds all the values for the same word
    def reducer(self, key, values):
        yield key, sum(values)

if __name__ == "__main__":
    MRWordCount.run()

Writing wordcount_mrjob.py


In [7]:
# Running the MapReduce job for word count
!python wordcount_mrjob.py file1.txt > wordcount_output.txt


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/wordcount_mrjob.root.20260226.052403.799720
Running step 1 of 1...
job output is in /tmp/wordcount_mrjob.root.20260226.052403.799720/output
Streaming final output from /tmp/wordcount_mrjob.root.20260226.052403.799720/output...
Removing temp directory /tmp/wordcount_mrjob.root.20260226.052403.799720...


In [11]:
%%file nonenglish_mrjob.py

# MapReduce job to find non-English words

from mrjob.job import MRJob
from spellchecker import SpellChecker

spell = SpellChecker()

class MRNonEnglish(MRJob):

    def mapper(self, _, line):
        for word in line.split():
            clean = word.lower().strip(".,!?\"'")
            if clean and clean not in spell:
                yield clean, 1

    def reducer(self, key, values):
        yield key, sum(values)

if __name__ == "__main__":
    MRNonEnglish.run()

Writing nonenglish_mrjob.py


In [12]:
# Run the non-English word MapReduce job

!python nonenglish_mrjob.py file2.txt | head -20

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/nonenglish_mrjob.root.20260226.052949.142325
Running step 1 of 1...
job output is in /tmp/nonenglish_mrjob.root.20260226.052949.142325/output
Streaming final output from /tmp/nonenglish_mrjob.root.20260226.052949.142325/output...
Removing temp directory /tmp/nonenglish_mrjob.root.20260226.052949.142325...
"\u201cwe"	1
"\u201cwelcome"	1
"\u201cwhat"	2
"\u201cwhen"	1
"\u201cwould"	1
"\u201cyeh\u2019ll"	1
"\u201cyou\u2019d"	1
"\u201cyou\u2019ve"	1
"\u201d"	2
"again,\u201d"	1
"anexcuse"	1
"apart;"	1
"asteep"	1
"before?\u201d"	1
"black-haired"	1
"boat!\u201d"	1
"can\u2019t"	1
"cap!\u201d"	1
"ceremony\u2019s"	1
"change?\u201d"	1


In [13]:
# Saving the outputs for submission

!python wordcount_mrjob.py file1.txt > wordcount_output.txt
!python nonenglish_mrjob.py file2.txt > non_english_output.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/wordcount_mrjob.root.20260226.053114.607647
Running step 1 of 1...
job output is in /tmp/wordcount_mrjob.root.20260226.053114.607647/output
Streaming final output from /tmp/wordcount_mrjob.root.20260226.053114.607647/output...
Removing temp directory /tmp/wordcount_mrjob.root.20260226.053114.607647...
No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/nonenglish_mrjob.root.20260226.053115.354529
Running step 1 of 1...
job output is in /tmp/nonenglish_mrjob.root.20260226.053115.354529/output
Streaming final output from /tmp/nonenglish_mrjob.root.20260226.053115.354529/output...
Removing temp directory /tmp/nonenglish_mrjob.root.20260226.053115.354529...


In [14]:
# Download output files

files.download("wordcount_output.txt")
files.download("non_english_output.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>