<a href="https://colab.research.google.com/github/sridhartroy/AIML/blob/main/BigDataAssignment1A_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from urllib.request import urlopen
import re, nltk
from nltk.tree import Tree

In [None]:
# Project Gutenberg ebook #2641: E. M. Forster - A Room with a View
url = "https://www.gutenberg.org/ebooks/2641.txt.utf-8"
# Read inside a context manager so the HTTP response is closed once consumed.
# errors="ignore" drops any bytes that are not valid UTF-8 instead of raising.
with urlopen(url) as response:
    raw = response.read().decode("utf-8", errors="ignore")

In [None]:
# Strip the Project Gutenberg boilerplate: keep only what lies between the
# "*** START OF ... ***" and "*** END OF ... ***" marker lines. If either
# marker cannot be found, fall back to the complete download.
start = re.search(r"\*\*\* START OF(.*)\*\*\*", raw)
end = re.search(r"\*\*\* END OF(.*)\*\*\*", raw)
if start and end:
    text = raw[start.end():end.start()]
else:
    text = raw

In [None]:
# Fetch every NLTK resource this notebook relies on up front, so that a fresh
# "Restart & Run All" does not hit a LookupError the first time the NER
# pipeline (word_tokenize -> pos_tag -> ne_chunk) runs inside a Spark action.
for resource in (
    "punkt_tab",                       # sentence / word tokenizer models
    "averaged_perceptron_tagger_eng",  # model behind nltk.pos_tag
    "maxent_ne_chunker_tab",           # model behind nltk.ne_chunk
    "words",                           # word list used by the NE chunker
):
    nltk.download(resource, quiet=True)

# Split the book text into a Python list of sentence strings.
sents = nltk.sent_tokenize(text)

In [None]:
# Number of sentences produced by the sentence tokenizer
len(sents)

4335

In [None]:
!pip -q install pyspark

In [None]:
from pyspark.sql import SparkSession

# Create the session, or reuse one that already exists
# (uses local[*] by default in Colab).
spark = (
    SparkSession.builder
    .appName("colab-spark")
    .getOrCreate()
)
# Handle to the lower-level RDD API of the same session.
sc = spark.sparkContext

In [None]:
# Distribute the Python list of sentences as an RDD split into 8 partitions
rdd = sc.parallelize(sents, numSlices=8)

In [None]:
# Sanity check: the RDD should hold as many elements as the `sents` list
print("No. of sentences -->")
rdd.count()

No. of sentences -->


4335

In [None]:
# glom() turns each partition into a single list, so map(len) yields one
# sentence count per partition
print("No. of sentences partition wise -->")
rdd.glom().map(len).collect()


No. of sentences partition wise -->


[541, 541, 541, 541, 541, 541, 541, 548]

In [None]:
def ents_in_partition(iter_sents):
    """Yield ``(label, text)`` for every named entity in a partition's sentences.

    Written for ``mapPartitions``-style calls: ``iter_sents`` is an iterator of
    sentence strings and results are produced lazily, one record per entity.
    Each sentence is word-tokenized, POS-tagged and run through
    ``nltk.ne_chunk`` with ``binary=False``, so chunks carry labels such as
    PERSON / ORGANIZATION / GPE. ``text`` is the chunk's tokens joined by a
    single space.
    """
    # Imported in the function body so the names are resolved where the
    # function actually executes.
    import nltk
    from nltk.tree import Tree

    for sentence in iter_sents:
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        chunk_tree = nltk.ne_chunk(tagged_tokens, binary=False)
        # Only the top-level children are inspected: Tree children are entity
        # chunks, anything else is an ordinary (token, tag) pair and is skipped.
        entity_chunks = (child for child in chunk_tree if isinstance(child, Tree))
        for chunk in entity_chunks:
            words = [word for word, _tag in chunk.leaves()]
            yield (chunk.label(), " ".join(words))

In [None]:
# Run the NER generator once per partition; each output record is one entity
ents_rdd = rdd.mapPartitions(ents_in_partition)  # RDD[(label, text)]

In [None]:
def _with_partition_id(pid, it):
    """Yield ``(pid, label, text)`` for each entity found in partition ``pid``."""
    for label, text in ents_in_partition(it):
        yield (pid, label, text)


# Same entities as the per-partition NER pass, tagged with their partition index
tagged = rdd.mapPartitionsWithIndex(_with_partition_id)

tagged.take(5)   # -> [(pid, label, text), ...]

[(0, 'ORGANIZATION', 'CONTENTS Part One'),
 (0, 'ORGANIZATION', 'Bertolini'),
 (0, 'GPE', 'Santa'),
 (0, 'ORGANIZATION', 'No Baedeker'),
 (0, 'GPE', 'Music')]

In [None]:
# NLTK resources used by the NER pipeline in ents_in_partition:
#   averaged_perceptron_tagger_eng -> nltk.pos_tag
#   maxent_ne_chunker_tab, words   -> nltk.ne_chunk
# NOTE(review): the preceding cell already runs that pipeline (tagged.take),
# so on a fresh kernel these resources must be available before that cell.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
# Peek at the first few (label, text) entity records
ents_rdd.take(5)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


[('ORGANIZATION', 'CONTENTS Part One'),
 ('ORGANIZATION', 'Bertolini'),
 ('GPE', 'Santa'),
 ('ORGANIZATION', 'No Baedeker'),
 ('GPE', 'Music')]

In [None]:
# Count how often each distinct (label, text) entity occurs, then flatten the
# keyed result into (type, text, count) rows.
counts = (
    ents_rdd
    .map(lambda entity: (entity, 1))                 # ((label, text), 1)
    .reduceByKey(lambda left, right: left + right)   # ((label, text), total)
    .map(lambda pair: (*pair[0], pair[1]))           # (type, text, count)
)

In [None]:
# Peek at the raw (label, text) records that the counting steps start from
ents_rdd.take(5)

[('ORGANIZATION', 'CONTENTS Part One'),
 ('ORGANIZATION', 'Bertolini'),
 ('GPE', 'Santa'),
 ('ORGANIZATION', 'No Baedeker'),
 ('GPE', 'Music')]

In [None]:
# Step 1 of the `counts` pipeline, run on its own for inspection:
# pair every (label, text) entity with an initial count of 1
ents_rdd1=ents_rdd.map(lambda x:(x,1))
ents_rdd1.take(5)

[(('ORGANIZATION', 'CONTENTS Part One'), 1),
 (('ORGANIZATION', 'Bertolini'), 1),
 (('GPE', 'Santa'), 1),
 (('ORGANIZATION', 'No Baedeker'), 1),
 (('GPE', 'Music'), 1)]

In [None]:
# Step 2: sum the 1s per (label, text) key to get each entity's total count
ents_rdd2 = ents_rdd1.reduceByKey(lambda a,b : a+b)
ents_rdd2.take(5)

[(('ORGANIZATION', 'Reverend Cuthbert Eager'), 2),
 (('PERSON', 'Miss Eleanor Lavish'), 2),
 (('GPE', 'Carriages'), 2),
 (('PERSON', 'Chapter'), 12),
 (('PERSON', 'Chapter IX'), 1)]

In [None]:
# Step 3: flatten ((label, text), count) into (label, text, count) rows
ents_rdd3 = ents_rdd2.map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
ents_rdd3.take(5)

[('ORGANIZATION', 'Reverend Cuthbert Eager', 2),
 ('PERSON', 'Miss Eleanor Lavish', 2),
 ('GPE', 'Carriages', 2),
 ('PERSON', 'Chapter', 12),
 ('PERSON', 'Chapter IX', 1)]

In [None]:
# Turn the (type, text, count) rows into a DataFrame, most frequent first,
# and print the top 30 without truncating long entity names.
df = (
    counts
    .toDF(["type", "text", "count"])
    .orderBy("count", ascending=False)
)
df.show(30, truncate=False)

+------------+----------------+-----+
|type        |text            |count|
+------------+----------------+-----+
|PERSON      |Lucy            |351  |
|PERSON      |Cecil           |189  |
|PERSON      |Mr. Beebe       |142  |
|PERSON      |Miss Bartlett   |138  |
|PERSON      |George          |108  |
|PERSON      |Mr. Emerson     |71   |
|PERSON      |Charlotte       |67   |
|PERSON      |Mr.             |65   |
|GPE         |Freddy          |64   |
|PERSON      |Miss Lavish     |60   |
|GPE         |Lucy            |58   |
|PERSON      |Freddy          |57   |
|GPE         |Italy           |48   |
|PERSON      |Miss            |45   |
|PERSON      |Bartlett        |44   |
|GPE         |Florence        |41   |
|PERSON      |Miss Honeychurch|37   |
|GPE         |Cecil           |34   |
|ORGANIZATION|Miss Alans      |30   |
|PERSON      |Beebe           |28   |
|GPE         |Eager           |27   |
|GPE         |Greece          |27   |
|PERSON      |Eager           |26   |
|PERSON     