<a href="https://colab.research.google.com/github/sridhartroy/AIML/blob/main/BigDataAssignment1A_SpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Install required libraries
!pip install pyspark==3.5.6 spacy
!python -m spacy download en_core_web_sm

Collecting pyspark==3.5.6
  Downloading pyspark-3.5.6.tar.gz (317.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.4/317.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.6-py2.py3-none-any.whl size=317895798 sha256=a69b54882a4da0613cd2755371b70e88bf8e13d0b58eb7eb3eb8aa87ec989de2
  Stored in directory: /root/.cache/pip/wheels/64/62/f3/ec15656ea4ada0523cae62a1827fe7beb55d3c8c87174aad4a
Successfully built pyspark
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.1
    Uninstalling pyspark-3.5.1:
      Successfully uninstalled pyspark-3.5.1
Successfully installed pyspark-3.5.6
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_cor

In [21]:
# Step 2: Import libraries, we are using Spacy
import spacy
from pyspark.sql import SparkSession

# Step 3: Initialize Spark session
spark = SparkSession.builder \
    .appName("NER-WordCount") \
    .getOrCreate()

sc = spark.sparkContext

# Step 4: Download a large text file from Gutenberg
# Example: "Pride and Prejudice" by Jane Austen
!wget -O dostoevsky.txt https://www.gutenberg.org/ebooks/2641.txt.utf-8

# Step 5: Load text into Spark RDD
rdd = sc.textFile("dostoevsky.txt")

# Step 6: Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Step 7: Define function to extract Named Entities
def extract_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents]

# Step 8: Apply map-reduce (WordCount on Named Entities)
entities_rdd = rdd.flatMap(extract_entities) \
                  .map(lambda ent: (ent, 1)) \
                  .reduceByKey(lambda a, b: a + b) \
                  .sortBy(lambda x: (x[1], x[0]), ascending=False) # in case of tie.

# Step 9: Collect & display results
top_entities = entities_rdd.take(20)   # top 20 entities
for ent, count in top_entities:
    print(f"{ent} -> {count}")


--2025-09-28 04:54:48--  https://www.gutenberg.org/ebooks/2641.txt.utf-8
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://www.gutenberg.org/cache/epub/2641/pg2641.txt [following]
--2025-09-28 04:54:49--  http://www.gutenberg.org/cache/epub/2641/pg2641.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/2641/pg2641.txt [following]
--2025-09-28 04:54:50--  https://www.gutenberg.org/cache/epub/2641/pg2641.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 415864 (406K) [text/plain]
Saving to: ‘dostoevsky.txt’


2025-09-28 04:54:52 (462 KB/s) - ‘dostoevsky.txt’ saved [41

In [22]:
entities_df = entities_rdd.toDF(["text", "count"]).orderBy("count", ascending=False)
entities_df.show(30, truncate=False)

+-----------------+-----+
|text             |count|
+-----------------+-----+
|Lucy             |462  |
|Beebe            |196  |
|Bartlett         |167  |
|Freddy           |125  |
|Honeychurch      |102  |
|Emerson          |102  |
|George           |90   |
|Charlotte        |90   |
|one              |81   |
|two              |61   |
|Eager            |55   |
|Italy            |54   |
|Florence         |41   |
|first            |40   |
|Vyse             |40   |
|Miss Bartlett    |33   |
|Windy Corner     |30   |
|Alans            |30   |
|Project Gutenberg|29   |
|Emersons         |29   |
|Miss Lavish      |28   |
|Minnie           |27   |
|Greece           |27   |
|London           |26   |
|Rome             |25   |
|Italian          |21   |
|Gutenberg        |21   |
|Harry            |20   |
|English          |19   |
|Summer Street    |17   |
+-----------------+-----+
only showing top 30 rows



In [15]:
entities_rdd1 = rdd.flatMap(extract_entities)

entities_rdd1.take(5)


['the United States',
 'the Project Gutenberg License',
 'the United States',
 'E. M. Forster',
 'May 1, 2001']

In [16]:
entities_rdd2 = entities_rdd1.map(lambda ent: (ent, 1))

entities_rdd2.take(5)

[('the United States', 1),
 ('the Project Gutenberg License', 1),
 ('the United States', 1),
 ('E. M. Forster', 1),
 ('May 1, 2001', 1)]

In [17]:
entities_rdd3 = entities_rdd2.reduceByKey(lambda a, b: a + b)

entities_rdd3.take(5)

[('caterpillar', 1),
 ('first week', 1),
 ('Leonardesque', 1),
 ('John ascending', 1),
 ('next January', 1)]

In [20]:
entities_rdd4 = entities_rdd3.sortBy(lambda x: x[1], ascending=False,
                     numPartitions=entities_rdd3.getNumPartitions())


entities_rdd4.take(5)

[('Lucy', 462),
 ('Beebe', 196),
 ('Bartlett', 167),
 ('Freddy', 125),
 ('Emerson', 102)]