### Objective:
- Count how often each word occurs in a text file by building an RDD pipeline with `pyspark`

In [2]:
pip install textblob tweepy nltk

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Collecting tweepy
  Downloading tweepy-4.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting requests-oauthlib<2,>=1.2.0 (from tweepy)
  Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.4.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tweepy-4.14.0-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 kB[0m [31m20.0 MB/s[0m eta 

In [3]:
from pyspark import SparkConf, SparkContext
from operator import add
from textblob.utils import strip_punc
from nltk.corpus import stopwords
from operator import itemgetter

### Create Configurations for pyspark

## Using a cluster: the master is selected with a `cluster URL`
- Here we use `local[*]`, which tells Spark to run locally with as many worker threads as there are cores available
- E.g., on a local computer with 4 cores, Spark runs 4 worker threads

In [4]:
# Master URL handed to Spark; 'local[*]' is the value used for this notebook.
cluster_url = 'local[*]'

# Application settings: the application name plus the master URL chosen above.
configuration = (
    SparkConf()
    .setAppName('RomeoJulietCounter')
    .setMaster(cluster_url)
)

In [5]:
## Create the Spark context from the configuration.
## getOrCreate() returns the already-running context when one exists, so
## re-running this cell does not fail because a SparkContext is already active.
sc = SparkContext.getOrCreate(conf=configuration)

### Reading text file with pyspark
- Create a pipeline that transforms the text into words so that they can be counted

In [6]:
# Read the play as an RDD of lines.
lines = sc.textFile('Romeo and Juliet.txt')

# Lower-case each line and split it on whitespace, flattening into one RDD of words.
lowercase_words = lines.flatMap(lambda text: text.lower().split())

# Pass every word through strip_punc(..., all=True) to drop its punctuation.
tokenized = lowercase_words.map(lambda token: strip_punc(token, all=True))

### Filter the Words by excluding STOP_WORDS

In [7]:
# pip install nltk

In [8]:
# Create the set of stop words from english.txt (the file is split on newlines).
# The `with` block closes the file handle once the content has been read, and
# a set gives constant-time membership tests when the words are filtered.
with open("english.txt", "r") as stop_words_file:
    stop_words = set(stop_words_file.read().split("\n"))

In [9]:
### Keep only the words that do not appear in stop_words
def _is_not_stop_word(token):
    """Return True when `token` is absent from `stop_words`."""
    return token not in stop_words


filtered = tokenized.filter(_is_not_stop_word)

### Filter RDD by mapReduce
- Reduce by key to count the occurrences of each unique word, using the `add` function

In [10]:
## Step 1: turn every word into a (word, 1) tuple
word_pairs = filtered.map(lambda token: (token, 1))

## Step 2: combine the tuples that share a word by adding up their counts
word_counts = word_pairs.reduceByKey(add)

### Filter by second index
- The second element of each tuple is the `Total Count` of that unique word
- Keep only the words whose count is at least 60

In [11]:
# Minimum number of occurrences a word needs in order to be kept.
MIN_COUNT = 60

# Each element is a (word, count) tuple; keep those whose count reaches MIN_COUNT.
filtered_counts = word_counts.filter(lambda pair: pair[1] >= MIN_COUNT)

### Sorting by Descending order
- `collect()` is an action: calling it triggers all of the transformations defined above and returns the results to the driver

In [12]:
# Fetch the (word, count) tuples as a list, then order them in place
# by count (tuple index 1), largest count first.
sorted_items = filtered_counts.collect()
sorted_items.sort(key=itemgetter(1), reverse=True)

### Display the sorted word counts

In [13]:
# Length of the longest word in the results. default=0 keeps this cell from
# raising ValueError when sorted_items is empty.
max_len = max((len(word) for word, _ in sorted_items), default=0)

# Print one "word count" line per entry, in the order of sorted_items.
# For right-aligned words use: print(f'{word:>{max_len}}: {count}')
for word, count in sorted_items:
    print(word, count)

thou 277
thy 164
rom 163
nurse 150
romeo 143
thee 138
love 137
jul 117
shall 112
come 99
friar 92
project 89
ill 84
enter 82
good 82
go 76
man 72
well 69
death 69
lady 68
night 68
juliet 65
may 65
ben 64
hath 64
one 62
mer 62


In [14]:
# Show the length of the longest word in sorted_items.
max_len

7