# PySpark Part of Speech (POS) analysis
Text taken from [Reuters](https://www.reuters.com/business/finance/banks-beware-outsiders-are-cracking-code-finance-2021-09-17/).

In [1]:
import findspark
findspark.init()

In [2]:
import nltk
from pyspark import SparkContext

In [3]:
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
sc = SparkContext("spark://dankpad:7077", appName = "pyspark-pos-analysis-astagg")

In [5]:
# Loading a text file
rdd_reuters = sc.textFile("./data/reuters.txt")

In [6]:
rdd_reuters.count()

87

In [28]:
map_count = {}
for line in rdd_reuters.collect():
    line_tokens = nltk.word_tokenize(line)
    tags = nltk.pos_tag(line_tokens)
    for (_, wtype) in tags:
        if wtype in map_count:
            map_count[wtype] += 1
        else:
            map_count[wtype] = 0

In [29]:
wtype_translate = {
    'CC': 'coordinating conjunction',
    'CD': 'cardinal digit',
    'DT': 'determiner',
    'EX': 'existential there (like: “there is” … think of it like “there exists”)',
    'FW': 'foreign word',
    'IN': 'preposition/subordinating conjunction',
    'JJ': 'adjective ‘big’',
    'JJR': 'adjective, comparative ‘bigger’',
    'JJS': 'adjective, superlative ‘biggest’',
    'LS': 'list marker 1',
    'MD': 'modal could, will',
    'NN': 'noun, singular ‘desk’',
    'NNS': 'noun plural ‘desks’',
    'NNP':'proper noun, singular ‘Harrison’',
    'NNPS': 'proper noun, plural ‘Americans’',
    'PDT': 'predeterminer \'all the kids\'',
    'POS': 'possessive ending parent\'s',
    'PRP': 'personal pronoun I, he, she',
    'PRP$': 'possessive pronoun my, his, hers',
    'RB': 'adverb very, silently',
    'RBR': 'adverb, comparative better',
    'RBS': 'adverb, superlative best',
    'RP': 'particle give up',
    'TO': 'to go ‘to’ the store',
    'UH': 'interjection, errrrrrrrm',
    'VB': 'verb, base form take',
    'VBD': 'verb, past tense took',
    'VBG': 'verb, gerund/present participle taking',
    'VBN': 'verb, past participle taken',
    'VBP': 'verb, sing. present, non-3d take',
    'VBZ': 'verb, 3rd person sing. present takes',
    'WDT': 'wh-determiner which',
    'WP': 'wh-pronoun who, what',
    'WP$': 'possessive wh-pronoun whose',
    'WRB': 'wh-abverb where, when',
}

In [42]:
print('En el texto hay:')
for k, v in map_count.items():
    if k in wtype_translate:
        word_type = wtype_translate[k]
    else:
        continue

    print(f'{v} {word_type}')

En el texto hay:
125 noun plural ‘desks’
167 noun, singular ‘desk’
126 proper noun, singular ‘Harrison’
44 coordinating conjunction
40 verb, sing. present, non-3d take
27 verb, gerund/present participle taking
81 determiner
142 preposition/subordinating conjunction
34 cardinal digit
0 proper noun, plural ‘Americans’
18 modal could, will
60 verb, base form take
41 personal pronoun I, he, she
50 adverb very, silently
87 adjective ‘big’
41 to go ‘to’ the store
6 particle give up
25 verb, past participle taken
33 verb, past tense took
36 verb, 3rd person sing. present takes
7 wh-abverb where, when
12 possessive pronoun my, his, hers
2 adjective, superlative ‘biggest’
7 wh-determiner which
0 predeterminer 'all the kids'
0 wh-pronoun who, what
9 possessive ending parent's
4 adjective, comparative ‘bigger’
0 adverb, superlative best
0 existential there (like: “there is” … think of it like “there exists”)
0 adverb, comparative better
