<a href="https://colab.research.google.com/github/serdarbozoglan/My_Pyspark/blob/master/SparkNLP_Training01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Install the headless OpenJDK 8 JDK; apt output is discarded via the redirect to /dev/null.
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Point JAVA_HOME at the JDK directory and put its bin/ first on PATH,
# so the `java` command run below resolves to this JDK.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# Sanity check: print the Java version that is now active.
! java -version

# Install PySpark, pinned to version 2.4.4.
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP, pinned to version 2.4.5.
! pip install --ignore-installed spark-nlp==2.4.5

openjdk version "1.8.0_242"
OpenJDK Runtime Environment (build 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.242-b08, mixed mode)
Collecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 69kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 52.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130388 sha256=2bef30878b01f7c702686d85111ee7d4084340af0e3da14e08562b0235a15026
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d18423005

In [0]:
import pandas as pd

## Start Spark

In [3]:
import sparknlp
# Start Spark through Spark NLP's `start()` helper; the returned object is kept
# as `spark` and queried for its version below.
spark = sparknlp.start()

# Report the library versions in use so the saved outputs can be reproduced.
print('Spark NLP Version : ', sparknlp.version())
print('Apache Spark Version :', spark.version)

Spark NLP Version :  2.4.5
Apache Spark Version : 2.4.4


## Using Pretrained Pipelines

https://github.com/JohnSnowLabs/spark-nlp-models

In [0]:
from sparknlp.pretrained import PretrainedPipeline

In [5]:
# Load the pretrained English 'explain_document_ml' pipeline
# (the log below shows it being downloaded).
pipeline = PretrainedPipeline('explain_document_ml', lang='en')

explain_document_ml download started this may take some time.
Approx size to download 9.4 MB
[OK!]


In [0]:
# Sample text used by the pipelines below. The trailing backslashes join the
# three source lines into a single line of text.
# NOTE: the misspellings ("pioner", "wrate", "aircrast") look intentional --
# the spell-check outputs later in the notebook correct them -- so do not fix them here.
testDoc = """French author who helped pioner the science-fiction genre. \
Verne wrate about space, air, and underwater travel before navigable aircrast and \
practical submarines were invented, and before any means of space travel had been devised."""

In [7]:
testDoc

'French author who helped pioner the science-fiction genre. Verne wrate about space, air, and underwater travel before navigable aircrast and practical submarines were invented, and before any means of space travel had been devised.'

In [8]:
%%time

# Run the explain_document_ml pipeline on the sample text; per the cells below,
# `result` is a dict mapping annotator names to lists of strings.
result = pipeline.annotate(testDoc)

CPU times: user 34.9 ms, sys: 9.77 ms, total: 44.6 ms
Wall time: 2.26 s


In [9]:
result.keys()

dict_keys(['document', 'spell', 'pos', 'lemmas', 'token', 'stems', 'sentence'])

In [10]:
result['sentence']

['French author who helped pioner the science-fiction genre.',
 'Verne wrate about space, air, and underwater travel before navigable aircrast and practical submarines were invented, and before any means of space travel had been devised.']

In [11]:
result['token']

['French',
 'author',
 'who',
 'helped',
 'pioner',
 'the',
 'science-fiction',
 'genre',
 '.',
 'Verne',
 'wrate',
 'about',
 'space',
 ',',
 'air',
 ',',
 'and',
 'underwater',
 'travel',
 'before',
 'navigable',
 'aircrast',
 'and',
 'practical',
 'submarines',
 'were',
 'invented',
 ',',
 'and',
 'before',
 'any',
 'means',
 'of',
 'space',
 'travel',
 'had',
 'been',
 'devised',
 '.']

In [12]:
# Pair each token with the part-of-speech tag at the same position.
list(zip(result['token'], result['pos']))

[('French', 'JJ'),
 ('author', 'NN'),
 ('who', 'WP'),
 ('helped', 'VBD'),
 ('pioner', 'NN'),
 ('the', 'DT'),
 ('science-fiction', 'NN'),
 ('genre', 'NN'),
 ('.', '.'),
 ('Verne', 'NNP'),
 ('wrate', 'VBD'),
 ('about', 'IN'),
 ('space', 'NN'),
 (',', ','),
 ('air', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('underwater', 'JJ'),
 ('travel', 'NN'),
 ('before', 'IN'),
 ('navigable', 'JJ'),
 ('aircrast', 'NN'),
 ('and', 'CC'),
 ('practical', 'JJ'),
 ('submarines', 'NNS'),
 ('were', 'VBD'),
 ('invented', 'VBN'),
 (',', ','),
 ('and', 'CC'),
 ('before', 'IN'),
 ('any', 'DT'),
 ('means', 'NNS'),
 ('of', 'IN'),
 ('space', 'NN'),
 ('travel', 'NN'),
 ('had', 'VBD'),
 ('been', 'VBN'),
 ('devised', 'VBN'),
 ('.', '.')]

In [13]:
# Line up each token with its lemma, stem and spell-checked form, position by position.
list(zip(result['token'], result['lemmas'], result['stems'], result['spell']))

[('French', 'French', 'french', 'French'),
 ('author', 'author', 'author', 'author'),
 ('who', 'who', 'who', 'who'),
 ('helped', 'help', 'help', 'helped'),
 ('pioner', 'pioneer', 'pioneer', 'pioneer'),
 ('the', 'the', 'the', 'the'),
 ('science-fiction', 'sciencefiction', 'sciencefict', 'sciencefiction'),
 ('genre', 'genre', 'genr', 'genre'),
 ('.', '.', '.', '.'),
 ('Verne', 'Verne', 'vern', 'Verne'),
 ('wrate', 'write', 'wrote', 'wrote'),
 ('about', 'about', 'about', 'about'),
 ('space', 'space', 'space', 'space'),
 (',', ',', ',', ','),
 ('air', 'air', 'air', 'air'),
 (',', ',', ',', ','),
 ('and', 'and', 'and', 'and'),
 ('underwater', 'underwater', 'underwat', 'underwater'),
 ('travel', 'travel', 'travel', 'travel'),
 ('before', 'before', 'befor', 'before'),
 ('navigable', 'navigable', 'navig', 'navigable'),
 ('aircrast', 'aircraft', 'aircraft', 'aircraft'),
 ('and', 'and', 'and', 'and'),
 ('practical', 'practical', 'practic', 'practical'),
 ('submarines', 'submarine', 'submarin', '

In [14]:
# Tabulate the explain_document_ml output, one row per token.
# Maps each DataFrame column name to the `result` key that supplies its values.
column_sources = {
    "token": 'token',
    "corrected": 'spell',
    'POS': 'pos',
    'lemmas': 'lemmas',
    'stems': 'stems',
}
df = pd.DataFrame({column: result[key] for column, key in column_sources.items()})
df

Unnamed: 0,token,corrected,POS,lemmas,stems
0,French,French,JJ,French,french
1,author,author,NN,author,author
2,who,who,WP,who,who
3,helped,helped,VBD,help,help
4,pioner,pioneer,NN,pioneer,pioneer
5,the,the,DT,the,the
6,science-fiction,sciencefiction,NN,sciencefiction,sciencefict
7,genre,genre,NN,genre,genr
8,.,.,.,.,.
9,Verne,Verne,NNP,Verne,vern


## Explain Document DL

In [15]:
# Load the pretrained English 'explain_document_dl' pipeline; per the outputs
# below it also produces 'ner' and 'entities' annotations.
pipeline_dl = PretrainedPipeline('explain_document_dl', 'en')

explain_document_dl download started this may take some time.
Approx size to download 168.4 MB
[OK!]


In [16]:
%%time
# Annotate the same sample text with the DL pipeline.
# NOTE: this rebinds `result`, replacing the explain_document_ml output from earlier.
result = pipeline_dl.annotate(testDoc)

CPU times: user 33.1 ms, sys: 12.7 ms, total: 45.8 ms
Wall time: 724 ms


In [17]:
result.keys()

dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])

In [18]:
result['entities']

['French', 'Verne']

In [19]:
# Tabulate the explain_document_dl output, one row per token; each column is
# filled from one key of `result`.
df = pd.DataFrame(dict(
    token=result['token'],
    ner_label=result['ner'],
    spell_corrected=result['checked'],
    POS=result['pos'],
    lemma=result['lemma'],
    stem=result['stem'],
))
df

Unnamed: 0,token,ner_label,spell_corrected,POS,lemma,stem
0,French,B-MISC,French,JJ,French,french
1,author,O,author,NN,author,author
2,who,O,who,WP,who,who
3,helped,O,helped,VBD,help,help
4,pioner,O,pioneer,NN,pioneer,pioneer
5,the,O,the,DT,the,the
6,science-fiction,O,sciencefiction,NN,sciencefiction,sciencefict
7,genre,O,genre,NN,genre,genr
8,.,O,.,.,.,.
9,Verne,B-PER,Verne,NNP,Verne,vern


## Spell Checking

In [20]:
# Load the pretrained English 'check_spelling' pipeline.
spell_checker = PretrainedPipeline('check_spelling', 'en')

check_spelling download started this may take some time.
Approx size to download 892.6 KB
[OK!]


In [0]:
# Spell-check the sample text; this rebinds `result` to the spell checker's output.
result = spell_checker.annotate(testDoc)

In [22]:
result.keys()

dict_keys(['document', 'sentence', 'token', 'checked'])

In [23]:
# Show each original token next to its spell-checked form.
list(zip(result['token'], result['checked']))

[('French', 'French'),
 ('author', 'author'),
 ('who', 'who'),
 ('helped', 'helped'),
 ('pioner', 'pioneer'),
 ('the', 'the'),
 ('science-fiction', 'science-fiction'),
 ('genre', 'genre'),
 ('.', '.'),
 ('Verne', 'Vern'),
 ('wrate', 'wrote'),
 ('about', 'about'),
 ('space', 'space'),
 (',', ','),
 ('air', 'air'),
 (',', ','),
 ('and', 'and'),
 ('underwater', 'underwater'),
 ('travel', 'travel'),
 ('before', 'before'),
 ('navigable', 'navigable'),
 ('aircrast', 'aircraft'),
 ('and', 'and'),
 ('practical', 'practical'),
 ('submarines', 'submarines'),
 ('were', 'were'),
 ('invented', 'invented'),
 (',', ','),
 ('and', 'and'),
 ('before', 'before'),
 ('any', 'any'),
 ('means', 'means'),
 ('of', 'of'),
 ('space', 'space'),
 ('travel', 'travel'),
 ('had', 'had'),
 ('been', 'been'),
 ('devised', 'devised'),
 ('.', '.')]

## Parsing a list of texts

In [0]:
# Sample input as a list of three separate texts
# (same deliberate misspellings as in `testDoc`).
testDoc_list = [ 'French author who helped pioner the science-fiction genre.',
'Verne wrate about space, air, and underwater travel before navigable aircrast',
'Practical submarines were invented, and before any means of space travel had been devised.']

In [0]:
# Annotate the whole list in one call; per the outputs below, the result is a
# list with one dict per input text.
result_list = pipeline.annotate(testDoc_list)

In [26]:
len(result_list)

3

In [27]:
result_list[0]

{'document': ['French author who helped pioner the science-fiction genre.'],
 'lemmas': ['French',
  'author',
  'who',
  'help',
  'pioneer',
  'the',
  'sciencefiction',
  'genre',
  '.'],
 'pos': ['JJ', 'NN', 'WP', 'VBD', 'NN', 'DT', 'NN', 'NN', '.'],
 'sentence': ['French author who helped pioner the science-fiction genre.'],
 'spell': ['French',
  'author',
  'who',
  'helped',
  'pioneer',
  'the',
  'sciencefiction',
  'genre',
  '.'],
 'stems': ['french',
  'author',
  'who',
  'help',
  'pioneer',
  'the',
  'sciencefict',
  'genr',
  '.'],
 'token': ['French',
  'author',
  'who',
  'helped',
  'pioner',
  'the',
  'science-fiction',
  'genre',
  '.']}

## Using fullAnnotate to get more details

In [0]:
text = 'Peter Parker is a nice guy and lives in New York'

In [0]:
# fullAnnotate returns (per the output below) a list of dicts whose values are
# Annotation objects carrying begin/end offsets and metadata, not plain strings.
detailed_result = pipeline_dl.fullAnnotate(text)

In [30]:
detailed_result

[{'checked': [Annotation(token, 0, 4, Peter, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 6, 11, Parker, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 13, 14, is, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 16, 16, a, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 18, 21, nice, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 23, 25, guy, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 27, 29, and, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 31, 35, lives, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 37, 38, in, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 40, 42, New, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 44, 47, York, {'confidence': '1.0', 'sentence': '0'})],
  'document': [Annotation(document, 0, 47, Peter Parker is a nice guy and lives in New York, {})],
  'embeddings': [Annotation(word_embeddings, 0, 4, Peter, {'is

In [31]:
detailed_result[0]['entities']

[Annotation(chunk, 0, 11, Peter Parker, {'entity': 'PER', 'sentence': '0', 'chunk': '0'}),
 Annotation(chunk, 40, 47, New York, {'entity': 'LOC', 'sentence': '0', 'chunk': '1'})]

In [32]:
detailed_result[0]['entities'][0]

Annotation(chunk, 0, 11, Peter Parker, {'entity': 'PER', 'sentence': '0', 'chunk': '0'})

In [33]:
# `.result` is an attribute (not a method call) that holds the annotation's text value.
detailed_result[0]['entities'][0].result

'Peter Parker'

In [34]:
detailed_result[0]['entities'][0].metadata['entity']

'PER'

In [0]:
# Collect the text and the entity label of every entity chunk found in the
# first (and only) annotated document, then tabulate them side by side.
entity_annotations = detailed_result[0]['entities']
chunks = [annotation.result for annotation in entity_annotations]
entities = [annotation.metadata['entity'] for annotation in entity_annotations]

df = pd.DataFrame({'chunks': chunks, 'entities': entities})

In [37]:
# NOTE(review): the saved output below shows token-level columns
# (sent_id, token, start, end, pos, ner), not the chunks/entities frame built in
# the previous cell, and this cell's execution count (37) follows the cell
# numbered 36 that appears further down. The saved output looks stale --
# re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,sent_id,token,start,end,pos,ner
0,0,Peter,0,4,NNP,B-PER
1,0,Parker,6,11,NNP,I-PER
2,0,is,13,14,VBZ,O
3,0,a,16,16,DT,O
4,0,nice,18,21,JJ,O
5,0,guy,23,25,NN,O
6,0,and,27,29,CC,O
7,0,lives,31,35,NNS,O
8,0,in,37,38,IN,O
9,0,New,40,42,NNP,B-LOC


In [36]:
# Build one row per token by walking the token, POS and NER annotations in
# parallel: sentence id (from the token metadata), token text, begin/end
# offsets, POS tag and NER tag.
annotated = detailed_result[0]
tuples = [
    (int(tok.metadata['sentence']), tok.result, tok.begin, tok.end, pos.result, ner.result)
    for tok, pos, ner in zip(annotated["token"], annotated['pos'], annotated['ner'])
]

df = pd.DataFrame(tuples, columns=['sent_id', 'token', 'start', 'end', 'pos', 'ner'])
df

Unnamed: 0,sent_id,token,start,end,pos,ner
0,0,Peter,0,4,NNP,B-PER
1,0,Parker,6,11,NNP,I-PER
2,0,is,13,14,VBZ,O
3,0,a,16,16,DT,O
4,0,nice,18,21,JJ,O
5,0,guy,23,25,NN,O
6,0,and,27,29,CC,O
7,0,lives,31,35,NNS,O
8,0,in,37,38,IN,O
9,0,New,40,42,NNP,B-LOC


## Use the Pretrained **match_chunks** Pipeline to Extract Individual Noun Phrases

The pipeline uses the regex pattern `<DT>?<JJ>*<NN>+`, i.e. an optional determiner (DT, such as "the" or "a"), followed by zero or more adjectives (JJ), followed by one or more nouns (NN).

In [54]:
# Load the pretrained English 'match_chunks' pipeline.
# NOTE: this rebinds `pipeline`, which previously held explain_document_ml.
pipeline = PretrainedPipeline('match_chunks', 'en')


match_chunks download started this may take some time.
Approx size to download 4.3 MB
[OK!]


In [0]:
# Extract chunks from two short sentences; the matches appear under the 'chunk' key.
result = pipeline.annotate('The book has many chapters. The red car seems awesome')

In [56]:
result

{'chunk': ['The book', 'The red car'],
 'document': ['The book has many chapters. The red car seems awesome'],
 'pos': ['DT', 'NN', 'VBZ', 'JJ', 'NNS', '.', 'DT', 'JJ', 'NN', 'VBZ', 'JJ'],
 'sentence': ['The book has many chapters.', 'The red car seems awesome'],
 'token': ['The',
  'book',
  'has',
  'many',
  'chapters',
  '.',
  'The',
  'red',
  'car',
  'seems',
  'awesome']}

In [0]:
# A longer example with several adjectives in front of the noun.
result = pipeline.annotate('the little brown pitty bird was trying to fly and the green dog was barking at it')

In [58]:
result

{'chunk': ['the little brown pitty bird', 'the green dog'],
 'document': ['the little brown pitty bird was trying to fly and the green dog was barking at it'],
 'pos': ['DT',
  'JJ',
  'JJ',
  'NN',
  'NN',
  'VBD',
  'VBG',
  'TO',
  'VB',
  'CC',
  'DT',
  'JJ',
  'NN',
  'VBD',
  'VBG',
  'IN',
  'PRP'],
 'sentence': ['the little brown pitty bird was trying to fly and the green dog was barking at it'],
 'token': ['the',
  'little',
  'brown',
  'pitty',
  'bird',
  'was',
  'trying',
  'to',
  'fly',
  'and',
  'the',
  'green',
  'dog',
  'was',
  'barking',
  'at',
  'it']}

In [43]:
# NOTE(review): this cell's execution count (43) is lower than its neighbours
# (54-59), and its saved output lists one chunk while the `result` shown just
# above lists two. The saved output looks stale -- re-run the cells in order.
result['chunk']

['the little brown pitty bird']

## Extract Exact Dates from Referential Date Phrases

In [59]:
# Load the pretrained English 'match_datetime' pipeline (rebinds `pipeline` again).
pipeline = PretrainedPipeline('match_datetime', 'en')

match_datetime download started this may take some time.
Approx size to download 12.9 KB
[OK!]


In [0]:
# Relative date phrases ("yesterday", "next week") come back as concrete dates
# under the 'date' key, as the output below shows.
result = pipeline.annotate('I saw him yesterday and he told me that he would visit us next week.')

In [61]:
result

{'date': ['2020/04/27', '2020/04/19'],
 'document': ['I saw him yesterday and he told me that he would visit us next week.'],
 'sentence': ['I saw him yesterday and he told me that he would visit us next week.'],
 'token': ['I',
  'saw',
  'him',
  'yesterday',
  'and',
  'he',
  'told',
  'me',
  'that',
  'he',
  'would',
  'visit',
  'us',
  'next',
  'week',
  '.']}

In [0]:
# Same sentence through fullAnnotate, to also get the character offsets of each matched date phrase.
result = pipeline.fullAnnotate('I saw him yesterday and he told me that he would visit us next week.')

In [63]:
result

[{'date': [Annotation(date, 58, 66, 2020/04/27, {'sentence': '0'}),
   Annotation(date, 10, 18, 2020/04/19, {'sentence': '0'})],
  'document': [Annotation(document, 0, 67, I saw him yesterday and he told me that he would visit us next week., {})],
  'sentence': [Annotation(document, 0, 67, I saw him yesterday and he told me that he would visit us next week., {'sentence': '0'})],
  'token': [Annotation(token, 0, 0, I, {'sentence': '0'}),
   Annotation(token, 2, 4, saw, {'sentence': '0'}),
   Annotation(token, 6, 8, him, {'sentence': '0'}),
   Annotation(token, 10, 18, yesterday, {'sentence': '0'}),
   Annotation(token, 20, 22, and, {'sentence': '0'}),
   Annotation(token, 24, 25, he, {'sentence': '0'}),
   Annotation(token, 27, 30, told, {'sentence': '0'}),
   Annotation(token, 32, 33, me, {'sentence': '0'}),
   Annotation(token, 35, 38, that, {'sentence': '0'}),
   Annotation(token, 40, 41, he, {'sentence': '0'}),
   Annotation(token, 43, 47, would, {'sentence': '0'}),
   Annotation(to

## Sentiment Analysis

In [64]:
# Load the pretrained English 'analyze_sentiment' pipeline (rebinds `pipeline`).
pipeline = PretrainedPipeline('analyze_sentiment', lang='en')

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [0]:
# Classify a single sentence; the predicted label appears under the 'sentiment' key.
result = pipeline.annotate('The movie I watched was not a good one')

In [66]:
result

{'checked': ['The', 'movie', 'I', 'watched', 'was', 'not', 'a', 'good', 'one'],
 'document': ['The movie I watched was not a good one'],
 'sentence': ['The movie I watched was not a good one'],
 'sentiment': ['negative'],
 'token': ['The', 'movie', 'I', 'watched', 'was', 'not', 'a', 'good', 'one']}

In [67]:
result['sentiment']

['negative']