<a href="https://colab.research.google.com/github/seawavve/PeekABook/blob/main/NER/Spark_NER_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 환경설정

In [1]:
# Download the John Snow Labs Colab setup script and pipe it straight into bash.
# NOTE(review): the script is fetched over plain HTTP and no versions are pinned
# on this line — consider HTTPS and explicit version pins; confirm the supported
# options against the Spark NLP installation docs.
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-07-08 10:56:58--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-07-08 10:56:58--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1608 (1.6K) [text/plain]
Saving to: ‘STDOUT’

-                     0%[                    ]       0  --.-KB/s               setup Colab for PySpark 3.0.3 and Spark NLP 3.1.2

2021-07-08 10:56:59 (1.94 

In [2]:
import json
import os
from pyspark.ml import Pipeline
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp

# Start the Spark session through sparknlp.start(). get_ann_pipeline() reads
# this module-level `spark` to create the DataFrame it fits the pipeline on.
spark = sparknlp.start()

def get_ann_pipeline ():
    """Assemble the pretrained NER pipeline and wrap it in a LightPipeline.

    Stages, in order: document assembler ("text" -> "document"), sentence
    detector (with a newline as custom bound), tokenizer, pretrained POS
    tagger, pretrained word embeddings, pretrained NerDL model and NER
    converter ("ner" -> "ner_chunk"). Every ``pretrained()`` call is made
    without arguments, so whichever default model Spark NLP resolves is used.

    The pipeline is fitted on a one-row DataFrame holding an empty string,
    created with the module-level ``spark`` session.

    Returns:
        LightPipeline wrapping the fitted pipeline.
    """
    to_document = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol('document')
    )

    split_sentences = (
        SentenceDetector()
        .setInputCols(['document'])
        .setOutputCol('sentence')
        .setCustomBounds(['\n'])
    )

    split_tokens = (
        Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
    )

    # The three pretrained models are requested in the same order as the
    # stages that consume them: POS tagger, embeddings, NER model.
    tag_pos = (
        PerceptronModel.pretrained()
        .setInputCols(["sentence", "token"])
        .setOutputCol("pos")
    )

    embed_words = (
        WordEmbeddingsModel.pretrained()
        .setInputCols(["sentence", "token"])
        .setOutputCol("embeddings")
    )

    tag_entities = (
        NerDLModel.pretrained()
        .setInputCols(["sentence", "token", "embeddings"])
        .setOutputCol("ner")
    )

    merge_chunks = (
        NerConverter()
        .setInputCols(["sentence", "token", "ner"])
        .setOutputCol("ner_chunk")
    )

    untrained = Pipeline(
        stages=[
            to_document,
            split_sentences,
            split_tokens,
            tag_pos,
            embed_words,
            tag_entities,
            merge_chunks,
        ]
    )

    # An empty one-row frame is enough to fit: it only has to provide the
    # "text" input column.
    blank_frame = spark.createDataFrame([[""]]).toDF("text")
    light = LightPipeline(untrained.fit(blank_frame))

    print ("Spark NLP NER lightpipeline is created")

    return light


In [3]:
# Build the light pipeline once and keep it at module level; the cells below
# call conll_pipeline.annotate(...) on it.
conll_pipeline = get_ann_pipeline ()

pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]
Spark NLP NER lightpipeline is created


# 테스트

In [4]:
# Annotate one sample sentence and list every output column with its values.
# `parsed` stays at module level because the next cell reads it.
parsed = conll_pipeline.annotate("Peter Parker Baker is in a baby blue Cadillac.")

for column, values in parsed.items():
    print(column, ': ', values)

document :  ['Peter Parker Baker is in a baby blue Cadillac.']
ner_chunk :  ['Peter Parker Baker', 'Cadillac']
pos :  ['NNP', 'NNP', 'NNP', 'VBZ', 'IN', 'DT', 'NN', 'JJ', 'NNP', '.']
token :  ['Peter', 'Parker', 'Baker', 'is', 'in', 'a', 'baby', 'blue', 'Cadillac', '.']
ner :  ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O']
embeddings :  ['Peter', 'Parker', 'Baker', 'is', 'in', 'a', 'baby', 'blue', 'Cadillac', '.']
sentence :  ['Peter Parker Baker is in a baby blue Cadillac.']


In [5]:
# Render the annotated sample as space-separated rows, one token per line:
# token, POS tag, POS tag again (it fills both the second and third column),
# NER tag.
conll_lines = ''.join(
    "{} {} {} {}\n".format(word, tag, tag, label)
    for word, tag, label in zip(parsed['token'], parsed['pos'], parsed['ner'])
)

print(conll_lines)

Peter NNP NNP B-PER
Parker NNP NNP I-PER
Baker NNP NNP I-PER
is VBZ VBZ O
in IN IN O
a DT DT O
baby NN NN O
blue JJ JJ O
Cadillac NNP NNP B-ORG
. . . O



In [6]:
# Sample sentences used to try out get_person_entity in this cell.
sentences=['Peter Parker Baker is in a baby blue Cadillac.','I love you.','Hanhee fell a sleep']

def _collect_person_names(tokens, tags):
    """Join runs of B-PER / I-PER tagged tokens into full person names.

    Args:
        tokens: token strings of one sentence.
        tags: IOB NER tags aligned one-to-one with ``tokens``.

    Returns:
        list[str]: one space-joined name per person entity, in order of
        appearance.
    """
    names = []
    inside_person = False  # True while the previous token was part of a person
    for token, tag in zip(tokens, tags):
        if tag == 'B-PER':
            names.append(token)
            inside_person = True
        elif tag == 'I-PER':
            if inside_person:
                names[-1] += ' ' + token
            else:
                # Orphan I-PER (no B-PER/I-PER directly before it): start a
                # new name instead of indexing an empty list or gluing the
                # token onto an earlier, non-adjacent name.
                names.append(token)
                inside_person = True
        else:
            inside_person = False
    return names


def get_person_entity(sentences, pipeline=None):
    """Extract the person names found in each sentence.

    Args:
        sentences: iterable of sentence strings (a list or a pandas Series).
        pipeline: object exposing ``annotate(text)`` that returns a dict with
            aligned ``'token'`` and ``'ner'`` lists. Defaults to the
            module-level ``conll_pipeline``.

    Returns:
        list[list[str]]: for every input sentence, the list of person names
        (empty when the sentence contains none).
    """
    if pipeline is None:
        pipeline = conll_pipeline

    data = []
    for sentence in sentences:
        parsed = pipeline.annotate(sentence)
        # Token text comes from the 'token' column; the recorded sample output
        # shows the 'embeddings' column carrying the same strings.
        data.append(_collect_person_names(parsed['token'], parsed['ner']))
    return data

# Prints one list of person names per input sentence.
print(get_person_entity(sentences))

[['Peter Parker Baker'], [], ['Hanhee']]


In [9]:
import pandas as pd

# Add a 'person' column holding the names found in each sentence of the
# prediction table, show it, and save the enriched table.
# NOTE(review): the recorded sentences start with b' — they look like
# stringified bytes; confirm whether they should be decoded before NER.
df = pd.read_csv('Predicted_result.csv')
df['person'] = get_person_entity(df['sentence'])
display(df)
# index=False keeps the DataFrame index out of the file. The recorded table
# already carries 'Unnamed: 0.1' / 'Unnamed: 0' columns, which look like index
# columns left behind by earlier saves; writing the index again would add
# another one on the next read.
df.to_csv('Entity_Predicted_result.csv', index=False)

Unnamed: 0.1,Unnamed: 0,sentence,mark,predicted_mark,person
0,0,b' He ended by imploring Zobeida not to confou...,3,0,[Zobeida]
1,1,"b' Is this, continued Zobeida, growing more an...",2,2,[Zobeida]
2,2,"b' But I am very angry with your brothers, and...",2,2,[]
3,3,"b' Both I and my subjects esteem you, and wish...",0,1,[]
4,4,b' The Sultan was so delighted to hear these w...,1,1,[Sultan]
...,...,...,...,...,...
167,167,b' He was delighted.',1,1,[]
168,168,"b' But no, it will be better if I throw you in...",2,2,[]
169,169,"b' To his great surprise, he heard her saying ...",1,1,[Rejoice]
170,170,b' When he arrived home his wife and children ...,1,1,[]
