## nltk와 spacy의 ner을 한번 비교 해보고 apis로 띄워보자

### nltk ner

In [15]:
import nltk

In [2]:
nltk.download("all", quiet=True)

True

In [16]:
from nltk import word_tokenize, pos_tag, ne_chunk
sentence = "Jim bought 300 shares of Acme Corp. in 2006."
sentence = pos_tag(word_tokenize(sentence))
print(sentence)

[('Jim', 'NNP'), ('bought', 'VBD'), ('300', 'CD'), ('shares', 'NNS'), ('of', 'IN'), ('Acme', 'NNP'), ('Corp.', 'NNP'), ('in', 'IN'), ('2006', 'CD'), ('.', '.')]


In [60]:
chunk_sentence = ne_chunk(sentence)
print(chunk_sentence)

(S
  (PERSON Jim/NNP)
  bought/VBD
  300/CD
  shares/NNS
  of/IN
  (ORGANIZATION Acme/NNP Corp./NNP)
  in/IN
  2006/CD
  ./.)


### spacy ner

In [74]:
import spacy
from collections import Counter
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")
spacy.prefer_gpu()

True

In [102]:
doc = nlp("Jim bought 300 shares of Acme Corp. in 2006.")
result = []
for entity in doc.ents:
    result.append([entity.text, entity.label_])
print(result)


[['Jim', 'PERSON'], ['300', 'CARDINAL'], ['Acme Corp.', 'ORG'], ['2006', 'DATE']]


### spacy ner을 fast api로 띄우기

In [156]:
import requests

response = requests.post(
    url = 'http://127.0.0.1:8000/kor_ner',
    json = {
        "user_input": "Jim bought 300 shares of Acme Corp. in 2006."
    }
)
print(response.json())

{'result': {'inputs': 'Jim bought 300 shares of Acme Corp. in 2006.', 'ner': {'input_ids': {}, 'token_type_ids': {}, 'attention_mask': {}}}, 'error': None}


In [172]:
import requests

response = requests.post(
    url = 'http://127.0.0.1:8000/kor_ner',
    json = {
        "text": "손흥민이 콜로세움에 갔다"
    }
)
print(response.json())

{'result': {'input_ids': {}, 'token_type_ids': {}, 'attention_mask': {}}}


In [145]:
import torch
from transformers import (
    AutoTokenizer,
    ElectraForTokenClassification,
    BertForSequenceClassification
)

In [143]:
NAME = 'sgunderscore/hatescore-korean-hate-speech'

In [146]:
tokenizer = AutoTokenizer.from_pretrained(NAME)
model = BertForSequenceClassification.from_pretrained(NAME)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [147]:
sent = "손흥민이 나랑 축구를 하고 싶다고 해서 서울로 왔다"

In [151]:
sent = "아 시발 개 족같은 련아"

In [183]:
tokenized_sent = tokenizer(sent, truncation = True, return_tensors="pt")
print(tokenized_sent[0].tokens[1:-1])
print(tokenized_sent)
result = model(**tokenized_sent).logits
print(result)
result = torch.argmax(result.cpu().detach(), axis = -1)
print(result)
for token, ner in zip(tokenized_sent[0].tokens, result):
    print(token, ner.numpy())

['아', '시발', '개', '족', '##같은', '련', '##아']
{'input_ids': tensor([[    2,  2170, 13552,   220,  2573,  8036,  1184,  4085,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


AttributeError: 'Tensor' object has no attribute 'logits'

In [177]:
import requests

url = "http://localhost:8000/analyze_sentiment"
data = {"text": "손흥민이 축구를 한다."}
response = requests.post(url, json=data)

if response.ok:
    result = response.json()
    print("Sentiment: ", result["sentiment"])
else:
    print("Error:", response.status_code, response.text)

Sentiment:  {}
