# 뉴스 &rarr; 크롤링 &rarr; NER


[Susan Li (Aug 17, 2018), "Named Entity Recognition with NLTK and SpaCy - NER is used in many fields in Natural Language Processing (NLP)"](https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da)에 소개된 구현을 따라, 웹사이트를 크롤링하여 텍스트를 가져오고 그 텍스트에서 개체명(Named Entity)을 추출하는 사례를 살펴보자.

# 뉴스기사 크롤링

`requests`를 이용하여 웹사이트를 가져오고 `bs4` 라이브러리를 활용하여 뉴스기사만 추출해낸다.

In [5]:
import requests
from bs4 import BeautifulSoup
import re

def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            through to ``requests.get``.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by a
        single space.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of passing an error page to NER.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop non-article markup so only readable text remains.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return re.sub(r'[\n\t]+', ' ', soup.get_text())

# Fetch the New York Times article and keep its extracted text for the NER steps.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')

# Preview the first 100 characters of the extracted text.
print(ny_bb[:100])

     F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times        


# 개체명 추출

`spacy` 라이브러리를 통해서 개체명을 추출해낸다.

In [7]:
import spacy
from spacy import displacy
import en_core_web_sm

# Load the spaCy pipeline shipped in the en_core_web_sm package.
nlp = en_core_web_sm.load()

# Run the pipeline over the scraped article text.
article = nlp(ny_bb)

# Number of named entities found in the article.
len(article.ents)

173

# 개체명 기초통계

추출된 개체명을 바탕으로 기초통계 작업을 수행해보자.

In [12]:
from collections import Counter

# Entity labels: show the five most frequent entity types.
labels = [entity.label_ for entity in article.ents]
Counter(labels).most_common(5)

[('PERSON', 83), ('ORG', 39), ('DATE', 23), ('GPE', 16), ('CARDINAL', 5)]

In [13]:
# Entity texts: show the five most frequently mentioned entities.

items = [entity.text for entity in article.ents]
Counter(items).most_common(5)

[('Strzok', 28), ('F.B.I.', 13), ('Trump', 12), ('Russia', 7), ('Clinton', 5)]

# 기사에서 문장 무작위 추출

`random` 라이브러리를 이용하여 기사에서 문장 하나를 무작위로 추출한다.

In [29]:
import random

# Fix the seed so the same sentence is drawn on every run.
random.seed(77111)

# Materialize the sentence spans so one can be picked by index.
sentences = list(article.sents)
sample_sentence = sentences[random.randrange(len(sentences))]
print(sample_sentence)

” Mr. Strzok, who rose over 20 years at the F.B.I. to become one of its most experienced counterintelligence agents, was a key figure in the early months of the inquiry.


## NER 시각화

In [30]:
# Re-parse the sampled sentence text and highlight its named entities inline.
displacy.render(nlp(str(sample_sentence)), jupyter=True, style='ent')

In [36]:
# Draw the dependency parse of the sampled sentence; the 'distance' option is
# set to 40, presumably to tighten the spacing between words.
displacy.render(nlp(str(sample_sentence)), style='dep', jupyter = True, options = {'distance': 40})

## 전체 뉴스기사 시각화

In [37]:
# Render the already-parsed article directly. Calling str() on the `sentences`
# list would feed the list's repr (brackets and separating commas) to the
# pipeline and parse the whole article a second time.
displacy.render(article, jupyter=True, style='ent')