# 자연어 처리 NLP(Natural Language Processing) | 텍스트 분석 Text Analysis

- 자연어 처리: 사람이 사용하는 언어 전반에 대해서 이해하고 처리하는 분야
    - 음성인식, 번역, 감정분석, 요약, 질의응답, 언어생성 등 포괄적 분야
- 텍스트 분석: 언어적 비정형 데이터에서 정보를 추출하고 분석하는 작업
    - 텍스트 통계적 분석, 주제 분류, 텍스트 군집, 유사도 분석 등

In [1]:
# Install nltk (the command below is left commented out)
# !conda install nltk -y

3 channel Terms of Service accepted
Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.





    current version: 25.5.1
    latest version: 25.7.0

Please update conda by running

    $ conda update -n base -c defaults conda




In [2]:
import nltk
# Echo the installed NLTK version as the cell output
nltk.__version__

'3.9.1'

In [3]:
# Download NLTK resources
nltk.download('punkt')          # data required for tokenization
nltk.download('punkt_tab')      # presumably the table-format punkt data used by the tokenizers — TODO confirm
nltk.download('stopwords')      # stop-word lists

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample sentence ending in "!!!" to show how punctuation is tokenized
text = "NLTK is a powerful library for NLP!!!"

word_tokenize(text)     # word tokenization

['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP', '!', '!', '!']

In [5]:
# Three-sentence sample text (one sentence per line) for sentence tokenization
text = '''The Matrix is everywhere its all around us, here even in this room.
You can see it out your window or on your television.
You feel it when you go to work, or go to church or pay your taxes.'''

sent_tokenize(text)     # split the text into sentences

['The Matrix is everywhere its all around us, here even in this room.',
 'You can see it out your window or on your television.',
 'You feel it when you go to work, or go to church or pay your taxes.']

In [6]:
# Word tokenization per sentence: split into sentences first, then tokenize each one
for sentence in sent_tokenize(text):
    print(word_tokenize(sentence))

def tokenize_text(text):
    """Return the word tokens of ``text`` grouped by sentence.

    The text is first split into sentences with ``sent_tokenize``; each
    sentence is then passed to ``word_tokenize``, so the result is a list
    containing one token list per sentence.
    """
    return list(map(word_tokenize, sent_tokenize(text)))

# Display the per-sentence token lists for the sample text
tokenize_text(text)

['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']
['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.']
['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']


[['The',
  'Matrix',
  'is',
  'everywhere',
  'its',
  'all',
  'around',
  'us',
  ',',
  'here',
  'even',
  'in',
  'this',
  'room',
  '.'],
 ['You',
  'can',
  'see',
  'it',
  'out',
  'your',
  'window',
  'or',
  'on',
  'your',
  'television',
  '.'],
 ['You',
  'feel',
  'it',
  'when',
  'you',
  'go',
  'to',
  'work',
  ',',
  'or',
  'go',
  'to',
  'church',
  'or',
  'pay',
  'your',
  'taxes',
  '.']]

In [7]:
# n-grams: sequences of n consecutive tokens
from nltk import ngrams

text = "The Matrix is everywhere its all around us, here even in this room."

tokens = word_tokenize(text)
print(tokens)

# list() materializes the n-gram iterable directly; a pass-through
# comprehension ([t for t in it]) is just a slower spelling of the same copy.
bigram = ngrams(tokens, 2)
print(list(bigram))

trigram = ngrams(tokens, 3)
print(list(trigram))

['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']
[('The', 'Matrix'), ('Matrix', 'is'), ('is', 'everywhere'), ('everywhere', 'its'), ('its', 'all'), ('all', 'around'), ('around', 'us'), ('us', ','), (',', 'here'), ('here', 'even'), ('even', 'in'), ('in', 'this'), ('this', 'room'), ('room', '.')]
[('The', 'Matrix', 'is'), ('Matrix', 'is', 'everywhere'), ('is', 'everywhere', 'its'), ('everywhere', 'its', 'all'), ('its', 'all', 'around'), ('all', 'around', 'us'), ('around', 'us', ','), ('us', ',', 'here'), (',', 'here', 'even'), ('here', 'even', 'in'), ('even', 'in', 'this'), ('in', 'this', 'room'), ('this', 'room', '.')]


In [8]:
from nltk.corpus import stopwords

# NOTE: the result of this call is not shown — a notebook cell echoes only its
# last expression. Wrap it in print() to actually see the list.
stopwords.fileids()     # list of supported languages

stopwords.words('english')      # English stop-word list

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [9]:
text = "The Matrix is everywhere its all around us, here even in this room."
stopwords_list = stopwords.words('english')
# A list is scanned linearly on every membership test; a set makes each
# lookup constant-time. The list itself is kept under its original name.
stopword_set = set(stopwords_list)

# Tokenize, lowercase every token, then keep only the non-stop-words.
# Punctuation tokens are not stop words, so they remain in the result.
lowered = (word.lower() for word in word_tokenize(text))
tokens = [word for word in lowered if word not in stopword_set]
tokens
    

['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.']

### 특성 벡터화 Feature Vectorization
- BOW > CountVectorizer



In [10]:
# Two short sample documents collected into the list `texts`.
# Adjacent string literals inside parentheses are concatenated at compile
# time, so each document is one single-line string with no embedded newlines.
text1 = (
    'The Matrix is everywhere its all around us, here even in this room. '
    'You can see it out your window or on your television. '
    'You feel it when you go to work, or go to church or pay your taxes.'
)

text2 = (
    'You take the blue pill and the story ends.  You wake in your bed and you believe whatever you want to believe '
    'You take the red pill and you stay in Wonderland and I show you how deep the rabbit-hole goes.'
)

texts = [text1, text2]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
# fit() extracts the unique words (tokens) and builds the vocabulary;
# transform() then turns each document into a vector over that vocabulary.
text_vecs = count_vectorizer.fit(texts).transform(texts)

# The vectors hold term frequencies. They come back as a sparse csr_matrix:
# since most entries are 0, only the non-zero ones are stored.
# toarray() produces the dense ndarray view shown as the cell output.
dense_vecs = text_vecs.toarray()
print(type(text_vecs), type(dense_vecs))
dense_vecs

<class 'scipy.sparse._csr.csr_matrix'> <class 'numpy.ndarray'>


array([[1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 2, 0, 1, 0, 0, 1, 1, 2, 1,
        1, 1, 3, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 0, 0,
        0, 1, 1, 0, 1, 3, 3],
       [0, 4, 0, 1, 2, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 2, 0, 0, 0,
        0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 2, 0, 0, 4, 0, 1, 0, 1, 1,
        1, 0, 0, 1, 0, 7, 1]])

In [12]:
# Print the feature names, then the word -> index mapping
print(count_vectorizer.get_feature_names_out())
print(count_vectorizer.vocabulary_)     # dict of word -> number; the number is the word's index

['all' 'and' 'around' 'bed' 'believe' 'blue' 'can' 'church' 'deep' 'ends'
 'even' 'everywhere' 'feel' 'go' 'goes' 'here' 'hole' 'how' 'in' 'is' 'it'
 'its' 'matrix' 'on' 'or' 'out' 'pay' 'pill' 'rabbit' 'red' 'room' 'see'
 'show' 'stay' 'story' 'take' 'taxes' 'television' 'the' 'this' 'to' 'us'
 'wake' 'want' 'whatever' 'when' 'window' 'wonderland' 'work' 'you' 'your']
{'the': 38, 'matrix': 22, 'is': 19, 'everywhere': 11, 'its': 21, 'all': 0, 'around': 2, 'us': 41, 'here': 15, 'even': 10, 'in': 18, 'this': 39, 'room': 30, 'you': 49, 'can': 6, 'see': 31, 'it': 20, 'out': 25, 'your': 50, 'window': 46, 'or': 24, 'on': 23, 'television': 37, 'feel': 12, 'when': 45, 'go': 13, 'to': 40, 'work': 48, 'church': 7, 'pay': 26, 'taxes': 36, 'take': 35, 'blue': 5, 'pill': 27, 'and': 1, 'story': 34, 'ends': 9, 'wake': 42, 'bed': 3, 'believe': 4, 'whatever': 44, 'want': 43, 'red': 29, 'stay': 33, 'wonderland': 47, 'show': 32, 'how': 17, 'deep': 8, 'rabbit': 28, 'hole': 16, 'goes': 14}


In [13]:
import pandas as pd

# Sort the (word, index) pairs by index and load them into a DataFrame
vocab = sorted(count_vectorizer.vocabulary_.items(),key=lambda x : x[1])
vocab_df = pd.DataFrame(vocab, columns=['word','idx'])
vocab_df

Unnamed: 0,word,idx
0,all,0
1,and,1
2,around,2
3,bed,3
4,believe,4
5,blue,5
6,can,6
7,church,7
8,deep,8
9,ends,9


In [14]:
# Total occurrences of each vocabulary entry across all documents
# (column-wise sum of the dense document-term array).
word_counts = text_vecs.toarray().sum(axis=0)

# Look up every word's total with a single vectorized index operation
# instead of calling a Python lambda once per row.
vocab_df['count'] = word_counts[vocab_df['idx'].to_numpy()]

# The index column is dropped once the counts are attached.
vocab_df = vocab_df.drop(columns=['idx'])

vocab_df

Unnamed: 0,word,count
0,all,1
1,and,4
2,around,1
3,bed,1
4,believe,2
5,blue,1
6,can,1
7,church,1
8,deep,1
9,ends,1


In [15]:
# Re-fit the vectorizer with stop_words='english' so English stop words are excluded
count_vectorizer = CountVectorizer(stop_words='english')
text_vecs = count_vectorizer.fit_transform(texts)
# The sparse matrix exposes .shape itself; converting to a dense array just
# to read the shape allocates a full copy for nothing.
print(text_vecs.shape)

# Sort the (word, index) pairs by index and load them into a DataFrame
vocab = sorted(count_vectorizer.vocabulary_.items(), key=lambda x: x[1])
vocab_df = pd.DataFrame(vocab, columns=['word', 'idx'])
vocab_df

(2, 24)


Unnamed: 0,word,idx
0,bed,0
1,believe,1
2,blue,2
3,church,3
4,deep,4
5,ends,5
6,feel,6
7,goes,7
8,hole,8
9,matrix,9


In [16]:
count_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),     # (min n, max n): the range of n-gram sizes to extract
    max_features=20,
)
texts_vecs = count_vectorizer.fit_transform(texts)

# Read the shape straight from the sparse matrix; no dense copy is needed.
print(texts_vecs.shape)
count_vectorizer.get_feature_names_out()

(2, 20)


array(['bed', 'bed believe', 'believe', 'believe red', 'believe want',
       'blue', 'blue pill', 'church', 'church pay', 'deep', 'deep rabbit',
       'ends', 'ends wake', 'feel', 'feel work', 'goes', 'hole',
       'hole goes', 'matrix', 'pill'], dtype=object)

- BOW > TfidfVectorizer

- BOW > TfidfVectorizer
    - TF-IDF == Term Frequency-Inverse Document Frequency 

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the same documents with TfidfVectorizer, again passing stop_words='english'
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
texts_vecs = tfidf_vectorizer.fit_transform(texts)

print(texts_vecs.toarray())     # dense view of the resulting matrix
tfidf_vectorizer.get_feature_names_out()

[[0.         0.         0.         0.33333333 0.         0.
  0.33333333 0.         0.         0.33333333 0.33333333 0.
  0.         0.         0.33333333 0.         0.         0.33333333
  0.33333333 0.         0.         0.33333333 0.         0.33333333]
 [0.21821789 0.43643578 0.21821789 0.         0.21821789 0.21821789
  0.         0.21821789 0.21821789 0.         0.         0.43643578
  0.21821789 0.21821789 0.         0.21821789 0.21821789 0.
  0.         0.21821789 0.21821789 0.         0.21821789 0.        ]]


array(['bed', 'believe', 'blue', 'church', 'deep', 'ends', 'feel', 'goes',
       'hole', 'matrix', 'pay', 'pill', 'rabbit', 'red', 'room', 'stay',
       'story', 'taxes', 'television', 'wake', 'want', 'window',
       'wonderland', 'work'], dtype=object)