# 텍스트 정제하기

In [1]:
text_data = ["   Interrobang. By Aishwarya Henriette     ",
             "Parking And Going. By Karl Gautier",
             "    Today Is The night. By Jarek Prakash   "]

In [2]:
# 공백 문자 제거
strip_whitespace = [string.strip() for string in text_data]
strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [4]:
# 마침표 제거
remove_periods = [string.replace('.', '') for string in strip_whitespace]
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [5]:
# 사용자 정의 함수
def capitalizer(string:str) -> str:
    return string.upper()

[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [6]:
import re

def replace_letters_with_X(string : str) -> str:
    return re.sub(r"[a-zA-Z]", "X", string)

[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

# HTML 파싱과 정제

In [7]:
from bs4 import BeautifulSoup

In [8]:
# 예제 HTML 코드를 만듭니다.
html = """
       <div class='full_name'><span style='font-weight:bold'>
       Masego</span> Azra</div>"
       """

In [9]:
soup = BeautifulSoup(html, 'lxml')

In [10]:
soup.find('div', {'class':'full_name'}).text

'\n       Masego Azra'

# 구두점 삭제

In [16]:
import unicodedata
import sys

In [17]:
text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']

In [18]:
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) 
             if unicodedata.category(chr(i)).startswith('P'))

In [19]:
[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

# 텍스트 토큰화

In [20]:
import nltk



In [21]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/restful3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
from nltk.tokenize import word_tokenize

In [23]:
string = "The science of today is the technology of tomorrow"

In [24]:
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [25]:
from nltk.tokenize import sent_tokenize

In [26]:
string = "The science of today is the technology of tomorrow. Tomorrow is today."

In [27]:
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

# 불용어 삭제

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/restful3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
from nltk.corpus import stopwords

In [30]:
# 단어 토큰을 만듭니다.
tokenized_words = ['i',
                   'am',
                   'going',
                   'to',
                   'go',
                   'to',
                   'the',
                   'store',
                   'and',
                   'park']

In [31]:
stop_words = stopwords.words('english')

In [32]:
[word for word in tokenized_words if word not in stop_words]

['going', 'go', 'store', 'park']

In [33]:
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [34]:
len(stop_words)

179

In [35]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [36]:
len(ENGLISH_STOP_WORDS)

318

In [38]:
list(ENGLISH_STOP_WORDS)[:5]

['fire', 'describe', 'only', 'whereafter', 'whoever']

# 어간 추출

In [39]:
from nltk.stem.porter import PorterStemmer

In [40]:
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

In [41]:
porter = PorterStemmer()

In [42]:
[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

# 품사 태깅

In [43]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/restful3/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [44]:
from nltk import pos_tag
from nltk import word_tokenize

In [45]:
text_data = 'Chris loved outdoor running'

In [46]:
text_tagged = pos_tag(word_tokenize(text_data))

In [47]:
text_tagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [48]:
[word for word, tag in text_tagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']]

['Chris']

In [49]:
from sklearn.preprocessing import MultiLabelBinarizer

In [50]:
tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field",
          "San Francisco is an awesome city"]


In [58]:
tagged_tweets = []
for tweet in tweets:
    tweet_tag = nltk.pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])
tagged_tweets    

[['PRP', 'VBP', 'VBG', 'DT', 'NN', 'IN', 'NN'],
 ['JJ', 'NN', 'VBZ', 'DT', 'JJ', 'NN'],
 ['NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN']]

In [59]:
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [60]:
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

In [61]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /home/restful3/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [64]:
from nltk.corpus import brown
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

In [66]:
sentences = brown.tagged_sents(categories='news')
sentences

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [67]:
train = sentences[:4000]
test = sentences[4000:]

In [69]:
unigram = UnigramTagger(train)
bigram = BigramTagger(train)
trigram = TrigramTagger(train)

In [71]:
unigram.evaluate(test)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram.evaluate(test)


0.8111044507717668

In [72]:
bigram.evaluate(test)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bigram.evaluate(test)


0.10205304960287727

In [73]:
trigram.evaluate(test)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  trigram.evaluate(test)


0.06451371197362506

In [76]:
# !pip install konlpy

In [77]:
from konlpy.tag import Okt

In [78]:
okt = Okt()

In [79]:
text = '태양계는 지금으로부터 약 46억 년 전, 거대한 분자 구름의 일부분이 중력 붕괴를 일으키면서 형성되었다'

In [80]:
okt.pos(text)

[('태양계', 'Noun'),
 ('는', 'Josa'),
 ('지금', 'Noun'),
 ('으로부터', 'Josa'),
 ('약', 'Noun'),
 ('46억', 'Number'),
 ('년', 'Noun'),
 ('전', 'Noun'),
 (',', 'Punctuation'),
 ('거대한', 'Adjective'),
 ('분자', 'Noun'),
 ('구름', 'Noun'),
 ('의', 'Josa'),
 ('일부분', 'Noun'),
 ('이', 'Josa'),
 ('중력', 'Noun'),
 ('붕괴', 'Noun'),
 ('를', 'Josa'),
 ('일으키면서', 'Verb'),
 ('형성', 'Noun'),
 ('되었다', 'Verb')]

In [81]:
okt.morphs(text)

['태양계',
 '는',
 '지금',
 '으로부터',
 '약',
 '46억',
 '년',
 '전',
 ',',
 '거대한',
 '분자',
 '구름',
 '의',
 '일부분',
 '이',
 '중력',
 '붕괴',
 '를',
 '일으키면서',
 '형성',
 '되었다']

# 텍스트를 BoW로 인코딩

In [82]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# 텍스트를 만듭니다.
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

In [83]:
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

In [85]:
bag_of_words

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [86]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]])

In [87]:
count.get_feature_names_out()

array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

In [88]:
count_2gram = CountVectorizer(ngram_range=(1, 2), stop_words='english', vocabulary=['brazil'])

In [89]:
bag = count_2gram.fit_transform(text_data)

In [90]:
bag.toarray()

array([[2],
       [0],
       [0]])

In [91]:
count_2gram.vocabulary_

{'brazil': 0}

In [92]:
count_2gram = CountVectorizer(ngram_range=(1, 2), stop_words='english')
bag = count_2gram.fit_transform(text_data)
bag.toarray()

array([[0, 0, 2, 1, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
       [1, 0, 0, 0, 1, 1, 0, 0, 0, 0]])

In [93]:
count_2gram.get_feature_names_out()

array(['beats', 'best', 'brazil', 'brazil brazil', 'germany',
       'germany beats', 'love', 'love brazil', 'sweden', 'sweden best'],
      dtype=object)

In [94]:
count_2gram.vocabulary_

{'love': 6,
 'brazil': 2,
 'love brazil': 7,
 'brazil brazil': 3,
 'sweden': 8,
 'best': 1,
 'sweden best': 9,
 'germany': 4,
 'beats': 0,
 'germany beats': 5}

# 단어 중요도에 가중치 부여

In [95]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [96]:
# 텍스트를 만듭니다.
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

In [97]:
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

In [98]:
feature_matrix

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [99]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [100]:
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}