<a href="https://colab.research.google.com/github/roi-mason/nlp-basics/blob/main/nlp_basics_004.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 텍스트에서 피처 **추출(1)**
* pp. 87~91
* 머신러닝 모델은 수치형 feature만 입력으로 사용할 수 있음
* 따라서 텍스트를 다룰 때는 텍스트에서 다음과 같은 feature를 추출해 수치로 변환해야 함
  - 일반적인 feature
    - 텍스트 언어
    - 총 단어 개수
  - 특수 feature
    - BoW (Bag of Words)
    - TF-IDF

In [8]:
# Extract general-purpose features from raw text
# Example 22 (pp. 88-91)

import pandas as pd

# Sample corpus: five sentences, one per row, all stored in a single
# column named 'text'. The column name is supplied up front through the
# dict key instead of being assigned to df.columns afterwards.
df = pd.DataFrame({
    'text': [
        "A Man and his Wife had the good fortune to possess a Goose which laid a Golden Egg every day.",
        "Lucky though they were, they soon began to think they were not getting rich fast enough, and, imagining the bird must be made of gold inside, they decided to kill it in order to secure the whole store of precious metal at once.",
        "But when they cut it open they found it was just like any other goose.",
        "Thus, they neither got rich all at once, as they had hoped, nor enjoyed any longer the daily addition to their wealth.",
        "Much wants more and loses all.",
    ]
})

print(df)



                                                text
0  A Man and his Wife had the good fortune to pos...
1  Lucky though they were, they soon began to thi...
2  But when they cut it open they found it was ju...
3  Thus, they neither got rich all at once, as th...
4                     Much wants more and loses all.


In [11]:
# Count the number of words in each text
# - Series.apply() calls the given function once per element of the column
# - TextBlob(...).words splits the text into word tokens
# - lambda syntax:  lambda argument: expression

import nltk
from textblob import TextBlob

# TextBlob tokenizes through NLTK's Punkt sentence tokenizer. Newer NLTK
# releases load that model from the 'punkt_tab' resource rather than the
# legacy 'punkt' pickle, so downloading only 'punkt' makes this cell fail
# with a LookupError on a fresh kernel. Fetching both keeps the cell
# runnable on old and new NLTK versions alike.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# str(x) guards against non-string cell values before tokenizing.
df['number_of_words'] = df['text'].apply(lambda x: len(TextBlob(str(x)).words))
df['number_of_words']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


0    20
1    43
2    15
3    22
4     6
Name: number_of_words, dtype: int64

In [14]:
# Check whether specific words are present in each text
# Here: question words (the wh-words, plus 'how')
wh_words = {'why', 'who', 'which', 'what', 'where', 'when', 'how'}

# The lookup set is all lower-case, so every token is lower-cased before
# matching; otherwise a sentence-initial "Which" or "When" would be missed.
# isdisjoint() is True when the two collections share no element, so its
# negation is exactly "at least one wh-word occurs in the text".
df['is_wh_words_present'] = df['text'].apply(
    lambda x: not wh_words.isdisjoint(word.lower() for word in TextBlob(str(x)).words)
)
df['is_wh_words_present']

0     True
1    False
2     True
3    False
4    False
Name: is_wh_words_present, dtype: bool

In [15]:
# Assign a sentiment score to each sentence


def _polarity_of(text):
    """Return the TextBlob sentiment polarity of ``text`` (coerced to str first)."""
    return TextBlob(str(text)).sentiment.polarity


df['polarity'] = df['text'].apply(_polarity_of)
df['polarity']

0    0.500000
1    0.286905
2   -0.062500
3    0.291667
4    0.133333
Name: polarity, dtype: float64

In [19]:
# 언어 구분
# TextBlob의 detect_language()는 더 이상 유효하지 않음
# langdetect로 교체해서 사용

!pip install langdetect
import langdetect

df['language'] = df['text'].apply(lambda x: langdetect.detect(x))
df['language']

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=8b1f8f5c7c6a5c8d0cd84ba6f76de823c15c6166fc3f63862c1698b070941878
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


0    en
1    en
2    en
3    en
4    en
Name: language, dtype: object