<a href="https://colab.research.google.com/github/rtajeong/AI_Cluster/blob/main/lab50_text_tokenizer_Eng_Korean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Text Classification: Survey
- https://github.com/kk7nc/Text_Classification

## 1. Feature extraction
- text cleaning and preprocessing
- tokenization
- stop words
- noise removal
- spelling correction
- stemming
- lemmatization

In [None]:
!pip install nltk



In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# tokenization
import nltk

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases (3.9+) load it
# from the 'punkt_tab' package rather than the older pickled 'punkt' package, and
# nltk is installed unpinned above, so fetch both to keep a fresh "Run All" working.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize

# Split one sentence into tokens; punctuation (the comma) becomes its own token.
text = "After sleeping for four hours, he decided to sleep for another four"
tokens = word_tokenize(text)
print(tokens)

['After', 'sleeping', 'for', 'four', 'hours', ',', 'he', 'decided', 'to', 'sleep', 'for', 'another', 'four']


In [None]:
# stop words
from nltk.corpus import stopwords
# Fetch the stop-word corpus that stopwords.words(...) reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Remove English stop words from a tokenized sentence.
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))  # set: fast membership tests
word_tokens = word_tokenize(example_sent)
# Compare the lower-cased token: a case-sensitive test lets capitalised stop
# words such as "This" slip through the filter.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [None]:
# capitalization: show the sentence as written, then folded to lower case
text = "The United States of America (USA) or America, is a federal republic composed of 50 states"
print(text, text.lower(), sep="\n")

The United States of America (USA) or America, is a federal republic composed of 50 states
the united states of america (usa) or america, is a federal republic composed of 50 states


In [None]:
# noise removal
import re
# Ordered (compiled pattern, replacement) rules used by text_cleaner.
# Order matters: each rule runs on the output of the previous one.
_CLEANING_RULES = [
    (re.compile(r'>\s+'), '>'),  # drop whitespace right after any '>'
    (re.compile(r'\s+'), ' '),  # collapse every whitespace run (newlines included) to one space
    (re.compile(r'\s*<br\s*/?>\s*'), '\n'),  # <br> / <br/> becomes a newline
    (re.compile(r'</(div)\s*>\s*'), '\n'),  # newline after </div>
    (re.compile(r'</(p|h\d)\s*>\s*'), '\n\n'),  # blank line after </p> and </h1>, </h2>, ...
    (re.compile(r'<head>.*<\s*(/head|body)[^>]*>'), ''),  # drop <head> through </head> or <body ...>
    # Non-greedy '.*?' so each link is rewritten on its own; a greedy '.*' would
    # swallow everything between the first <a ...> and the last </a>.
    (re.compile(r'<a\s+href="([^"]+)"[^>]*>.*?</a>'), r'\1'),  # show the link target instead of its text
    (re.compile(r'[ \t]*<[^<]*?/?>'), ''),  # remove remaining tags (and blanks just before them)
    (re.compile(r'^\s+'), ''),  # remove whitespace at the very beginning
]


def text_cleaner(text):
    """Strip HTML noise from ``text`` and return it lower-cased.

    The rules in ``_CLEANING_RULES`` are applied in order: whitespace is
    normalised, ``<br>``, ``</div>``, ``</p>`` and ``</hN>`` become newlines,
    the ``<head>`` section is dropped, every ``<a href="...">`` link is
    replaced by its URL, and any remaining tag is removed.

    Args:
        text: Raw string, possibly containing HTML markup.

    Returns:
        The cleaned string with leading and trailing whitespace removed and
        all characters lower-cased (URLs taken from links included).
    """
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)
    return text.rstrip().lower()

text_cleaner('  The   United States of America is a \
              federal republic composed of 50 states.  ')

'the united states of america is a federal republic composed of 50 states.'

In [None]:
!pip install autocorrect

Collecting autocorrect
  Downloading autocorrect-2.5.0.tar.gz (622 kB)
[K     |████████████████████████████████| 622 kB 5.1 MB/s 
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.5.0-py3-none-any.whl size=621851 sha256=4d6a858a46a6ec802ce89814f3d51b0eabb1f563841485bf90311c9cb417636f
  Stored in directory: /root/.cache/pip/wheels/3d/8e/bd/f6fd900a056a031bf710a00bca338d86f43b83f0c25ab5242f
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.5.0


In [None]:
# spelling correction
# autocorrect.spell is deprecated (it prints a warning on every call); build a
# Speller instance instead. It is bound to the name `spell` so it is called the same way.
from autocorrect import Speller
spell = Speller()
spell('caaaar'), spell('mussage'), spell('survice'), spell('hte')

autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead


('aaaaaa', 'message', 'service', 'the')

In [None]:
# stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly", "studying", "taxies"]
# Print one stem per line, in the same order as the input words.
print("\n".join(ps.stem(word) for word in example_words))

python
python
python
python
pythonli
studi
taxi


In [None]:
# lemmatization: map an inflected word to its base dictionary form (lemma),
# e.g. "cats" -> "cat" below.
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# NOTE(review): lemmatize() is called without a part-of-speech argument —
# confirm the default is appropriate before using this on verbs or adjectives.
print(lemmatizer.lemmatize("cats"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
cat


# 한글

In [None]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 121 kB/s 
[?25hCollecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.6 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 52.7 MB/s 
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2


In [None]:
from konlpy.tag import Okt
# Run the three Okt analyses on one sample sentence.
okt = Okt()
sentence = "우리가 이 과제를 잘 할 수 있을까?"

print(okt.morphs(sentence))                      # morpheme extraction
print(okt.pos(sentence, norm=True, stem=True))  # part-of-speech tagging
print(okt.nouns(sentence))                       # noun extraction

['우리', '가', '이', '과제', '를', '잘', '할', '수', '있을까', '?']
[('우리', 'Noun'), ('가', 'Josa'), ('이', 'Noun'), ('과제', 'Noun'), ('를', 'Josa'), ('자다', 'Verb'), ('하다', 'Verb'), ('수', 'Noun'), ('있다', 'Adjective'), ('?', 'Punctuation')]
['우리', '이', '과제', '수']


- 1) morphs : 형태소 추출
- 2) pos : 품사 태깅(Part-of-speech tagging)
- 3) nouns : 명사 추출

In [None]:
from konlpy.tag import Okt
okt = Okt()
word_tags = okt.pos("우리가 이 과제를 잘 할 수 있을까?", norm=True, stem=True)
print(word_tags)
# Collect every token whose tag is "Josa" as the stop-word list.
stop_words = [token for token, tag in word_tags if tag == "Josa"]
print(stop_words)

[('우리', 'Noun'), ('가', 'Josa'), ('이', 'Noun'), ('과제', 'Noun'), ('를', 'Josa'), ('자다', 'Verb'), ('하다', 'Verb'), ('수', 'Noun'), ('있다', 'Adjective'), ('?', 'Punctuation')]
['가', '를']


In [None]:
from konlpy.tag import Kkma
# Same three analyses with the Kkma analyzer, for comparison with Okt.
kkma = Kkma()
sentence = "우리가 이 과제를 잘 할 수 있을까?"

print(kkma.morphs(sentence))  # morpheme extraction
print(kkma.pos(sentence))     # part-of-speech tagging
print(kkma.nouns(sentence))   # noun extraction

['우리', '가', '이', '과제', '를', '잘', '하', 'ㄹ', '수', '있', '을까', '?']
[('우리', 'NP'), ('가', 'JKS'), ('이', 'MDT'), ('과제', 'NNG'), ('를', 'JKO'), ('잘', 'MAG'), ('하', 'VV'), ('ㄹ', 'ETD'), ('수', 'NNB'), ('있', 'VV'), ('을까', 'EFQ'), ('?', 'SF')]
['우리', '과제', '수']
