
# Text Classification: Survey
- https://github.com/kk7nc/Text_Classification

## 1. Feature extraction
- text cleaning and preprocessing
- tokenization
- stop words
- noise removal
- spelling correction
- stemming
- lemmatization

In [1]:
!pip install nltk



In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# tokenization
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
from nltk.tokenize import word_tokenize

text = "After sleeping for four hours, he decided to sleep for another four"
tokens = word_tokenize(text)
print(tokens)

['After', 'sleeping', 'for', 'four', 'hours', ',', 'he', 'decided', 'to', 'sleep', 'for', 'another', 'four']


In [5]:
# stop words
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if w not in stop_words]
print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [7]:
len(stop_words)

198

In [8]:
# captitalization
text = "The United States of America (USA) or America, is a federal republic composed of 50 states"
print(text)
print(text.lower())

The United States of America (USA) or America, is a federal republic composed of 50 states
the united states of america (usa) or america, is a federal republic composed of 50 states


In [9]:
# noise removal
import re
def text_cleaner(text):
    rules = [
        {r'>\s+': u'>'},  # remove one or more spaces(\s+) after a tag closes
                         # u: unicode (in Python3.0 all strings are unicode by default
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]
    for rule in rules:
        for (k, v) in rule.items():
            regex = re.compile(k)
            text = regex.sub(v, text)
    text = text.rstrip()
    return text.lower()

text_cleaner('  The   United States of America is a \
              federal republic composed of 50 states.  ')

'the united states of america is a federal republic composed of 50 states.'

In [10]:
!pip install autocorrect

Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/622.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m553.0/622.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622364 sha256=204d6d2f86484332827c2777892f4ff34ee8bf07e9714716bd781c8219318d38
  Stored in directory: /root/.cache/pip/wheels/b6/28/c2/9ddf8f57f871b55b6fd0ab99c887531fb9a66e5ff236b82aee
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2

In [11]:
# spelling correction
from autocorrect import spell
spell('caar'), spell('mussage'), spell('survice'), spell('hte')

autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead


('car', 'message', 'service', 'the')

In [12]:
# stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly", "studying", "taxies", "unhappiness"]
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli
studi
taxi
unhappi


In [13]:
# lemmatization: eliminating redundant prefix or suffix of a word and extract the base word (lemma)
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
example_words = ["cats", "unhappiness", "studying", "studied", "studies"]

for w in example_words:
    print(lemmatizer.lemmatize(w))

[nltk_data] Downloading package wordnet to /root/nltk_data...


cat
unhappiness
studying
studied
study


# 한글

In [14]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.0 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (495 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.9/495.9 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.6.0 konlpy-0.6.0


In [15]:
from konlpy.tag import Okt
okt=Okt()

print(okt.morphs("우리가 이 과제를 잘 할 수 있을까?"))
print(okt.pos("우리가 이 과제를 잘 할 수 있을까?", norm=True, stem=True))
print(okt.nouns("우리가 이 과제를 잘 할 수 있을까?"))

['우리', '가', '이', '과제', '를', '잘', '할', '수', '있을까', '?']
[('우리', 'Noun'), ('가', 'Josa'), ('이', 'Noun'), ('과제', 'Noun'), ('를', 'Josa'), ('자다', 'Verb'), ('하다', 'Verb'), ('수', 'Noun'), ('있다', 'Adjective'), ('?', 'Punctuation')]
['우리', '이', '과제', '수']


- 1) morphs : 형태소 추출
- 2) pos : 품사 태깅(Part-of-speech tagging)
- 3) nouns : 명사 추출

In [16]:
from konlpy.tag import Okt
okt=Okt()
word_tags = okt.pos("우리가 이 과제를 잘 할 수 있을까?", norm=True, stem=True)
print(word_tags)
stop_words = [word[0] for word in word_tags if word[1]=="Josa"]
print (stop_words)

[('우리', 'Noun'), ('가', 'Josa'), ('이', 'Noun'), ('과제', 'Noun'), ('를', 'Josa'), ('자다', 'Verb'), ('하다', 'Verb'), ('수', 'Noun'), ('있다', 'Adjective'), ('?', 'Punctuation')]
['가', '를']


In [17]:
from konlpy.tag import Kkma
kkma=Kkma()
print(kkma.morphs("우리가 이 과제를 잘 할 수 있을까?"))
print(kkma.pos("우리가 이 과제를 잘 할 수 있을까?"))
print(kkma.nouns("우리가 이 과제를 잘 할 수 있을까?"))

['우리', '가', '이', '과제', '를', '잘', '하', 'ㄹ', '수', '있', '을까', '?']
[('우리', 'NP'), ('가', 'JKS'), ('이', 'MDT'), ('과제', 'NNG'), ('를', 'JKO'), ('잘', 'MAG'), ('하', 'VV'), ('ㄹ', 'ETD'), ('수', 'NNB'), ('있', 'VV'), ('을까', 'EFQ'), ('?', 'SF')]
['우리', '과제', '수']


# Subword Tokenizer

In [18]:
# transformers 라이브러리 설치
!pip install transformers



In [19]:
# transformers 라이브러리 설치
!pip install transformers

from transformers import BertTokenizer

# 'bert-base-uncased' 모델이 사용한 WordPiece 토크나이저를 로드한다.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 일반 단어(tokenization)와 OOV 단어(huggingfaceization)가 포함된 예시 문장
text = "Modern NLP uses subword tokenization. Let's try huggingfaceization."

# 토큰화 실행
tokens = tokenizer.tokenize(text)
print(tokens)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['modern', 'nl', '##p', 'uses', 'sub', '##word', 'token', '##ization', '.', 'let', "'", 's', 'try', 'hugging', '##face', '##ization', '.']
