# Chapter 4. Text Vectorization and Transformation Pipelines

* 싸이그래머 / 싸이ML - NLP [1]
* 김무성

# Contents
* Words in Space
    - Frequency Vectors
        - With NLTK
        - In Scikit-Learn
        - The Gensim way
    - One-Hot Encoding
        - With NLTK
        - In Scikit-Learn
        - The Gensim way
    - Term Frequency-Inverse Document Frequency
        - With NLTK
        - In Scikit-Learn
        - The Gensim way
    - Distributed Representation
        - The Gensim way
* The Scikit-Learn API
    - The BaseEstimator Interface
    - Extending TransformerMixin
        - Creating a custom Gensim vectorization transformer
        - Creating a custom text normalization transformer
* Pipelines
    - Pipeline Basics
    - Grid Search for Hyperparameter Optimization
    - Enriching Feature Extraction with Feature Unions

# Words in Space
* Frequency Vectors
* One-Hot Encoding
* Term Frequency-Inverse Document Frequency
* Distributed Representation

<img src="./figures/cap01.png" width=600 />

---------------------

In [None]:
# tokenize 함수를 만들어서 corpus에 적용하기

In [1]:
import nltk
import string

In [74]:
def tokenize(text) :
    """Exercise stub: turn ``text`` into a sequence of tokens.

    Not implemented yet, so it currently returns ``None``. The output
    recorded in this notebook for the first corpus sentence is a list of
    words with the trailing punctuation still attached (``'potatoes.'``).
    """
    pass # <- TODO: implement (exercise)

In [3]:
# The corpus object
corpus = [
    # 코딩!!
]

In [4]:
corpus[0]

'The elephant sneezed at the sight of potatoes.'

In [5]:
corpus[1]

'Bats can see via echolocation. See the bat sight sneeze!'

In [6]:
corpus[2]

'Wondering, she opened the door to the studio.'

In [14]:
for text in corpus :
    print(text)

The elephant sneezed at the sight of potatoes.
Bats can see via echolocation. See the bat sight sneeze!
Wondering, she opened the door to the studio.


In [64]:
text = corpus[0]

In [67]:
tokenize(text)

['The', 'elephant', 'sneezed', 'at', 'the', 'sight', 'of', 'potatoes.']

-----------------------------

In [None]:
# 코퍼스 리더 클래스를 통해, 코퍼스 접근(파일 읽기)

In [None]:
# 현재 위치에서 corpus라는 이름으로 디렉토리를 만들고, 
# 1.txt 파일에 다음 문장들 저장
# "The elephant sneezed at the sight of potatoes."
# "Bats can see via echolocation. See the bat sight sneeze!"
# "Wondering, she opened the door to the studio."

In [None]:
!ls

In [72]:
class TextCorpusReader(object):
    """Exercise skeleton for a reader that gives access to a corpus on disk.

    The notebook constructs it with the path of a directory of ``.txt``
    files and then reads ``reader.docs``; the value recorded for that
    attribute is a list of sentence strings. Both members are still stubs
    to be filled in by the reader of the notebook.
    """

    def __init__(self, path):
        """Set up the reader for the corpus located at ``path``.

        Args:
            path: Location of the corpus directory (the notebook passes
                ``'./corpus'``).
        """
        pass # <- TODO: implement (exercise)

    @property
    def docs(self):
        """Documents of the corpus.

        Exposed as a property because every call site in this notebook
        reads ``reader.docs`` without calling it; as a plain method that
        expression would evaluate to a bound method instead of the
        documents. Returns ``None`` until the exercise is implemented.
        """
        pass # <- TODO: implement (exercise)

In [None]:
# Directory created in the step above; it holds 1.txt with the sample sentences.
corpus_dir_path = './corpus'
# The constructor's parameter is named `path`; any other keyword raises TypeError.
reader = TextCorpusReader(path=corpus_dir_path)

In [84]:
import os
lst = os.listdir('./corpus')

In [87]:
# Print only the entries that carry the .txt extension. A substring test
# ('txt' in fn) would also match names such as 'txt_notes.md' or 'a.txt.bak'.
for fn in lst:
    if fn.endswith('.txt'):
        print(fn)

1.txt


In [94]:
corpus_dir_path = './corpus'
fn_list = # 코딩!!
fn_list

['./corpus/1.txt']

In [None]:
reader.docs

In [None]:
corpus = reader.docs

In [73]:
corpus

['The elephant sneezed at the sight of potatoes.',
 'Bats can see via echolocation. See the bat sight sneeze!',
 'Wondering, she opened the door to the studio.']

--------------------------

In [63]:
import importlib as imp

In [None]:
text = corpus[0]

In [69]:
tokenize(text)

<generator object tokenize at 0x7fd41effbe08>

---------------------------

In [None]:
# 패키지를 만들어보자. mynlp.py

In [99]:
import mynlp

In [None]:
dir(mynlp)

------------------

In [None]:
# 지금까지 만든 코퍼스 리더와 tokenize 함수를 패키지에 반영

In [None]:
import importlib as imp

In [None]:
imp.reload(mynlp)

In [None]:
dir(mynlp)

In [None]:
corpus_dir_path = './corpus'
# Keyword must match the constructor's `path` parameter as defined for
# TextCorpusReader in this notebook (`paht` was a typo).
reader = mynlp.TextCorpusReader(path=corpus_dir_path)

In [None]:
corpus = reader.docs
corpus

In [76]:
mynlp.tokenize(corpus[0])

['The', 'elephant', 'sneezed', 'at', 'the', 'sight', 'of', 'potatoes.']

----------------------

In [None]:
# tokenize 함수를 제너레이터를 반환하는 형태로 바꾸자.

In [None]:
imp.reload(mynlp)

In [None]:
mynlp.tokenize(corpus[0])

In [None]:
for token in mynlp.tokenize(corpus[0]) :
    print(token)

-----------------------

In [None]:
# tokenize 함수를 nltk를 이용해서 다시 구현하자.

In [None]:
imp.reload(mynlp)

In [None]:
mynlp.tokenize(corpus[0])

In [71]:
for token in mynlp.tokenize(corpus[0]) :
    print(token)

The
elephant
sneezed
at
the
sight
of
potatoes.


In [16]:
[token for token in mynlp.tokenize(corpus[0])]

['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']

In [19]:
for text in corpus :
    print(text)
    print(mynlp.tokenize(text))

The elephant sneezed at the sight of potatoes.
<generator object tokenize at 0x7fd45c188b48>
Bats can see via echolocation. See the bat sight sneeze!
<generator object tokenize at 0x7fd45c188b48>
Wondering, she opened the door to the studio.
<generator object tokenize at 0x7fd45c188b48>


In [12]:
[mynlp.tokenize(text) for text in corpus]

[<generator object tokenize at 0x7fd41f1bd830>,
 <generator object tokenize at 0x7fd41f1bd780>,
 <generator object tokenize at 0x7fd41f1bd728>]

In [28]:
total_token_arr = []

for text in corpus :
    # 코딩!!
    
print(total_token_arr)

[['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'], ['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez'], ['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']]


In [31]:
total_token_arr = # 코딩!!
print(total_token_arr)

[['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'], ['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez'], ['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']]


## Frequency Vectors
* With NLTK
* In Scikit-Learn
* The Gensim way

<img src="./figures/cap02.png" width=600 />

### With NLTK

In [45]:
# The NLTK frequency vectorize method
def vectorize(doc):
    """Exercise stub: build a token-frequency mapping for one document.

    Not implemented yet, so it currently returns ``None``. The output
    recorded in this notebook for the first corpus sentence is a
    ``defaultdict(int)`` that maps each token to its count
    (for example ``'the': 2``).
    """
    pass # <- TODO: implement (exercise)

In [38]:
doc = corpus[0]
doc

'The elephant sneezed at the sight of potatoes.'

In [44]:
vectorize(doc)

-----------------

In [None]:
# 토큰 카운팅하기

In [96]:
from collections import defaultdict

In [37]:
features = defaultdict(int)
features

defaultdict(int, {})

In [None]:
for token in mynlp.tokenize(doc) :
    # 코딩!!

In [42]:
features

defaultdict(int,
            {'at': 1,
             'eleph': 1,
             'of': 1,
             'potato': 1,
             'sight': 1,
             'sneez': 1,
             'the': 2})

In [46]:
vectorize(doc)

defaultdict(int,
            {'at': 1,
             'eleph': 1,
             'of': 1,
             'potato': 1,
             'sight': 1,
             'sneez': 1,
             'the': 2})

---------------

In [None]:
# 패키지에 vectorize 함수를 반영

In [None]:
imp.reload(mynlp)

In [47]:
doc = corpus[1]
mynlp.vectorize(doc)

defaultdict(int,
            {'bat': 2,
             'can': 1,
             'echoloc': 1,
             'see': 2,
             'sight': 1,
             'sneez': 1,
             'the': 1,
             'via': 1})

---------------

In [None]:
# 코퍼스 전체에 vectorize 함수를 적용하기

In [48]:
map(mynlp.vectorize, corpus)

<map at 0x7fd41ef93518>

In [54]:
list(map(mynlp.vectorize, corpus))

[defaultdict(int,
             {'at': 1,
              'eleph': 1,
              'of': 1,
              'potato': 1,
              'sight': 1,
              'sneez': 1,
              'the': 2}),
 defaultdict(int,
             {'bat': 2,
              'can': 1,
              'echoloc': 1,
              'see': 2,
              'sight': 1,
              'sneez': 1,
              'the': 1,
              'via': 1}),
 defaultdict(int,
             {'door': 1,
              'open': 1,
              'she': 1,
              'studio': 1,
              'the': 2,
              'to': 1,
              'wonder': 1})]

In [55]:
vectors = map(mynlp.vectorize, corpus)

In [56]:
vectors

<map at 0x7fd41efa0080>

In [57]:
list(vectors)

[defaultdict(int,
             {'at': 1,
              'eleph': 1,
              'of': 1,
              'potato': 1,
              'sight': 1,
              'sneez': 1,
              'the': 2}),
 defaultdict(int,
             {'bat': 2,
              'can': 1,
              'echoloc': 1,
              'see': 2,
              'sight': 1,
              'sneez': 1,
              'the': 1,
              'via': 1}),
 defaultdict(int,
             {'door': 1,
              'open': 1,
              'she': 1,
              'studio': 1,
              'the': 2,
              'to': 1,
              'wonder': 1})]

----------

In [None]:
# 패키지에 nltk_frequency_vectorize 함수를 만들어서 코퍼스에 대해 적용

In [100]:
imp.reload(mynlp)

<module 'mynlp' from '/home/jovyan/work/nlp_ml/ch04/mynlp.py'>

In [101]:
vectors = mynlp.nltk_frequency_vectorize(corpus)

In [102]:
list(vectors)

[defaultdict(int,
             {'at': 1,
              'eleph': 1,
              'of': 1,
              'potato': 1,
              'sight': 1,
              'sneez': 1,
              'the': 2}),
 defaultdict(int,
             {'bat': 2,
              'can': 1,
              'echoloc': 1,
              'see': 2,
              'sight': 1,
              'sneez': 1,
              'the': 1,
              'via': 1}),
 defaultdict(int,
             {'door': 1,
              'open': 1,
              'she': 1,
              'studio': 1,
              'the': 2,
              'to': 1,
              'wonder': 1})]

### In Scikit-Learn

In [103]:
from sklearn.feature_extraction.text import CountVectorizer

In [104]:
vectorizer = # 코딩 !!

In [112]:
vectors = # 코딩 !!

In [113]:
vectors

<3x20 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [117]:
vectorizer.vocabulary_

{'at': 0,
 'bat': 1,
 'bats': 2,
 'can': 3,
 'door': 4,
 'echolocation': 5,
 'elephant': 6,
 'of': 7,
 'opened': 8,
 'potatoes': 9,
 'see': 10,
 'she': 11,
 'sight': 12,
 'sneeze': 13,
 'sneezed': 14,
 'studio': 15,
 'the': 16,
 'to': 17,
 'via': 18,
 'wondering': 19}

In [114]:
vectors.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 1]],
      dtype=int64)

------------------------

In [None]:
# 패키지에 sklearn_frequency_vectorize 함수를 만들어서 코퍼스에 대해 적용

In [130]:
imp.reload(mynlp)

<module 'mynlp' from '/home/jovyan/work/nlp_ml/ch04/mynlp.py'>

In [132]:
vectors = mynlp.sklearn_frequency_vectorize(corpus)

In [133]:
vectors

<3x20 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

### The Gensim way

In [146]:
import gensim 

In [149]:
corpus_g = # 코딩 !!

In [152]:
corpus_g

[['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'],
 ['bat',
  'can',
  'see',
  'via',
  'echoloc',
  'see',
  'the',
  'bat',
  'sight',
  'sneez'],
 ['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']]

In [150]:
id2word = # 코딩 !!

In [151]:
id2word

<gensim.corpora.dictionary.Dictionary at 0x7fd43326c208>

In [163]:
id2word.token2id

{'at': 0,
 'bat': 7,
 'can': 8,
 'door': 12,
 'echoloc': 9,
 'eleph': 1,
 'of': 2,
 'open': 13,
 'potato': 3,
 'see': 10,
 'she': 14,
 'sight': 4,
 'sneez': 5,
 'studio': 15,
 'the': 6,
 'to': 16,
 'via': 11,
 'wonder': 17}

In [153]:
vectors = # 코딩 !!

In [154]:
vectors

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)],
 [(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)],
 [(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]

----------------------

In [None]:
# 패키지에 gensim_frequency_vectorize 함수를 만들어서 코퍼스에 대해 적용

In [164]:
imp.reload(mynlp)

<module 'mynlp' from '/home/jovyan/work/nlp_ml/ch04/mynlp.py'>

In [165]:
vectors = mynlp.gensim_frequency_vectorize(corpus)

In [166]:
vectors

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)],
 [(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)],
 [(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]

## One-Hot Encoding
* With NLTK
* In Scikit-Learn
* The Gensim way

### With NLTK

### In Scikit-Learn

### The Gensim way

## Term Frequency-Inverse Document Frequency
* With NLTK
* In Scikit-Learn
* The Gensim way

### With NLTK

### In Scikit-Learn

### The Gensim way

## Distributed Representation
* The Gensim way

### The Gensim way

# The Scikit-Learn API
* The BaseEstimator Interface
* Extending TransformerMixin

## The BaseEstimator Interface

## Extending TransformerMixin
* Creating a custom Gensim vectorization transformer
* Creating a custom text normalization transformer

### Creating a custom Gensim vectorization transformer

### Creating a custom text normalization transformer

# Pipelines
* Pipeline Basics
* Grid Search for Hyperparameter Optimization
* Enriching Feature Extraction with Feature Unions

## Pipeline Basics

## Grid Search for Hyperparameter Optimization

## Enriching Feature Extraction with Feature Unions