In [2]:
from nltk import sent_tokenize, word_tokenize
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from scipy import sparse
import nltk
import numpy as np

In [2]:
# Sample paragraph (three sentences). Adjacent string literals are joined by
# the parser, so each source line below holds exactly one sentence.
text_sample = (
    'As it continues to spread around the world, bringing panic with it, scientists are striving to develop ways of fighting this previously unknown threat. '
    'Sophisticated computer modelling is being used to track and predict its transmission, while virologists are attempting to engineer a vaccine. '
    'Others are seeking drug treatments that can help those who fall ill with the infection.'
)

# Split the paragraph into sentences, then show the container type, the
# sentence count, and the sentences themselves.
sentences = sent_tokenize(text_sample)
print(type(sentences), len(sentences))
print(sentences)

<class 'list'> 3
['As it continues to spread around the world, bringing panic with it, scientists are striving to develop ways of fighting this previously unknown threat.', 'Sophisticated computer modelling is being used to track and predict its transmission, while virologists are attempting to engineer a vaccine.', 'Others are seeking drug treatments that can help those who fall ill with the infection.']


# # tokenize

In [4]:
# Word-level tokenization of the whole paragraph in a single call: the result
# is one flat list in which punctuation marks appear as separate tokens.
words = word_tokenize(text_sample)
print(words)

['As', 'it', 'continues', 'to', 'spread', 'around', 'the', 'world', ',', 'bringing', 'panic', 'with', 'it', ',', 'scientists', 'are', 'striving', 'to', 'develop', 'ways', 'of', 'fighting', 'this', 'previously', 'unknown', 'threat', '.', 'Sophisticated', 'computer', 'modelling', 'is', 'being', 'used', 'to', 'track', 'and', 'predict', 'its', 'transmission', ',', 'while', 'virologists', 'are', 'attempting', 'to', 'engineer', 'a', 'vaccine', '.', 'Others', 'are', 'seeking', 'drug', 'treatments', 'that', 'can', 'help', 'those', 'who', 'fall', 'ill', 'with', 'the', 'infection', '.']


In [5]:
def tokenize_text(text):
    """Tokenize ``text`` sentence by sentence.

    Args:
        text: Raw text to process.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``; each
        entry is the list of tokens ``word_tokenize`` produced for that
        sentence.
    """
    return [word_tokenize(sent) for sent in sent_tokenize(text)]


# Nested token lists for the sample paragraph: one inner list per sentence.
word_tokens = tokenize_text(text_sample)
print(word_tokens)

[['As', 'it', 'continues', 'to', 'spread', 'around', 'the', 'world', ',', 'bringing', 'panic', 'with', 'it', ',', 'scientists', 'are', 'striving', 'to', 'develop', 'ways', 'of', 'fighting', 'this', 'previously', 'unknown', 'threat', '.'], ['Sophisticated', 'computer', 'modelling', 'is', 'being', 'used', 'to', 'track', 'and', 'predict', 'its', 'transmission', ',', 'while', 'virologists', 'are', 'attempting', 'to', 'engineer', 'a', 'vaccine', '.'], ['Others', 'are', 'seeking', 'drug', 'treatments', 'that', 'can', 'help', 'those', 'who', 'fall', 'ill', 'with', 'the', 'infection', '.']]


# # stopwords

In [7]:
# English stop-word list shipped with NLTK. The 'stopwords' corpus must already
# be available locally (e.g. after nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('english')

# Membership is tested once per token, so look words up in a set instead of
# scanning the list every time. `stopwords` itself stays a list.
stopword_set = set(stopwords)

# Lower-case every token and drop stop words while keeping the per-sentence
# nesting. Punctuation tokens are not in the stop-word list, so they remain.
all_tokens = [
    [token for token in map(str.lower, sentence) if token not in stopword_set]
    for sentence in word_tokens
]

print(all_tokens)

[['continues', 'spread', 'around', 'world', ',', 'bringing', 'panic', ',', 'scientists', 'striving', 'develop', 'ways', 'fighting', 'previously', 'unknown', 'threat', '.'], ['sophisticated', 'computer', 'modelling', 'used', 'track', 'predict', 'transmission', ',', 'virologists', 'attempting', 'engineer', 'vaccine', '.'], ['others', 'seeking', 'drug', 'treatments', 'help', 'fall', 'ill', 'infection', '.']]


# # Stemming, Lemmatization

In [11]:
# Run a stemmer and a lemmatizer on the same word to compare their results.
stemmer = LancasterStemmer()
lemma = WordNetLemmatizer()

sample_word = 'amusing'
stemmed = stemmer.stem(sample_word)
# The second argument is the part-of-speech tag; 'v' treats the word as a verb.
lemmatized = lemma.lemmatize(sample_word, 'v')

print(stemmed)
print(lemmatized)

amus
amuse


# # BOW

보통 BOW를 적용하면 희소 행렬(sparse matrix) 형식을 띠게 되는데, 물리적으로 더 적은 메모리 공간을 차지하도록 변환해 주는 방법이 있다. <br>
1. COO <br>
2. CSR (보통 이 방법을 많이 사용)

In [5]:
### COO (coordinate format)
dense = np.array([[3, 0, 1], [0, 2, 0]])

# Non-zero values of `dense`, listed in row-major order.
data = np.array([3, 1, 2])

# Row index and column index of each value in `data`, as two parallel arrays.
row_pos = np.array([0, 0, 1])
col_pos = np.array([0, 2, 1])

# Build the COO matrix. The shape is passed explicitly: without it SciPy
# infers the shape from the largest index, which would silently drop any
# trailing all-zero rows or columns.
sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)), shape=dense.shape)

In [6]:
# Expand the COO matrix back into a dense ndarray to inspect the round trip.
sparse_coo.toarray()

array([[3, 0, 1],
       [0, 2, 0]])

In [7]:
### CSR (compressed sparse row)
# CSR saves memory by replacing the per-value row-position array with one
# start offset per row.
dense2 = np.array([[0,0,1,0,0,5],
             [1,4,0,3,2,5],
             [0,6,0,3,0,0],
             [2,0,0,0,0,0],
             [0,0,0,7,0,8],
             [1,0,0,0,0,0]])

# Non-zero values of `dense2`, listed in row-major order.
data2 = np.array([1, 5, 1, 4, 3, 2, 5, 6, 3, 2, 7, 8, 1])

# Row index and column index of each value in `data2`.
row_pos = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 5])
col_pos = np.array([2, 5, 0, 1, 3, 4, 5, 1, 3, 0, 3, 5, 0])

# COO: one (row, col) pair stored per value. The shape is given explicitly so
# it does not depend on the largest index that happens to hold a value.
sparse_coo = sparse.coo_matrix((data2, (row_pos, col_pos)), shape=dense2.shape)

# Offset at which each distinct row value first appears in `row_pos`; the
# final entry is the total number of stored values (the length of `row_pos`).
row_pos_ind = np.array([0, 2, 7, 9, 10, 12, 13])

# CSR: values, their column indices, and the row start offsets.
sparse_csr = sparse.csr_matrix((data2, col_pos, row_pos_ind), shape=dense2.shape)

# Both formats should expand back to the same dense matrix.
print(sparse_coo.toarray())
print(sparse_csr.toarray())

[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]


In [9]:
# Shortcut: pass the dense array straight to the constructors instead of
# supplying the value and index arrays by hand.
coo = sparse.coo_matrix(dense2)
csr = sparse.csr_matrix(dense2)