# 전처리

### 데이터 가져오기

In [25]:
with open('data/echodot_sample.csv', encoding="utf8") as f:
    text = f.read().strip()
    print(text[:100])

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 80360-80361: invalid continuation byte

### 정규화(normalization)

In [2]:
import re
letters_only = re.sub('[^a-zA-Z]', ' ', text)
letters_only[:100]

' Sound quality is the best I ve heard on a speaker of this size     Definitely worth paying a little'

In [3]:
lower_case = letters_only.lower()
words = lower_case.split()
print(len(words))
words[:10]

436


['sound', 'quality', 'is', 'the', 'best', 'i', 've', 'heard', 'on', 'a']

### 불용어 제거(Stopwords removal)

In [6]:
#pip install nltk

In [8]:
import nltk

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/songhannah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
import nltk
from nltk.corpus import stopwords
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [11]:
words = [w for w in words if not w in stopwords.words('english')]
print(len(words))
words[:10]

198


['sound',
 'quality',
 'best',
 'heard',
 'speaker',
 'size',
 'definitely',
 'worth',
 'paying',
 'little']

### 어간추출(stemming)

In [12]:
stemmer = nltk.stem.PorterStemmer()
print(stemmer.stem('maximum'))
print("The stemmed form of running is: {}".format(stemmer.stem("running")))

maximum
The stemmed form of running is: run


In [13]:
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
print(lancaster_stemmer.stem('maximum'))
print("The stemmed form of running is: {}".format(lancaster_stemmer.stem("running")))

maxim
The stemmed form of running is: run


In [14]:
words[:10]

['sound',
 'quality',
 'best',
 'heard',
 'speaker',
 'size',
 'definitely',
 'worth',
 'paying',
 'little']

In [15]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
stemmed_words = [stemmer.stem(w) for w in words]

stemmed_words[:10]

['sound',
 'qualiti',
 'best',
 'heard',
 'speaker',
 'size',
 'definit',
 'worth',
 'pay',
 'littl']

### 음소표기(Lemmatization)

In [16]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/songhannah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

print(wordnet_lemmatizer.lemmatize('fly'))
print(wordnet_lemmatizer.lemmatize('flies'))

lemm_words = [wordnet_lemmatizer.lemmatize(w) for w in words]
lemm_words

fly
fly


['sound',
 'quality',
 'best',
 'heard',
 'speaker',
 'size',
 'definitely',
 'worth',
 'paying',
 'little',
 'extra',
 'one',
 'nd',
 'generation',
 'model',
 'going',
 'discus',
 'functionality',
 'ton',
 'info',
 'already',
 'one',
 'important',
 'note',
 'set',
 'alexa',
 'app',
 'phone',
 'import',
 'phone',
 'contact',
 'think',
 'gave',
 'permission',
 'alexa',
 'indeed',
 'import',
 'contact',
 'may',
 'read',
 'story',
 'without',
 'knowledge',
 'alexa',
 'record',
 'said',
 'home',
 'send',
 'contact',
 'yikes',
 'want',
 'kid',
 'accidentally',
 'send',
 'message',
 'call',
 'people',
 'contact',
 'list',
 'want',
 'keep',
 'contact',
 'alexa',
 'prevent',
 'happening',
 'installiing',
 'alexa',
 'app',
 'phone',
 'export',
 'contact',
 'file',
 'save',
 'later',
 'go',
 'back',
 'phone',
 'delete',
 'contact',
 'worry',
 'import',
 'back',
 'later',
 'install',
 'alexa',
 'app',
 'phone',
 'go',
 'set',
 'process',
 'point',
 'option',
 'allow',
 'alexa',
 'automatically',


# 피처 엔지니어링

### ngrams

In [22]:
from nltk import ngrams

In [23]:
sentence = re.sub('[^a-zA-Z]', ' ', text)
sentence[:100]

' Sound quality is the best I ve heard on a speaker of this size     Definitely worth paying a little'

In [24]:
n=4
ngramsres = ngrams(sentence.split(), n)

In [85]:
for grams in ngramsres:
    print(grams)

('Sound', 'quality', 'is', 'the')
('quality', 'is', 'the', 'best')
('is', 'the', 'best', 'I')
('the', 'best', 'I', 've')
('best', 'I', 've', 'heard')
('I', 've', 'heard', 'on')
('ve', 'heard', 'on', 'a')
('heard', 'on', 'a', 'speaker')
('on', 'a', 'speaker', 'of')
('a', 'speaker', 'of', 'this')
('speaker', 'of', 'this', 'size')
('of', 'this', 'size', 'Definitely')
('this', 'size', 'Definitely', 'worth')
('size', 'Definitely', 'worth', 'paying')
('Definitely', 'worth', 'paying', 'a')
('worth', 'paying', 'a', 'little')
('paying', 'a', 'little', 'extra')
('a', 'little', 'extra', 'for')
('little', 'extra', 'for', 'this')
('extra', 'for', 'this', 'one')
('for', 'this', 'one', 'over')
('this', 'one', 'over', 'the')
('one', 'over', 'the', 'nd')
('over', 'the', 'nd', 'generation')
('the', 'nd', 'generation', 'model')
('nd', 'generation', 'model', 'I')
('generation', 'model', 'I', 'm')
('model', 'I', 'm', 'not')
('I', 'm', 'not', 'going')
('m', 'not', 'going', 'to')
('not', 'going', 'to', 'disc

### BOW

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [88]:
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2,2), min_df=1)

In [90]:
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])

In [92]:
ngram_vectorizer.get_feature_names() == (['w', 'ds', 'or', 'pr', 'rd', 's', 'wo', 'wp'])
print(counts.toarray().astype(int))

[[1 1 1 0 1 1 1 0]
 [1 1 0 1 1 1 0 1]]


# WORD2VEC

In [100]:
from gensim import models

#### Word2vec 학습시키기

In [124]:
# https://github.com/ml5js/training-word2vec

### t-SNE로 시각화

In [116]:
# https://programmers.co.kr/learn/courses/21/lessons/1698

# Clustering