<a href="https://colab.research.google.com/github/seunghyunmoon2/NLP/blob/master/NLP6_WordEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Email classification

Using deep learning, train a model on the training set of emails and classify each test email into one of 20 categories.

In [None]:
# Email Classification
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from nltk import pos_tag
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load both splits of the 20-newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Only a slice of each split is used so that the experiment runs quickly.
x_train, y_train = newsgroups_train.data[:2000], newsgroups_train.target[:2000]
x_test, y_test = newsgroups_test.data[:500], newsgroups_test.target[:500]

# Show the label names, then one raw training document with its target
# (first as the integer label, then as the category name).
print("List of all 20 categories:")
print(newsgroups_train.target_names)
print("\nSample Email:")
print(x_train[0])
print("Sample Target Category:")
print(y_train[0])
print(newsgroups_train.target_names[y_train[0]])

def preprocessing(text):
    """Normalize one raw document into a space-separated string of lemmas.

    Steps, in order: replace every punctuation character with a space,
    collapse all whitespace (including newlines) to single spaces, tokenize,
    lowercase, drop English stop words and tokens shorter than 3 characters,
    POS-tag the remaining tokens and lemmatize each one (as a verb when it is
    tagged as a verb, as a noun otherwise).

    Args:
        text: Raw document text.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # string.punctuation : '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # A translation table replaces all of them in one pass instead of
    # rebuilding the string character by character.
    punct_to_space = str.maketrans(string.punctuation,
                                   ' ' * len(string.punctuation))
    # split() + join removes '\n' and collapses repeated blanks.
    text2 = ' '.join(text.translate(punct_to_space).split())

    tokens = []
    for sent in nltk.sent_tokenize(text2):
        tokens.extend(word.lower() for word in nltk.word_tokenize(sent))

    # A set gives O(1) membership tests; the list was scanned once per token.
    stopwds = set(stopwords.words('english'))
    tokens = [token for token in tokens
              if token not in stopwds and len(token) >= 3]

    tagged_corpus = pos_tag(tokens)

    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Verbs are lemmatized as verbs; every other tag (nouns included)
        # is lemmatized as a noun.
        return lemmatizer.lemmatize(token, 'v' if tag in verb_tags else 'n')

    return " ".join(prat_lemmatize(token, tag) for token, tag in tagged_corpus)

# Clean every document once; the vectorizer below works on the cleaned text.
x_train_preprocessed = [preprocessing(doc) for doc in x_train]
x_test_preprocessed = [preprocessing(doc) for doc in x_test]

x_train_preprocessed[0]

# Build the TF-IDF vectorizer: unigrams and bigrams that occur in at least
# two documents, capped at 10,000 features.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                             stop_words='english',
                             max_features=10000)

# Fit on the training documents only and reuse that vocabulary for the test
# documents. toarray() yields plain ndarrays (todense() returns np.matrix).
x_train_2 = vectorizer.fit_transform(x_train_preprocessed).toarray()
x_test_2 = vectorizer.transform(x_test_preprocessed).toarray()

# Number of learned features. vocabulary_ maps each term to its column, so
# its size equals the feature count (get_feature_names() no longer exists in
# recent scikit-learn releases).
len(vectorizer.vocabulary_)

# Deep Learning modules
np.random.seed(1337)  # only NumPy's RNG is seeded here
# NOTE(review): the class count is taken from the y_train subset, so this
# assumes every category occurs in it.
nb_classes = len(np.unique(y_train))
batch_size = 64
nb_epochs = 10

# One-hot encode the integer labels for categorical cross-entropy.
Y_train = to_categorical(y_train, nb_classes)
Y_test = to_categorical(y_test, nb_classes)

# TF-IDF features -> 1000 ReLU units -> dropout -> softmax over the classes.
model = Sequential()

model.add(Dense(1000, input_shape= (x_train_2.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so this also prints
# a trailing "None".
print (model.summary())

# The test slice is passed as validation data, so val_loss is the test loss.
hist = model.fit(x_train_2, Y_train,
                 batch_size=batch_size,
                 epochs=nb_epochs,
                 validation_data = (x_test_2, Y_test))

# Plot the loss history
plt.plot(hist.history['loss'], label='Train loss')
plt.plot(hist.history['val_loss'], label = 'Test loss')
plt.legend()
plt.title("Loss history")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.show()

# Predicted class = index of the largest softmax probability.
# (Sequential.predict_classes is gone from current TensorFlow releases;
# argmax over predict() is the equivalent.)
y_train_predclass = np.argmax(
    model.predict(x_train_2, batch_size=batch_size), axis=-1)
y_test_predclass = np.argmax(
    model.predict(x_test_2, batch_size=batch_size), axis=-1)

print ("Train accuracy:", np.round(accuracy_score(y_train, y_train_predclass), 3))
print ("Test accuracy:", np.round(accuracy_score(y_test, y_test_predclass), 3))

```
List of all 20 categories:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

Sample Email:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





Sample Target Category:
7
rec.autos
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 1000)              10001000  
_________________________________________________________________
activation (Activation)      (None, 1000)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                20020     
_________________________________________________________________
activation_1 (Activation)    (None, 20)                0         
=================================================================
Total params: 10,021,020
Trainable params: 10,021,020
Non-trainable params: 0
_________________________________________________________________
None
Train on 2000 samples, validate on 500 samples
Epoch 1/10
2000/2000 [==============================] - 4s 2ms/sample - loss: 2.8591 - val_loss: 2.6482
Epoch 2/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 1.9795 - val_loss: 1.8801
Epoch 3/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 0.9325 - val_loss: 1.3282
Epoch 4/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 0.3953 - val_loss: 1.1097
Epoch 5/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 0.1820 - val_loss: 1.0185
Epoch 6/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 0.0980 - val_loss: 0.9770
Epoch 7/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 0.0585 - val_loss: 0.9563
Epoch 8/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 0.0395 - val_loss: 0.9434
Epoch 9/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 0.0288 - val_loss: 0.9348
Epoch 10/10
2000/2000 [==============================] - 2s 1ms/sample - loss: 0.0221 - val_loss: 0.9280



Figures now render in the Plots pane by default. To make them also appear inline in the Console, uncheck "Mute Inline Plotting" under the Plots pane options menu. 


 Train accuracy: 1.0
Test accuracy: 0.726
```

# Word Embedding

A word embedding is a learned representation for text where words that have the same meaning have a similar representation. ... Each word is mapped to one vector and the vector values are learned in a way that resembles a neural network, and hence the technique is often lumped into the field of deep learning.

## Classify IMDB reviews as positive or negative in tone (0: negative, 1: positive)

### WE with CNN

In [None]:
# IMDB sentiment classification: Word Embedding & CNN
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.datasets import imdb
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# Data-set parameters
max_features = 6000  # max_features: maximum number of distinct words kept
max_length = 400

# The data is restricted to the 6,000 most frequent words.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

print(len(x_train), 'train observations')
print(len(x_test), 'test observations')
print(x_train[0])  # a review is a list of word indices, all below 6,000

# wind maps word -> index; revind is the inverse lookup, index -> word.
wind = imdb.get_word_index()
revind = {index: word for word, index in wind.items()}

def decode(sent_list):
    """Turn a sequence of IMDB word indices back into a readable string.

    In the encoded reviews 0 is padding, 1 marks the start of a document and
    2 stands for an out-of-vocabulary word, so the real word index is the
    stored value minus 3. Each shifted index is looked up in ``revind``;
    anything not found there is rendered as '*'.
    """
    return " ".join(revind.get(idx - 3, '*') for idx in sent_list)

# The decoded text always begins with '*' (the start marker has no entry in
# revind). A '*' in the middle of the text is an out-of-vocabulary word.
decode(x_train[0])

# Pad sequences for computational efficiency: every review becomes a row of
# exactly max_length indices.
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

print(x_train[0])
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Count the OOV markers (index 2) in the first two reviews.
(x_train[0] == 2).sum()
(x_train[1] == 2).sum()

# Deep Learning architecture parameters
batch_size = 32
embedding_dims = 60
num_kernels = 260        # number of convolution filters
kernel_size = 3          # convolution filter size
hidden_dims = 300
epochs = 10

# Embedding -> dropout -> 1-D convolution -> global max pooling ->
# dense hidden layer (dropout, then ReLU) -> single sigmoid output.
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=max_length))
model.add(Dropout(0.2))
model.add(Conv1D(num_kernels, kernel_size, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
print(model.summary())

# Training. The test split is passed as validation data, so the val_loss
# recorded in hist is the loss on the test set.
hist = model.fit(x_train, y_train, 
                 batch_size=batch_size, 
                 epochs=epochs,
                 validation_data = (x_test, y_test))

# Plot the loss history
plt.plot(hist.history['loss'], label='Train loss')
plt.plot(hist.history['val_loss'], label = 'Test loss')
plt.legend()
plt.title("Loss history")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.show()

# Check performance.
# Sequential.predict_classes is gone from current TensorFlow releases; for a
# single sigmoid output the equivalent is thresholding predict() at 0.5.
y_train_predclass = (
    model.predict(x_train, batch_size=batch_size) > 0.5).astype("int32")
y_test_predclass = (
    model.predict(x_test, batch_size=batch_size) > 0.5).astype("int32")

# The model ends in Dense(1), so predictions come back as one column;
# flatten them to the shape of the label vectors before scoring.
y_train_predclass = y_train_predclass.reshape(y_train.shape)
y_test_predclass = y_test_predclass.reshape(y_test.shape)

print("Train accuracy:", np.round(accuracy_score(y_train, y_train_predclass), 3))
print("Test accuracy:", np.round(accuracy_score(y_test, y_test_predclass), 3))

In [None]:
Train accuracy: 0.999
Test accuracy: 0.886

### WE with LSTM

In [None]:
# IMDB sentiment classification: Word Embedding & Bidirectional LSTM
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Bidirectional, LSTM
from tensorflow.keras.datasets import imdb
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# Vocabulary size passed to load_data / Embedding, and the fixed review length.
max_features = 6000
max_length = 400

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Bring every review to exactly max_length indices.
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

# Functional model: word indices -> 60-dim embedding -> bidirectional LSTM
# (64 units per direction) -> single sigmoid output.
xInput = Input(batch_shape=(None, max_length))
xEmbed = Embedding(max_features, 60, input_length = max_length)(xInput)
xLstm = Bidirectional(LSTM(64))(xEmbed)
xOutput = Dense(1, activation='sigmoid')(xLstm)
model = Model(xInput, xOutput)
model.compile(loss='binary_crossentropy', optimizer='adam')

# Second model over the same input that stops at the Embedding output.
# It is not used again in this cell.
embedmodel = Model(xInput,xEmbed)


# Training. The test split is passed as validation data, so val_loss is the
# loss on the test set.
hist = model.fit(x_train, y_train, 
                 batch_size=32, 
                 epochs=10,
                 validation_data = (x_test, y_test))

# Plot the loss history
plt.plot(hist.history['loss'], label='Train loss')
plt.plot(hist.history['val_loss'], label = 'Test loss')
plt.legend()
plt.title("Loss history")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.show()

# Round the sigmoid outputs to 0/1 class labels, then reshape them in place
# to the shape of y_test so they can be compared with the labels.
y_hat = model.predict(x_test, batch_size=32)
y_hat_class = np.round(y_hat, 0)
y_hat_class.shape = y_test.shape

print (("Test accuracy:"),(np.round(accuracy_score(y_test,y_hat_class),3)))

In [None]:
Test accuracy: 0.851

### Cleaner Code with better explanation

In [None]:
# IMDB Classification using Word Embedding and Conv1D
# ----------------------------------------------------
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.datasets import imdb
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import numpy as np

max_features = 6000    # max_features: maximum number of distinct words kept
max_length = 400       # maximum number of words in one review document

# Load the IMDB data.
# The full IMDB vocabulary has 88,584 distinct words.
# The train and test data carry the indices of the 6,000 most frequent words;
# every word beyond the 6,000th is replaced by the out-of-vocabulary marker 2.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Look at the first review, x_train[0].
# 0: padding, 1: start, 2: OOV, 3: invalid.
print(x_train[0])  # consists of word indices below 6,000

# Build the vocabulary.
# word2idx : {'word': idx}
# idx2word : {idx: 'word'}
word2idx = imdb.get_word_index()

# The raw vocabulary index starts at 1 ('the').
# x_train stores words as vocabulary indices, but the raw vocabulary has no
# entries for padding=0, start=1, OOV=2 and invalid=3. Shift every raw index
# up by 3 and add the four special entries so idx2word matches x_train.
idx2word = {idx + 3: word for word, idx in word2idx.items()}
idx2word[0] = '<PAD>'  # padding marker
idx2word[1] = '<START>'  # start marker
idx2word[2] = '<OOV>'  # out-of-vocabulary marker
idx2word[3] = '<INV>'  # invalid marker
# Rebuild the forward map so that it uses the shifted indices as well.
word2idx = {word: idx for idx, word in idx2word.items()}

# Convert the integer-encoded x_train back into actual words for a visual check.
# This has nothing to do with training.
def decode(review):
    """Return the review's word indices as space-separated words.

    Every index is looked up in ``idx2word``; an index that is missing from
    that mapping raises KeyError.
    """
    return ' '.join(map(idx2word.__getitem__, review))
decode(x_train[0])
####### Everything up to here only deals with the given data.
   

# Bring every review to exactly max_length = 400 words.
# Reviews shorter than 400 are filled with padding = 0; longer reviews are
# truncated. With the pad_sequences defaults used here, both the padding and
# the truncation happen at the FRONT of the sequence.
# Arguments to pad_sequences can move the padding to the end instead.
# For an RNN, padding at the front is preferable: the input is processed
# sequentially, so the zeros sit far away from the words seen last, which
# helps with vanishing gradients.
# For a CNN it makes little difference.
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

# Deep Learning architecture parameters
batch_size = 32
embedding_dims = 60
num_kernels = 260        # number of convolution filters
kernel_size = 3          # convolution filter size
hidden_dims = 300
epochs = 10

# Functional model: embedding -> dropout -> 1-D convolution -> global max
# pooling -> dense hidden layer (dropout, then ReLU) -> sigmoid output.
# Note that `emb` is rebound to the Dropout output on the second line.
xInput = Input(batch_shape = (None, max_length))
emb = Embedding(max_features, embedding_dims)(xInput)
emb = Dropout(0.5)(emb)
conv = Conv1D(num_kernels, kernel_size, padding='valid', activation='relu', strides=1)(emb)
conv = GlobalMaxPooling1D()(conv)
ffn = Dense(hidden_dims)(conv)
ffn = Dropout(0.5)(ffn)
ffn = Activation('relu')(ffn)
ffn = Dense(1)(ffn)
yOutput = Activation('sigmoid')(ffn)

model = Model(xInput, yOutput)
model.compile(loss='binary_crossentropy', optimizer='adam')
print(model.summary())

# Training. The test split is passed as validation data, so val_loss is the
# loss on the test set.
hist = model.fit(x_train, y_train, 
                 batch_size=batch_size, 
                 epochs=epochs,
                 validation_data = (x_test, y_test))

# Plot the loss history. The 'val_loss' series is labelled as test loss.
plt.plot(hist.history['loss'], label='Train loss')
plt.plot(hist.history['val_loss'], label = 'Test loss')
plt.title("Loss history")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()

# Check performance: a sigmoid output above 0.5 counts as class 1,
# anything else as class 0.
y_pred = (model.predict(x_test) > 0.5).astype(int)
print("Test accuracy:", accuracy_score(y_test, y_pred))

# 1. Inspect the embedding vector of individual words.
# ----------------------------------------------------
# Read the weight matrix W of the Embedding layer (model.layers[1]).
# Its rows are the word embedding vectors of the 6,000 words:
# shape = (6000, 60).
w_emb = model.layers[1].get_weights()[0]

# Measure the distances between father - mother - daughter - son.
father, mother, daughter, son = (
    w_emb[word2idx[word]] for word in ('father', 'mother', 'daughter', 'son')
)
euclidean_distances([father, mother, daughter, son])

# 2. Inspect the embedding of a whole sentence.
# ---------------------------------------------
# `emb` is the tensor the model was built with, so this sub-model maps a
# padded review to one vector per position.
# NOTE(review): `emb` was rebound to the Dropout output when the model was
# defined; dropout is presumably inactive in predict(), so this should equal
# the plain embedding - confirm.
embModel = Model(xInput, emb)
m = embModel.predict(np.expand_dims(x_train[0], axis=0))
m.shape # (1, 400, 60)

```euclidean_distances([father, mother, daughter, son])```

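returns the pairwise Euclidean distances between the 'father', 'mother', 'daughter' and 'son' vectors. Rows and columns follow that order, and a smaller value means the two word vectors are closer: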
```
array([[0.        , 1.0991806 , 1.023643  , 0.8721574 ],
       [1.0991806 , 0.        , 0.96199644, 1.1721474 ],
       [1.023643  , 0.96199644, 0.        , 1.2364552 ],
       [0.8721574 , 1.1721474 , 1.2364552 , 0.        ]], dtype=float32)
```
1. father-son      : 0.8721574
2. father-daughter : 1.023643
3. father-mother   : 1.0991806