<a href="https://colab.research.google.com/github/seunghyunmoon2/NLP/blob/master/NLP9_EmbeddingLayer_Konlpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dynamic Memory Network을 이용한 Q&A 데이터 학습

In [None]:
# Dynamic Memory Network을 이용한 Q&A 데이터 학습
# ----------------------------------------------
import collections
import itertools
import nltk
import numpy as np
import matplotlib.pyplot as plt
import random
from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.layers import LSTM, Permute
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Add, Concatenate, Dot
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def get_data(infile):
    r"""Parse a numbered Q&A text file into stories, questions and answers.

    Example file content (the fields of a question line are tab-separated):

        1 Mary moved to the bathroom.\n
        2 Daniel went to the garden.\n
        3 Where is Mary? \tbathroom\t1\n

    Every line starts with a line number followed by one space.  A line whose
    remainder contains a tab is a question line made of exactly three fields
    (question, answer, and a third field that is ignored); any other line is
    a story sentence.  The sentences gathered since the previous question
    form the story of the next question, after which the buffer is emptied.
    Blank lines are skipped.

    Args:
        infile: Path of the text file to read.

    Returns:
        A ``(stories, questions, answers)`` tuple of equally long lists.
        For the example above:
            stories   = [['Mary moved to the bathroom.\n',
                          'Daniel went to the garden.\n']]
            questions = ['Where is Mary? ']
            answers   = ['bathroom']

    Raises:
        ValueError: If a non-blank line has no space after its line number,
            or a question line does not have exactly three tab-separated
            fields.
    """
    stories, questions, answers = [], [], []
    story_text = []
    # Open the file the caller asked for (not a module-level path), so that
    # the train and test splits are really read from different files.
    with open(infile, "r") as fin:
        for line in fin:
            if not line.strip():
                continue  # tolerate empty lines, e.g. a trailing newline
            # Drop the leading line number; keep the rest verbatim.
            _, text = line.split(" ", 1)
            if "\t" in text:
                question, answer, _ = text.split("\t")
                stories.append(story_text)
                questions.append(question)
                answers.append(answer)
                # Start a fresh story for the next question.
                story_text = []
            else:
                story_text.append(text)
    return stories, questions, answers

# Paths of the train / test Q&A files, relative to the working directory.
Train_File = "./dataset/qa1_single-supporting-fact_train.txt"  # github.com/seunghyunmoon2/NLP
Test_File = "./dataset/qa1_single-supporting-fact_test.txt"    # github.com/seunghyunmoon2/NLP

# get the data
# Each result is the (stories, questions, answers) tuple returned by get_data.
data_train = get_data(Train_File)
data_test = get_data(Test_File)
# Element 0 is the list of stories, so its length is the number of observations.
print("\n\nTrain observations:",len(data_train[0]),"Test observations:", len(data_test[0]),"\n\n")

# Build the vocabulary: lower-cased token frequencies over train AND test data.
dictnry = collections.Counter()
for split_stories, split_questions, split_answers in (data_train, data_test):
    # Counting order (story sentences, then questions, then answers) is kept
    # on purpose: it decides the rank of equally frequent words below.
    for sentence in itertools.chain.from_iterable(split_stories):
        dictnry.update(tok.lower() for tok in nltk.word_tokenize(sentence))
    for text in itertools.chain(split_questions, split_answers):
        dictnry.update(tok.lower() for tok in nltk.word_tokenize(text))

# Most frequent word gets index 1; index 0 is reserved for the padding token.
word2indx = {
    token: rank
    for rank, (token, _count) in enumerate(dictnry.most_common(), start=1)
}
word2indx["PAD"] = 0
indx2word = {index: token for token, index in word2indx.items()}

vocab_size = len(word2indx)
print("vocabulary size:", vocab_size)
print(word2indx)

# Longest story and longest question, measured in tokens, over both splits.
story_maxlen = 0
question_maxlen = 0

for split_stories, split_questions, _split_answers in (data_train, data_test):
    for story_sentences in split_stories:
        # A story's length is the total token count of all its sentences.
        n_story_tokens = sum(
            len(nltk.word_tokenize(sentence)) for sentence in story_sentences
        )
        story_maxlen = max(story_maxlen, n_story_tokens)

    for question_text in split_questions:
        question_maxlen = max(
            question_maxlen, len(nltk.word_tokenize(question_text))
        )

print ("Story maximum length:", story_maxlen, "Question maximum length:", question_maxlen)

# Converting data into Vectorized form
def data_vectorization(data, word2indx, story_maxlen, question_maxlen):
    """Encode stories, questions and answers as index arrays for the model.

    Args:
        data: ``(stories, questions, answers)`` as returned by ``get_data``.
        word2indx: Mapping from lower-cased token to integer index.
        story_maxlen: Length every encoded story is padded to by
            ``pad_sequences``.
        question_maxlen: Length every encoded question is padded to by
            ``pad_sequences``.

    Returns:
        A tuple ``(stories, questions, answers)`` where the first two are the
        ``pad_sequences`` results and the last is the ``to_categorical``
        one-hot encoding of the answer indices over ``len(word2indx)``
        classes.

    Raises:
        KeyError: If a token (or a lower-cased answer) is not in
            ``word2indx``.
    """
    def encode(text):
        # Token indices of one sentence / question, in reading order.
        return [word2indx[tok.lower()] for tok in nltk.word_tokenize(text)]

    stories, questions, answers = data
    story_ids, question_ids, answer_ids = [], [], []
    for story, question, answer in zip(stories, questions, answers):
        # A story is flattened: its sentences become one long index sequence.
        story_ids.append([idx for sentence in story for idx in encode(sentence)])
        question_ids.append(encode(question))
        # The whole answer string is looked up as a single vocabulary entry.
        answer_ids.append(word2indx[answer.lower()])

    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_questions = pad_sequences(question_ids, maxlen=question_maxlen)
    one_hot_answers = to_categorical(answer_ids, num_classes=len(word2indx))
    return padded_stories, padded_questions, one_hot_answers

           
# Encode both splits with the shared vocabulary and the shared maximum lengths.
Xstrain, Xqtrain, Ytrain = data_vectorization(data_train, word2indx, story_maxlen, question_maxlen)
Xstest, Xqtest, Ytest = data_vectorization(data_test, word2indx, story_maxlen, question_maxlen)

print("Train story",Xstrain.shape,"Train question", Xqtrain.shape,"Train answer", Ytrain.shape)
print( "Test story",Xstest.shape, "Test question",Xqtest.shape, "Test answer",Ytest.shape)

# Model Parameters
EMBEDDING_SIZE = 128  # output_dim of the story / question embeddings
LATENT_SIZE = 64      # number of LSTM units
BATCH_SIZE = 64       # passed to model.fit
NUM_EPOCHS = 40       # passed to model.fit

# Inputs
# One integer index per token position; lengths are fixed by the padding above.
story_input = Input(shape=(story_maxlen,))
question_input = Input(shape=(question_maxlen,))

# Story encoder embedding
story_encoder = Embedding(input_dim=vocab_size,
                          output_dim=EMBEDDING_SIZE, 
                          input_length=story_maxlen)(story_input)
story_encoder = Dropout(0.2)(story_encoder)

# Question encoder embedding
question_encoder = Embedding(input_dim=vocab_size,
                             output_dim=EMBEDDING_SIZE,
                             input_length=question_maxlen)(question_input)
question_encoder = Dropout(0.3)(question_encoder)

# Match between story and question
# With story_maxlen = 14 and question_maxlen = 4 (the recorded run):
# story_encoder = [None, 14, 128], question_encoder = [None, 4, 128]
# match = [None, 14, 4]
# The dot product contracts the embedding axis (axis 2) of both tensors.
match = Dot(axes=[2, 2])([story_encoder, question_encoder])

# Encode story into vector space of question
# output_dim = question_maxlen so the result has the same shape as `match`
# and the two can be added element-wise below.
story_encoder_c = Embedding(input_dim=vocab_size,
                            output_dim=question_maxlen,
                            input_length=story_maxlen)(story_input)
story_encoder_c = Dropout(0.3)(story_encoder_c)

# Combine match and story vectors
response = Add()([match, story_encoder_c])
# Swap the two non-batch axes so the question positions come first,
# matching question_encoder's layout for the concatenation below.
response = Permute((2, 1))(response)

# Combine response and question vectors to answers space
# Concatenation is along the last axis: story_maxlen + EMBEDDING_SIZE features
# per question position.
answer = Concatenate()([response, question_encoder])
answer = LSTM(LATENT_SIZE)(answer)
answer = Dropout(0.2)(answer)
# One score per vocabulary entry; softmax turns the scores into probabilities.
answer = Dense(vocab_size)(answer)
output = Activation("softmax")(answer)

model = Model(inputs=[story_input, question_input], outputs=output)
# Targets are one-hot vectors (to_categorical), hence categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy")
# NOTE(review): summary() prints the table itself; wrapping it in print()
# presumably emits an extra "None" line - confirm and drop the print if so.
print (model.summary())

# Model Training (the test split doubles as validation data)
history = model.fit(
    [Xstrain, Xqtrain],
    [Ytrain],
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=([Xstest, Xqtest], [Ytest]),
)

# loss plot: training curve in green, validation curve in red
plt.title("Episodic Memory Q & A Loss")
for curve_key, curve_color, curve_label in (
    ("loss", "g", "train"),
    ("val_loss", "r", "validation"),
):
    plt.plot(history.history[curve_key], color=curve_color, label=curve_label)
plt.legend(loc="best")
plt.show()

# get predictions of labels
# One-hot / probability rows are reduced to the index of their largest entry.
ytest = np.argmax(Ytest, axis=1)
Ytest_ = model.predict([Xstest, Xqtest])
ytest_ = np.argmax(Ytest_, axis=1)

# Select Random questions and predict answers
NUM_DISPLAY = 10

sampled_rows = random.sample(range(Xstest.shape[0]), NUM_DISPLAY)
for i in sampled_rows:
    # Padding (index 0) is dropped from the story but not from the question.
    story_words = (indx2word[x] for x in Xstest[i].tolist() if x != 0)
    question_words = (indx2word[x] for x in Xqtest[i].tolist())
    story = " ".join(story_words)
    question = " ".join(question_words)
    label = indx2word[ytest[i]]
    prediction = indx2word[ytest_[i]]
    print(story, question, label, prediction)
     
    
    
	

## output

- output

```
Train observations: 10000 Test observations: 10000 

vocabulary size: 22
{'to': 1, 'the': 2, '.': 3, 'where': 4, 'is': 5, '?': 6, 'went': 7, 'mary': 8, 'john': 9, 'sandra': 10, 'daniel': 11, 'bathroom': 12, 'hallway': 13, 'office': 14, 'kitchen': 15, 'garden': 16, 'bedroom': 17, 'journeyed': 18, 'travelled': 19, 'back': 20, 'moved': 21, 'PAD': 0}

Story maximum length: 14 Question maximum length: 4

Train story (10000, 14) Train question (10000, 4) Train answer (10000, 22)
Test story (10000, 14) Test question (10000, 4) Test answer (10000, 22)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_2 (InputLayer)            [(None, 14)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 14, 128)      2816        input_2[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 4, 128)       2816        input_3[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 14, 128)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4, 128)       0           embedding_2[0][0]                
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 14, 4)        88          input_2[0][0]                    
__________________________________________________________________________________________________
dot (Dot)                       (None, 14, 4)        0           dropout[0][0]                    
                                                                 dropout_1[0][0]                  
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 14, 4)        0           embedding_3[0][0]                
__________________________________________________________________________________________________
add (Add)                       (None, 14, 4)        0           dot[0][0]                        
                                                                 dropout_2[0][0]                  
__________________________________________________________________________________________________
permute (Permute)               (None, 4, 14)        0           add[0][0]                        
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 4, 142)       0           permute[0][0]                    
                                                                 dropout_1[0][0]                  
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 64)           52992       concatenate[0][0]                
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 64)           0           lstm[0][0]                       
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 22)           1430        dropout_3[0][0]                  
__________________________________________________________________________________________________
activation (Activation)         (None, 22)           0           dense_2[0][0]                    
==================================================================================================
Total params: 60,142
Trainable params: 60,142
Non-trainable params: 0
__________________________________________________________________________________________________



 john journeyed to the bathroom . daniel travelled to the bedroom . where is daniel ? bedroom bedroom
daniel moved to the kitchen . daniel went back to the office . where is daniel ? office office
sandra moved to the hallway . john travelled to the hallway . where is sandra ? hallway hallway
john journeyed to the garden . mary travelled to the hallway . where is mary ? hallway hallway
john moved to the garden . sandra went back to the hallway . where is sandra ? hallway hallway
mary went to the office . daniel journeyed to the bedroom . where is mary ? office office
john travelled to the hallway . daniel went to the kitchen . where is daniel ? kitchen kitchen
mary went back to the kitchen . sandra moved to the hallway . where is sandra ? hallway hallway
john went back to the garden . mary travelled to the bedroom . where is mary ? bedroom bedroom
mary went to the hallway . sandra went to the bedroom . where is daniel ? bedroom bedroom
```

# Predict next word

In [None]:
# Predict next word
# -----------------
from tensorflow.keras.layers import Input,Dense,Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import nltk
import numpy as np
import string
import random

# File reading
with open("./dataset/alice_in_wonderland.txt", 'r') as content_file:  #github.com/seunghyunmoon2/NLP
    content = content_file.read()

# Replace every punctuation character by a space, then collapse all runs of
# whitespace into single spaces.
_punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
content2 = " ".join(content.translate(_punct_to_space).split())

# Lower-cased tokens; tokens shorter than two characters are discarded.
tokens = [tok.lower() for tok in nltk.word_tokenize(content2) if len(tok) >= 2]

# N-grams: the first N-1 words are the predictors, the N-th (last) word is
# the prediction target.  With N = 3 `quads` holds trigrams despite its name.
N = 3
quads = list(nltk.ngrams(tokens, N))

# Each n-gram as one space-separated string.
newl_app = [" ".join(gram) for gram in quads]

# Vectorizing the words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# Split every n-gram string into its predictor part (first N-1 words, joined
# by spaces) and its target word (the N-th word).
_gram_words = [gram.split() for gram in newl_app]
x_trigm = [" ".join(words[:N - 1]) for words in _gram_words]
y_trigm = [words[N - 1] for words in _gram_words]

# The same vectorizer object is fitted twice, predictors first and targets
# second, so after these two lines its vocabulary describes the columns of
# the TARGET matrix.
x_trigm_check = vectorizer.fit_transform(x_trigm).todense()
y_trigm_check = vectorizer.fit_transform(y_trigm).todense()

# Dictionaries from word to integer and integer to word (target vocabulary)
dictnry = vectorizer.vocabulary_
rev_dictnry = dict(zip(dictnry.values(), dictnry.keys()))

X = np.array(x_trigm_check)
Y = np.array(y_trigm_check)

# The raw predictor strings are split alongside X and Y so that test rows can
# be shown in readable form later.
Xtrain, Xtest, Ytrain, Ytest, xtrain_tg, xtest_tg = train_test_split(
    X, Y, x_trigm, test_size=0.3, random_state=42
)

print("X Train shape",Xtrain.shape, "Y Train shape" , Ytrain.shape)
print("X Test shape",Xtest.shape, "Y Test shape" , Ytest.shape)

# Model Building
# Fully connected network mapping the count vector of the N-1 predictor words
# to a softmax over the target vocabulary.
BATCH_SIZE = 128   # passed to fit() below
NUM_EPOCHS = 100   # passed to fit() below

# Input width = number of columns of the predictor matrix.
input_layer = Input(shape = (Xtrain.shape[1],),name="input")
first_layer = Dense(1000,activation='relu',name = "first")(input_layer)
first_dropout = Dropout(0.5,name="firstdout")(first_layer)

# The second hidden layer is not followed by dropout.
second_layer = Dense(800,activation='relu',name="second")(first_dropout)

third_layer = Dense(1000,activation='relu',name="third")(second_layer)
third_dropout = Dropout(0.5,name="thirdout")(third_layer)

# Output width = number of columns of the target matrix (one unit per target word).
fourth_layer = Dense(Ytrain.shape[1],activation='softmax',name = "fourth")(third_dropout)


# NOTE: despite its name, `history` is the Keras Model itself, not the
# History object returned by fit().
history = Model(input_layer,fourth_layer)
history.compile(optimizer = "adam",loss="categorical_crossentropy",metrics=["accuracy"])

print(history.summary())

# Model Training (`history` is the Model; 20% of the training rows are held
# out for validation)
history.fit(
    Xtrain,
    Ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    verbose=1,
    validation_split=0.2,
)

# Model Prediction
Y_pred = history.predict(Xtest)


def _print_prediction(row):
    """Print row index, predictor words, actual word and predicted word."""
    actual_word = rev_dictnry[np.argmax(Ytest[row])]
    predicted_word = rev_dictnry[np.argmax(Y_pred[row])]
    print (row, xtest_tg[row], "|", actual_word, "|", predicted_word)


# Sample check on Test data
print ("Prior bigram words","|Actual","|Predicted","\n")

# First the ten leading test rows ...
for i in range(10):
    _print_prediction(i)

# ... then NUM_DISPLAY randomly chosen test rows.
NUM_DISPLAY = 10
for i in random.sample(range(len(xtest_tg)), NUM_DISPLAY):
    _print_prediction(i)

## output

- output

print("X Train shape",Xtrain.shape, "Y Train shape" , Ytrain.shape)
print("X Test shape",Xtest.shape, "Y Test shape" , Ytest.shape)
```
X Train shape (17947, 2559) Y Train shape (17947, 2559)
X Test shape (7692, 2559) Y Test shape (7692, 2559)
```
print(history.summary())
```
Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           [(None, 2559)]            0         
_________________________________________________________________
first (Dense)                (None, 1000)              2560000   
_________________________________________________________________
firstdout (Dropout)          (None, 1000)              0         
_________________________________________________________________
second (Dense)               (None, 800)               800800    
_________________________________________________________________
third (Dense)                (None, 1000)              801000    
_________________________________________________________________
thirdout (Dropout)           (None, 1000)              0         
_________________________________________________________________
fourth (Dense)               (None, 2559)              2561559   
=================================================================
Total params: 6,723,359
Trainable params: 6,723,359
Non-trainable params: 0
_________________________________________________________________
```
print ("Prior bigram words","|Actual","|Predicted","\n")
```
Prior bigram words |Actual |Predicted 

0 the evening | beautiful | beautiful
1 slipped in | like | sulky
2 alice swallowing | down | not
3 an encouraging | tone | about
4 waistcoat pocket | or | in
5 she went | on | on
6 that she | knew | was
7 down on | her | one
8 dormouse went | on | on
9 soup soup | of | and
1368 on here | thought | the
1970 the whole | head | party
29 and people | began | come
4554 turtle story | you | you
6535 alice waited | till | little
5298 gloves while | she | she
2447 pocket till | she | she
6193 pigs and | was | in
7351 elbow was | pressed | on
1082 wind and | the | was
```

# Konlpy

## 개발환경셋업

1. [openJDK](https://blog.naver.com/rudnfskf2/221490709675)
2. conda install -c conda-forge jpype1 # pip install jpype1
3. pip install konlpy

In [None]:
import tensorflow as tf
from tensorflow.keras import preprocessing

# Six short Korean sentences with a 1 / 0 label each.
samples = [
    '너 오늘 이뻐 보인다',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일이 있나봐',
    '나 좋은 일이 생겼어',
    '아 오늘 진짜 짜증나',
    '환상적인데, 정말 좋은거 같아',
]

label = [[1], [0], [1], [1], [0], [1]]

# Build the word index from the samples and encode every sentence with it.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
word_index = tokenizer.word_index

print("\n수치화된 텍스트 데이터 :\n", sequences)
print("\n각 단어의 인덱스 :\n", word_index)
print("\n라벨: ", label)
print()

BATCH = 4
EPOCHS = 2

# Pipeline built one stage at a time.  shuffle() comes after batch(), so it
# reorders whole batches (rows inside a batch keep their original order, as
# the recorded output shows); repeat() replays the data EPOCHS times.
dataset = tf.data.Dataset.from_tensor_slices((sequences, label))
dataset = dataset.batch(BATCH)
dataset = dataset.shuffle(len(sequences))
dataset = dataset.repeat(EPOCHS)

for seq_batch, label_batch in dataset:
    print(seq_batch.numpy(), label_batch.numpy())
    print()


수치화된 텍스트 데이터 :
 [[4, 1, 5, 6], [7, 1, 8, 9], [10, 2, 3, 11], [12, 2, 3, 13], [14, 1, 15, 16], [17, 18, 19, 20]]

각 단어의 인덱스 :
 {'오늘': 1, '좋은': 2, '일이': 3, '너': 4, '이뻐': 5, '보인다': 6, '나는': 7, '기분이': 8, '더러워': 9, '끝내주는데': 10, '있나봐': 11, '나': 12, '생겼어': 13, '아': 14, '진짜': 15, '짜증나': 16, '환상적인데': 17, '정말': 18, '좋은거': 19, '같아': 20}

라벨:  [[1], [0], [1], [1], [0], [1]]

[[14  1 15 16]
 [17 18 19 20]] [[0]
 [1]]

[[ 4  1  5  6]
 [ 7  1  8  9]
 [10  2  3 11]
 [12  2  3 13]] [[1]
 [0]
 [1]
 [1]]

[[ 4  1  5  6]
 [ 7  1  8  9]
 [10  2  3 11]
 [12  2  3 13]] [[1]
 [0]
 [1]
 [1]]

[[14  1 15 16]
 [17 18 19 20]] [[0]
 [1]]



## Keras Embedding layer 분석

In [None]:
# 책 p.42 예제
# Keras Embedding layer 분석
# --------------------------
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.layers import Input, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
    
# Six short Korean sentences with a 1 / 0 label each.
samples = [
    '너 오늘 이뻐 보인다',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일이 있나봐',
    '나 좋은 일이 생겼어',
    '아 오늘 진짜 짜증나',
    '환상적인데, 정말 좋은거 같아',
]
labels = np.array([[1], [0], [1], [1], [0], [1]])

# Turn the sample document into numbers sentence by sentence: a vocabulary is
# built from every word used in the samples, and each sentence is then
# expressed as the list of vocabulary indices of its words.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(samples)
word_index = tokenizer.word_index
idx2word = dict(zip(word_index.values(), word_index.keys()))

sequences = tokenizer.texts_to_sequences(samples)
print("sequences = ", sequences)    # sentences as vocabulary indices
print("word_index = ", word_index)  # the word -> index dictionary

# From here on the encoded sentences are a 2-D integer array.
sequences = np.array(sequences)

# Number of rows of the Embedding table (its input dimension).  The word
# indices in word_index start at 1, so one extra row is needed for index 0,
# which no sample word uses.
VOCAB_SIZE = len(word_index) + 1

# Output dimension of the Embedding layer: number of latent features per word.
EMB_SIZE = 8

# Build the deep-learning model.
# The input is one integer index per word position of a sentence.
xInput = Input(batch_shape=(None, sequences.shape[1]))
embed_input = Embedding(input_dim=VOCAB_SIZE, output_dim=EMB_SIZE)(xInput)
# NOTE(review): axis=-1 averages over the EMB_SIZE latent features, leaving
# one scalar per word position - confirm that averaging over the word axis
# (axis=1) was not intended.
embed_input1 = tf.reduce_mean(embed_input, axis=-1)

hidden_layer = Dense(128, activation=tf.nn.relu)(embed_input1)
# Single sigmoid unit: the 0 / 1 label of the sentence.
output = Dense(1, activation='sigmoid')(hidden_layer)
model = Model(xInput, output)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01))

# Training
model.fit(sequences, labels, epochs=100)

# Prediction on the training sentences, rounded to 0 / 1.
pred = model.predict(sequences)
print(np.round(pred, 0))

# Look at the weights of the Embedding layer (the layer right after the input).
# An Embedding layer has no bias. To add one, a separate embedding layer with
# output_dim = 1 would have to be used.
embedding_layer = model.layers[1]
w = np.array(embedding_layer.get_weights())
print(w.shape)

# Inspect the word embeddings: a second model that shares the trained layers
# but stops at the embedding output.
latent = Model(xInput, embed_input)
z = latent.predict(sequences)
print(z.shape)
first_sentence = z[0]
print(np.round(first_sentence, 2))

# Word similarity between '너' and '오늘' (the first two words of the first
# sentence).
# When words enter the Embedding layer they are conceptually one-hot vectors
# of vocabulary size. Dot products between distinct one-hot vectors are all 0,
# so every similarity is 0, i.e. all words are independent. After training,
# however, the embedding outputs are latent features of the words and their
# dot products are non-zero; this is the similarity between words, which are
# now contextually dependent on each other.
print("'너'와 '오늘'의 유사도 = ", np.dot(first_sentence[0], first_sentence[1]))


sequences =  [[4, 1, 5, 6], [7, 1, 8, 9], [10, 2, 3, 11], [12, 2, 3, 13], [14, 1, 15, 16], [17, 18, 19, 20]]
word_index =  {'오늘': 1, '좋은': 2, '일이': 3, '너': 4, '이뻐': 5, '보인다': 6, '나는': 7, '기분이': 8, '더러워': 9, '끝내주는데': 10, '있나봐': 11, '나': 12, '생겼어': 13, '아': 14, '진짜': 15, '짜증나': 16, '환상적인데': 17, '정말': 18, '좋은거': 19, '같아': 20}
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53

In [None]:
!pip install jpype1
!pip install konlpy

## Konlpy 예제

In [None]:
# 책 p.72 ~ 79 예제
# konlpy 예시.
# -----------------
from konlpy.tag import Okt

# Sample Korean sentence that is analysed with the Okt tagger below.
text = "한글 자연어 처리는 재밌다 이제부터 열심히 해야지 ㅎㅎㅎ"

okt = Okt()
print(okt.morphs(text))  # split into morphemes
print(okt.morphs(text, stem=True)) # split into morphemes, then reduce them to their stems
print(okt.nouns(text))   # extract nouns only
print(okt.phrases(text)) # extract phrases
print(okt.pos(text))     # (morpheme, part-of-speech tag) pairs
print(okt.pos(text, join=True)) # morpheme and tag joined as "morpheme/tag" strings in a list

# Corpora bundled with konlpy
from konlpy.corpus import kolaw
from konlpy.corpus import kobill

# NOTE: in a notebook cell only the value of the final expression is shown,
# so the 20-character slice on the next line is computed but not displayed.
kolaw.open('constitution.txt').read()[:20]
kobill.open('1809890.txt').read()

['한글', '자연어', '처리', '는', '재밌다', '이제', '부터', '열심히', '해야지', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '는', '재밌다', '이제', '부터', '열심히', '하다', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '이제']
['한글', '한글 자연어', '한글 자연어 처리', '이제', '자연어', '처리']
[('한글', 'Noun'), ('자연어', 'Noun'), ('처리', 'Noun'), ('는', 'Josa'), ('재밌다', 'Adjective'), ('이제', 'Noun'), ('부터', 'Josa'), ('열심히', 'Adverb'), ('해야지', 'Verb'), ('ㅎㅎㅎ', 'KoreanParticle')]
['한글/Noun', '자연어/Noun', '처리/Noun', '는/Josa', '재밌다/Adjective', '이제/Noun', '부터/Josa', '열심히/Adverb', '해야지/Verb', 'ㅎㅎㅎ/KoreanParticle']


'지방공무원법 일부개정법률안\n\n(정의화의원 대표발의 )\n\n 의 안\n 번 호\n\n9890\n\n발의연월일 : 2010.  11.  12.  \n\n발  의  자 : 정의화․이명수․김을동 \n\n이사철․여상규․안규백\n\n황영철․박영아․김정훈\n\n김학송 의원(10인)\n\n제안이유 및 주요내용\n\n  초등학교 저학년의 경우에도 부모의 따뜻한 사랑과 보살핌이 필요\n\n한 나이이나, 현재 공무원이 자녀를 양육하기 위하여 육아휴직을 할 \n\n수 있는 자녀의 나이는 만 6세 이하로 되어 있어 초등학교 저학년인 \n\n자녀를 돌보기 위해서는 해당 부모님은 일자리를 그만 두어야 하고 \n\n이는 곧 출산의욕을 저하시키는 문제로 이어질 수 있을 것임.\n\n  따라서 육아휴직이 가능한 자녀의 연령을 만 8세 이하로 개정하려\n\n는 것임(안 제63조제2항제4호).\n\n- 1 -\n\n\x0c법률  제        호\n\n지방공무원법 일부개정법률안\n\n지방공무원법 일부를 다음과 같이 개정한다.\n\n제63조제2항제4호 중 “만 6세 이하의 초등학교 취학 전 자녀를”을 “만 \n\n8세 이하(취학 중인 경우에는 초등학교 2학년 이하를 말한다)의 자녀를”\n\n로 한다.\n\n부      칙\n\n이 법은 공포한 날부터 시행한다.\n\n- 3 -\n\n\x0c신 ·구조문대비표\n\n현      행\n\n개   정   안\n\n제63조(휴직) ① (생  략)\n\n제63조(휴직) ① (현행과 같음)\n\n  ② 공무원이 다음 각 호의 어\n\n  ② -------------------------\n\n느 하나에 해당하는 사유로 휴\n\n----------------------------\n\n직을 원하면 임용권자는 휴직\n\n----------------------------\n\n을 명할 수 있다. 다만, 제4호\n\n-------------.---------------\n\n의 경우에는 대통령령으로 정\n\n---------------------------