In [1]:
import glob
import pandas as pd
from tensorflow import keras
import numpy as np
import os 
from sklearn.model_selection import StratifiedKFold
import matplotlib.pylab as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import * 
from tensorflow.keras.callbacks import * 
from tensorflow.keras.models import * 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from konlpy.tag import Mecab, Okt 
import re

In [2]:
train = pd.read_csv('train.csv') 
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv') 
train.shape, test.shape, submission.shape

((174304, 13), (43576, 12), (43576, 2))

In [3]:
train = train[['사업명','사업_부처명','내역사업명','과제명','요약문_연구목표','요약문_연구내용','요약문_기대효과','요약문_한글키워드','label']]

test = test[['사업명','사업_부처명','내역사업명','과제명','요약문_연구목표','요약문_연구내용','요약문_기대효과','요약문_한글키워드']]


In [4]:
def preprocessing(text, mecab, remove_stopwords=False, stop_words=[]):
    text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ]","", text)
    word_text = mecab.morphs(text)
    if remove_stopwords:
        word_review = [token for token in word_text if not token in stop_words]
    return word_review


# 과제명 

In [5]:
stop_words=['은','는','이','가', '하','아','것','들','의','있','되','수','보','주','등','한']
mecab = Mecab()
clean_train1 = []
clean_test1 = []

In [6]:
train1 = train['과제명'].values 
for text in tqdm(train1):
    try:
        clean_train1.append(preprocessing(text, mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e)
        clean_train1.append([])

100%|██████████| 174304/174304 [00:16<00:00, 10820.63it/s]


In [7]:
test1 = test['과제명'].values 
for text in tqdm(test1):
    try:
        clean_test1.append(preprocessing(text, mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e)
        clean_test1.append([])

100%|██████████| 43576/43576 [00:04<00:00, 9982.74it/s] 


In [8]:
clean_train1 = np.asarray(clean_train1) 
clean_test1 = np.asarray(clean_test1)

  return array(a, dtype, copy=False, order=order)


In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train1)  
train1 = tokenizer.texts_to_sequences(clean_train1) 

test1 = tokenizer.texts_to_sequences(clean_test1)

In [10]:
max_len = -1
for i in range(len(train1)): 
    if len(train1[i]) > max_len: 
        max_len = len(train1[i])
        
max_len

52

In [11]:
train1=pad_sequences(train1, maxlen=max_len+5, padding='post')
test1=pad_sequences(test1, maxlen=max_len+5, padding='post')

In [12]:
train1.shape, test1.shape

((174304, 57), (43576, 57))

In [13]:
vocab1 = len(tokenizer.word_index)+1
vocab1

35816

# 요약문_연구목표

In [43]:
clean_train2 = [] 
clean_test2 = [] 

In [44]:
train2 = train['요약문_연구목표'].values 
for text in tqdm(train2):
    try:
        clean_train2.append(preprocessing(str(text), mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e) 
        clean_train2.append([])

100%|██████████| 174304/174304 [01:49<00:00, 1595.46it/s]


In [45]:
test2 = test['요약문_연구목표'].values 
for text in tqdm(test2):
    try:
        clean_test2.append(preprocessing(str(text), mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e) 
        clean_test2.append([])

100%|██████████| 43576/43576 [00:26<00:00, 1654.65it/s]


In [84]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train2)  
train2 = tokenizer.texts_to_sequences(clean_train2) 

test2 = tokenizer.texts_to_sequences(clean_test2)

In [85]:
max_len = -1
for i in range(len(train2)): 
    if len(train2[i]) > max_len: 
        max_len = len(train2[i])
        
max_len

809

In [86]:
train2 = pad_sequences(train2, maxlen=max_len+5, padding='post')
test2 = pad_sequences(test2, maxlen=max_len+5, padding='post')

In [116]:
train2.shape

(174304, 814)

In [87]:
vocab2 = len(tokenizer.word_index) + 1
vocab2

74479

# 요약문_연구내용

In [49]:
clean_train3 = [] 
clean_test3 = [] 

In [50]:
train3 = train['요약문_연구내용'].values 
for text in tqdm(train3):
    try:
        clean_train3.append(preprocessing(str(text), mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e) 
        clean_train3.append([])

100%|██████████| 174304/174304 [03:17<00:00, 884.52it/s] 


In [51]:
test3 = test['요약문_연구내용'].values 
for text in tqdm(test3):
    try:
        clean_test3.append(preprocessing(str(text), mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e) 
        clean_test3.append([])

100%|██████████| 43576/43576 [00:51<00:00, 846.03it/s]


In [93]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train3)  
train3 = tokenizer.texts_to_sequences(clean_train3) 

test3 = tokenizer.texts_to_sequences(clean_test3)

In [94]:
max_len = -1
for i in range(len(train3)): 
    if len(train3[i]) > max_len: 
        max_len = len(train3[i])
        
max_len

896

In [95]:
train3 = pad_sequences(train3, maxlen=max_len+5, padding='post')
test3 = pad_sequences(test3, maxlen=max_len+5, padding='post')

In [96]:
vocab3 = len(tokenizer.word_index) + 1 
vocab3

94658

# 요약문_기대효과

In [55]:
clean_train4 = [] 
clean_test4 = [] 

In [56]:
train4 = train['요약문_기대효과'].values 
for text in tqdm(train4):
    try:
        clean_train4.append(preprocessing(str(text), mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e) 
        clean_train4.append([])

100%|██████████| 174304/174304 [02:23<00:00, 1215.28it/s]


In [57]:
test4 = test['요약문_기대효과'].values 
for text in tqdm(test4):
    try:
        clean_test4.append(preprocessing(str(text), mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e) 
        clean_test4.append([])

100%|██████████| 43576/43576 [00:34<00:00, 1255.78it/s]


In [97]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train4)  
train4 = tokenizer.texts_to_sequences(clean_train4) 

test4 = tokenizer.texts_to_sequences(clean_test4)

In [98]:
max_len = -1
for i in range(len(train4)): 
    if len(train4[i]) > max_len: 
        max_len = len(train4[i])
        
max_len

893

In [99]:
train4 = pad_sequences(train4, maxlen=max_len+5, padding='post')
test4 = pad_sequences(test4, maxlen=max_len+5, padding='post')

In [100]:
vocab4 = len(tokenizer.word_index) + 1 
vocab4 

71163

# 요약문_한글키워드

In [14]:
clean_train5 = [] 
clean_test5 = [] 

In [15]:
train5 = train['요약문_한글키워드'].values 
for text in tqdm(train5):
    try:
        clean_train5.append(preprocessing(str(text), mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e) 
        clean_train5.append([])

100%|██████████| 174304/174304 [00:13<00:00, 13127.95it/s]


In [16]:
test5 = test['요약문_한글키워드'].values 
for text in tqdm(test5):
    try:
        clean_test5.append(preprocessing(str(text), mecab, remove_stopwords=True, stop_words=stop_words))
    except Exception as e:
        print(e) 
        clean_test5.append([])

100%|██████████| 43576/43576 [00:03<00:00, 13688.76it/s]


In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train5)  
train5 = tokenizer.texts_to_sequences(clean_train5) 

test5 = tokenizer.texts_to_sequences(clean_test5)

In [18]:
max_len = -1
for i in range(len(train5)): 
    if len(train5[i]) > max_len: 
        max_len = len(train5[i])
        
max_len

91

In [19]:
train5 = pad_sequences(train5, maxlen=max_len+5, padding='post')
test5 = pad_sequences(test5, maxlen=max_len+5, padding='post')

In [20]:
vocab5 = len(tokenizer.word_index) + 1 
vocab5

46067

In [21]:
train5.shape

(174304, 96)

In [29]:
y_train = train['label'].values

# LSTM model

In [26]:
def build_model(): 
    inputs1 = Input((57))
    embedding1 = Embedding(vocab1, 32, input_length=57)(inputs1)
    lstm1 = Bidirectional(LSTM(128, return_sequences=False))(embedding1)  
    
    inputs2 = Input((96))
    embedding2 = Embedding(vocab5, 256, input_length=96)(inputs2)
    lstm2 = Bidirectional(LSTM(128, return_sequences=False))(embedding2)

    concat = Concatenate()([lstm1,lstm2]) 
    dense = Dense(64, activation = 'relu')(concat)
    dense = BatchNormalization()(dense)
    dense = Dense(46, activation = 'softmax')(dense) 
    model = Model(inputs=[inputs1,inputs2], outputs=dense) 
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 
    return model
    

In [27]:
model = build_model() 
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 57)]         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 96)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 57, 32)       1146112     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 96, 256)      11793152    input_6[0][0]                    
______________________________________________________________________________________________

In [30]:
model_path = 'Bi_LSTM_epoch_{epoch:03d}_val_{val_loss:.10f}_accuracy_{val_accuracy:.10f}.h5'
learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_loss', patience = 2, verbose = 1, factor = 0.9)
checkpoint = ModelCheckpoint(filepath = model_path, monitor = 'val_accuracy', verbose = 1, save_best_only = True)
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10) 

history = model.fit(x=[train1,train5],
                    y=y_train,
                    batch_size=32,
                    epochs=200,
                    validation_split = 0.1, 
                    callbacks = [learning_rate_reduction,checkpoint,early_stopping])


Epoch 1/200

Epoch 00001: val_accuracy improved from -inf to 0.73008, saving model to Bi_LSTM_epoch_001_val_1.1263858080_accuracy_0.7300785780.h5
Epoch 2/200
  28/4903 [..............................] - ETA: 11:27 - loss: 0.2389 - accuracy: 0.9342

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00002: val_accuracy improved from 0.73008 to 0.88624, saving model to Bi_LSTM_epoch_002_val_0.4442852736_accuracy_0.8862371445.h5
Epoch 3/200

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00005: val_accuracy did not improve from 0.88624
Epoch 6/200
 247/4903 [>.............................] - ETA: 11:34 - loss: 0.0680 - accuracy: 0.9785

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00010: val_accuracy did not improve from 0.92353
Epoch 11/200

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00013: val_accuracy improved from 0.92353 to 0.92387, saving model to Bi_LSTM_epoch_013_val_0.4711167514_accuracy_0.9238712788.h5
Epoch 14/200
 796/4903 [===>..........................] - ETA: 10:18 - loss: 0.0301 - accuracy: 0.9898

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0005904900433961303.

Epoch 00014: val_accuracy did not improve from 0.92387
Epoch 15/200
 541/4903 [==>...........................] - ETA: 11:10 - loss: 0.0297 - accuracy: 0.9900

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.00047829695977270604.

Epoch 00018: val_accuracy did not improve from 0.92485
Epoch 19/200

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0004304672533180565.

Epoch 00020: val_accuracy did not improve from 0.92485
