In [1]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional, Conv1D, GlobalAveragePooling1D
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam

from keras.utils import np_utils

import warnings 
warnings.filterwarnings(action='ignore')

import gc
import xgboost as xgb
from konlpy.tag import Mecab  
from nltk import FreqDist

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
from tqdm import tqdm, tqdm_notebook

from KoBERT.kobert.utils import get_tokenizer
from KoBERT.kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

from sklearn.model_selection import train_test_split

##GPU 사용 시
device = torch.device("cuda:0")

# bertmodel, vocab = get_pytorch_kobert_model()

In [6]:
train = pd.read_csv("data/train_data.csv",encoding="utf-8",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)
submission = pd.read_csv("data/sample_submission.csv")

In [22]:
vocab_size = 5003
embedding_dim = 200  
padding_type='post'

In [7]:
NUM_TRAIN_DATA = len(train)
NUM_TEST_DATA = len(test)
NUM_CLASSES = 7
VOCAB_SIZE = 5000

In [24]:
def preprocessing_lstm(train_df, test_df, vocab_size):
    tokenizer = Mecab()
    train_df["tokenized"] = [tokenizer.morphs(sentence) for sentence in train_df["title"]]
    test_df["tokenized"] = [tokenizer.morphs(sentence) for sentence in test_df["title"]]
    
    vocab = FreqDist(np.hstack(train_df["tokenized"]))
    print('단어 집합의 크기 : {}'.format(len(vocab)))
    
    vocab = vocab.most_common(vocab_size)
    
    word_to_index = {word[0] : index + 2 for index, word in enumerate(vocab)}
    word_to_index['pad'] = 1
    word_to_index['unk'] = 0
    
    train_x = []
    test_x = []

    for line in train["tokenized"]:
        temp = []
        for w in line:
            try:
                temp.append(word_to_index[w])
            except KeyError:
                temp.append(word_to_index['unk'])

        train_x.append(temp)

    for line in test["tokenized"]:
        temp = []
        for w in line:
            try:
                temp.append(word_to_index[w])
            except KeyError:
                temp.append(word_to_index['unk'])

        test_x.append(temp)
        
    max_len = max(len(l) for l in train_x)
    max_len = max(len(l) for l in test_x)
    print(max_len)
    
    for line in train_x:
        if len(line) < max_len:
            line += [word_to_index['pad']] * (max_len - len(line))

    for line in test_x:
        if len(line) < max_len:
            line += [word_to_index['pad']] * (max_len - len(line))
            
#     train_y = np_utils.to_categorical(train["topic_idx"])
    train_y = train["topic_idx"]
    
    train_x = np.array(train_x)
    test_x = np.array(test_x)
    
    return train_x, train_y, test_x, max_len
    

In [25]:
def get_lstm_feature(train_df, test_df,rnd=1):
    
    FEAT_CNT = 5
    NUM_WORDS = 10000
    embedding_dim = 200
    MODEL_P = 'lstm_model.h5'
    NUM_CLASSES = 7
    
    train_pred, test_pred = np.zeros((NUM_TRAIN_DATA,NUM_CLASSES)),np.zeros((NUM_TEST_DATA,NUM_CLASSES))
    best_val_train_pred, best_val_test_pred = np.zeros((NUM_TRAIN_DATA,NUM_CLASSES)),np.zeros((NUM_TEST_DATA,NUM_CLASSES))
    
    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)
    
    for train_index, test_index in skf.split(train_x,train_y):
        
        model = Sequential([Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len),
                tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
                tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
                tf.keras.layers.Bidirectional(LSTM(units = 64)),
                Dense(NUM_CLASSES, activation='softmax')
            ])
        
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        model.summary()

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        model.fit(train_x[train_index], to_categorical(train_y[train_index]), 
                  validation_data=(train_x[test_index], to_categorical(train_y[test_index])),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )
        
        # feature 생성 1
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT
        
        # feature 생성 2
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT
        
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [26]:
lstm_train1, lstm_test1, lstm_train2, lstm_test2 = get_lstm_feature(train, test)

단어 집합의 크기 : 30903
26
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 26, 200)           2000400   
_________________________________________________________________
bidirectional (Bidirectional (None, 26, 128)           135680    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 26, 128)           98816     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 7)                 903       
Total params: 2,334,615
Trainable params: 2,334,615
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.61113, saving model to lstm_model.


Epoch 00001: val_loss improved from inf to 0.65824, saving model to lstm_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.65824 to 0.55976, saving model to lstm_model.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.55976
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.55976
------------------


In [27]:
def get_dnn_feature(train_df, test_df,rnd=1):
    train_pred, test_pred = np.zeros((NUM_TRAIN_DATA,NUM_LABELS)),np.zeros((NUM_TEST_DATA,NUM_LABELS))
    best_val_train_pred, best_val_test_pred = np.zeros((NUM_TRAIN_DATA,NUM_LABELS)),np.zeros((NUM_TEST_DATA,NUM_LABELS))
    
    FEAT_CNT = 5
    NUM_WORDS = 10000
    embedding_dim = 200
    MODEL_P = 'dnn_model.h5'
    NUM_CLASSES = 7
    
    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)
    
    for train_index, test_index in skf.split(train_x,train_y):
        
        model = Sequential([Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len),
                tf.keras.layers.GlobalAveragePooling1D(),
                Dense(128,activation="relu"),
                Dense(128,activation="relu"),
                Dropout(0.2),
                Dense(7, activation='softmax')
            ])
        
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        model.fit(train_x[train_index], to_categorical(train_y[train_index]), 
                  validation_data=(train_x[test_index], to_categorical(train_y[test_index])),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )
        
        # feature 생성 1
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT
        
        # feature 생성 2
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT
        
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [28]:
dnn_train1, dnn_test1, dnn_train2, dnn_test2 = get_dnn_feature(train, test)

NameError: name 'NUM_LABELS' is not defined

In [None]:
def get_cnn_feature(train_df, test_df,rnd=1):
    train_pred, test_pred = np.zeros((NUM_TRAIN_DATA,NUM_LABELS)),np.zeros((NUM_TEST_DATA,NUM_LABELS))
    best_val_train_pred, best_val_test_pred = np.zeros((NUM_TRAIN_DATA,NUM_LABELS)),np.zeros((NUM_TEST_DATA,NUM_LABELS))
    
    FEAT_CNT = 5
    NUM_WORDS = 10000
    embedding_dim = 200
    MODEL_P = 'dnn_model.h5'
    NUM_CLASSES = 7
    
    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)
    
    for train_index, test_index in skf.split(train_x,train_y):
        
        model = Sequential()
        model.add(Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len))
        model.add(Conv1D(32,
                         3,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))
        
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        model.fit(train_x[train_index], to_categorical(train_y[train_index]), 
                  validation_data=(train_x[test_index], to_categorical(train_y[test_index])),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )
        
        # feature 생성 1
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT
        
        # feature 생성 2
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT
        
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [None]:
cnn_train1, cnn_test1, cnn_train2, cnn_test2 = get_cnn_feature(train, test)

In [8]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_key, label_key, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i]) for i in dataset[sent_key]]
        
        if not label_key == None:
            self.mode = "train"
        else:
            self.mode = "test"
            
        if self.mode == "train":
            self.labels = [np.int32(i) for i in dataset[label_key]]
        else:
            self.labels = [np.int32(0) for i in dataset[sent_key]]

    def __getitem__(self, i):
        if self.mode == "train":
            return (self.sentences[i] + (self.labels[i], ))
        else:
            return self.sentences[i]

    def __len__(self):
        return (len(self.labels))
    
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=NUM_CLASSES,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)
    
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [9]:
def get_kobert_features(rnd=1):
    
    FEAT_CNT = 5
    MODEL_P = 'kobert-model.pth'
    NUM_CLASSES = 7
    NUM_LABELS = 7
    RANDOM_SEED = 42
    
    max_len = 40
    batch_size = 16
    warmup_ratio = 0.1
    num_epochs = 1
    max_grad_norm = 1
    log_interval = 200
    learning_rate =  5e-5

    epochs_no_improve = 0
    min_val_loss = np.Inf
    n_epochs_stop = 2
    
    train_pred, test_pred = np.zeros((NUM_TRAIN_DATA,NUM_LABELS)),np.zeros((NUM_TEST_DATA,NUM_LABELS))
    best_val_train_pred, best_val_test_pred = np.zeros((NUM_TRAIN_DATA,NUM_LABELS)),np.zeros((NUM_TEST_DATA,NUM_LABELS))
    
    dataset = pd.read_csv("data/train_data.csv",index_col=False)
    test = pd.read_csv("data/test_data.csv",index_col=False)
    dataset_train, dataset_val = train_test_split(dataset,test_size = 0.2,random_state = RANDOM_SEED)
    
    bertmodel, vocab = get_pytorch_kobert_model()    
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    
    data_total = BERTDataset(dataset_train, "title", "topic_idx", tok, max_len, True, False)
    data_train = BERTDataset(dataset_train, "title", "topic_idx", tok, max_len, True, False)
    data_val = BERTDataset(dataset_val, "title", "topic_idx", tok, max_len, True, False)
    data_test = BERTDataset(test, "title", None, tok, max_len, True, False)
    
    total_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
    val_dataloader = torch.utils.data.DataLoader(data_val, batch_size=batch_size, num_workers=5)
    test_dataloader = torch.utils.data.DataLoader(data_val, batch_size=batch_size,num_workers=5)
    
    model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
    
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
    
    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        model.train()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
            optimizer.zero_grad()

            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length= valid_length

            label = label.long().to(device)

            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
        print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

        model.eval()
        val_loss = 0
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):

            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length= valid_length
            label = label.long().to(device)

            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out,label)

            val_loss += loss.data.cpu().numpy()
            test_acc += calc_accuracy(out, label)

        print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
        print("epoch {} val_loss {}".format(e+1, val_loss / (batch_id+1)))

        if val_loss < min_val_loss:
            torch.save(model, MODEL_P)
            epochs_no_improve = 0
            min_val_loss = val_loss
        else :
            epochs_no_improve += 1

        if epochs_no_improve == n_epochs_stop:
            print('Early stopping!' )
            break
        else:
            print("Keep going!")
            continue
            
        
    model.eval()
    
    outs = []
    for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        out = model(token_ids, valid_length, segment_ids)

        for o in out.detach():
            outs.append(o)
    test_pred += outs
    
    outs = []
    for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(total_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        out = model(token_ids, valid_length, segment_ids)

        for o in out.detach():
            outs.append(o)
    train_pred += outs
    
    model = torch.load(MODEL_P)
    model.eval()
    
    outs = []
    for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        out = model(token_ids, valid_length, segment_ids)

        for o in out.detach():
            outs.append(o)
    best_val_test_pred += outs
    
    outs = []
    for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(total_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        out = model(token_ids, valid_length, segment_ids)

        for o in out.detach():
            outs.append(o)
    best_val_train_pred += outs
            
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [10]:
kobert_train1, kobert_test1, kobert_train2, kobert_test2 = get_kobert_features()

using cached model
using cached model
using cached model


RuntimeError: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 803: system has unsupported display driver / cuda driver combination