In [9]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional, Conv1D, GlobalAveragePooling1D
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam

from keras.utils import np_utils

import warnings 
warnings.filterwarnings(action='ignore')

import gc
import xgboost as xgb

from nltk import FreqDist
from konlpy.tag import Mecab  


In [2]:
# Load the training titles (with their topic labels), the test titles and the
# sample submission template from the local data directory.
train = pd.read_csv(
    "data/train_data.csv",
    encoding="utf-8",
    index_col=False,
)
test = pd.read_csv(
    "data/test_data.csv",
    index_col=False,
)
submission = pd.read_csv("data/sample_submission.csv")

In [3]:
from konlpy.tag import Mecab
# Split every title into morphemes with Mecab and store the resulting token
# lists in a new "tokenized" column of each frame.
tokenizer = Mecab()
train["tokenized"] = list(map(tokenizer.morphs, train["title"]))
test["tokenized"] = list(map(tokenizer.morphs, test["title"]))

In [4]:
# Stack the token lists of both splits into a single Series; the vocabulary
# counted from it therefore covers train and test titles alike.
vocab_list = pd.concat([train["tokenized"], test["tokenized"]])
# Preview the first 20 tokenized titles.
vocab_list[:20]

0                [인천, →, 핀란드, 항공기, 결항, …, 휴가철, 여행객, 분통]
1     [실리콘밸리, 넘어서, 겠, 다, …, 구글, 15, 조, 원, 들여, 美, 전역,...
2     [이란, 외무, 긴장, 완화, 해결책, 은, 미국, 이, 경제, 전쟁, 멈추, 는, 것]
3     [NYT, 클린턴, 측근, 韓, 기업, 특수, 관계, 조명, …, 공과, 사, 맞물...
4                 [시진핑, 트럼프, 에, 중미, 무역, 협상, 조속, 타결, 희망]
5     [팔레스타인, 가, 자, 지, 구서, 16, 세, 소년, 이스라엘, 군, 총격, 에...
6     [인도, 48, 년, 만, 에, 파키스탄, 공습, …, 테러, 캠프, 폭격, 종합,...
7     [美, 대선, TV, 토론, 음담패설, 만회, 실패, 트럼프, …, 사과, 대신, ...
8            [푸틴, 한반도, 상황, 진전, 위한, 방안, 김정은, 위원장, 과, 논의]
9     [특검, 면죄부, 받, 은, 트럼프, 스캔들, 보도, 언론, 맹공, …, 국민, 의...
10                   [日, 오키, 나와서, 열린, 강제, 징용, 노동자, 추도식]
11      [이란, 서, 최고, 지도자, 모욕, 혐의, 미국인, 에, 징역, 10, 년, 선고]
12    [카니발, 축제, 보, 러, 가, 자, …, 브라질, 리우, 에, 대형, 유람선, 행렬]
13         [美, 올랜도, 병원, 최악, 총기, 테러, 부상자, 치료비, 안, 받, 는다]
14              [日, 대, 기업, 올해, 평균, 2, ., 46, %, 임금, 인상]
15           [WMO, 엘니뇨, 여전히, 강력, …, 2, 분기, 엔, 소멸, 될, 듯]
16          [이스라엘, 네타냐후, 유대교, 도, 병역, 문제, 로, 연정, 협상, 진통]
17         [UAE, 사우디, 이, 어, 美, 호르무즈, 호위, 연합, 에, 

In [5]:
from nltk import FreqDist
# Flatten all token lists into one array and count how often each token occurs.
all_tokens = np.hstack(vocab_list)
vocab = FreqDist(all_tokens)
print('단어 집합의 크기 : {}'.format(len(vocab)))

단어 집합의 크기 : 31861


In [6]:
vocab_size = 5000
# Keep only the `vocab_size` most frequent entries. Note that `vocab` is
# rebound from the frequency table to the list returned by most_common().
vocab = vocab.most_common(vocab_size)
kept_count = len(vocab)
print('단어 집합의 크기 : {}'.format(kept_count))

단어 집합의 크기 : 5000


In [7]:
# Ids 0 and 1 are reserved for 'unk' and 'pad', so ranked tokens start at id 2.
word_to_index = {}
for token_id, entry in enumerate(vocab, start=2):
    word_to_index[entry[0]] = token_id
# The special entries are written last, so they win if a real token happens
# to be spelled 'pad' or 'unk'.
word_to_index.update({'pad': 1, 'unk': 0})

In [8]:
# Id substituted for every token that is missing from the vocabulary.
unk_index = word_to_index['unk']

# Translate each tokenized title into a list of integer ids, one list per title.
train_x = [
    [word_to_index.get(w, unk_index) for w in line]
    for line in train["tokenized"]
]

test_x = [
    [word_to_index.get(w, unk_index) for w in line]
    for line in test["tokenized"]
]

In [9]:
# Longest encoded title across BOTH splits. Taking only the test maximum would
# leave longer training rows unpadded and make the later array conversion ragged.
max_len = max(
    max(map(len, train_x), default=0),
    max(map(len, test_x), default=0),
)
print(max_len)

26


In [10]:
# Right-pad every encoded title with the 'pad' id up to max_len (in place).
pad_index = word_to_index['pad']
for encoded_split in (train_x, test_x):
    for line in encoded_split:
        # A non-positive repeat count yields an empty list, so rows that are
        # already max_len long (or longer) are left untouched.
        line.extend([pad_index] * (max_len - len(line)))

In [11]:
# One-hot encode the topic labels. Uses the `to_categorical` already imported
# from tensorflow.keras.utils instead of the legacy `keras.utils.np_utils`
# module, which is not available in recent Keras releases.
train_y = to_categorical(train["topic_idx"])

In [12]:
# Convert the padded id lists of both splits into NumPy arrays.
train_x, test_x = (np.array(encoded) for encoded in (train_x, test_x))

In [5]:
# Model hyper-parameters kept as module-level names.
# NOTE(review): max_length equals the max_len value printed in this run, and
# vocab_size differs from the 5000 + 2 entries built for word_to_index — confirm
# both are intended.
vocab_size, embedding_dim, max_length = 5003, 200, 26
padding_type = 'post'

In [14]:
# Number of topic classes (two names for the same value) and dataset sizes.
NUM_CLASSES = NUM_LABELS = 7
NUM_TRAIN_DATA, NUM_TEST_DATA = len(train), len(test)
VOCAB_SIZE = 5000

In [10]:
def preprocessing_lstm(train_df, test_df, vocab_size):
    """Tokenize, integer-encode and pad the titles of both frames.

    The vocabulary is built from the training titles only and keeps the
    `vocab_size` most frequent tokens. Id 0 is 'unk' (out-of-vocabulary),
    id 1 is 'pad', and real tokens start at id 2.

    Side effect: adds/overwrites a "tokenized" column on both input frames.

    Args:
        train_df: DataFrame with "title" and "topic_idx" columns.
        test_df: DataFrame with a "title" column.
        vocab_size: number of most frequent tokens to keep.

    Returns:
        (train_x, train_y, test_x, max_len) where train_x / test_x are the
        padded id arrays, train_y is the raw `train_df["topic_idx"]` column
        (not one-hot), and max_len is the padded sequence length.
    """
    tokenizer = Mecab()
    train_df["tokenized"] = [tokenizer.morphs(sentence) for sentence in train_df["title"]]
    test_df["tokenized"] = [tokenizer.morphs(sentence) for sentence in test_df["title"]]

    vocab = FreqDist(np.hstack(train_df["tokenized"]))
    print('단어 집합의 크기 : {}'.format(len(vocab)))

    vocab = vocab.most_common(vocab_size)

    word_to_index = {word[0] : index + 2 for index, word in enumerate(vocab)}
    word_to_index['pad'] = 1
    word_to_index['unk'] = 0
    pad_id = word_to_index['pad']
    unk_id = word_to_index['unk']

    def encode(token_lists):
        # Map every token to its id; tokens outside the vocabulary become 'unk'.
        return [[word_to_index.get(w, unk_id) for w in line] for line in token_lists]

    # Encode the frames that were passed in (not the notebook-level globals).
    train_x = encode(train_df["tokenized"])
    test_x = encode(test_df["tokenized"])

    # Longest sequence over both splits, so no row ends up longer than max_len.
    max_len = max(
        max(map(len, train_x), default=0),
        max(map(len, test_x), default=0),
    )
    print(max_len)

    # Right-pad in place; rows already at max_len receive an empty extension.
    for encoded_split in (train_x, test_x):
        for line in encoded_split:
            line.extend([pad_id] * (max_len - len(line)))

    # Labels stay as integer class ids; callers one-hot encode them when needed.
    train_y = train_df["topic_idx"]

    train_x = np.array(train_x)
    test_x = np.array(test_x)

    return train_x, train_y, test_x, max_len
    

In [11]:
# Re-create the encoded inputs through the helper; from here on train_y holds
# the raw topic_idx labels returned by preprocessing_lstm.
train_x, train_y, test_x, max_len = preprocessing_lstm(
    train_df=train,
    test_df=test,
    vocab_size=5000,
)

단어 집합의 크기 : 30903
26


In [15]:
def get_lstm_feature(train_df, test_df,rnd=1):
    """Build stacking features from a 3-layer bidirectional LSTM with 5-fold CV.

    For every fold a fresh model is trained with early stopping on val_loss
    while the best epoch is checkpointed to MODEL_P. Two feature sets are
    produced: one from the model as it stands when training stops, and one
    from the checkpoint with the lowest val_loss.

    Args:
        train_df: training frame passed to preprocessing_lstm.
        test_df: test frame passed to preprocessing_lstm.
        rnd: multiplier for the StratifiedKFold random_state (2333 * rnd).

    Returns:
        (train_pred, test_pred, best_val_train_pred, best_val_test_pred):
        the *_train_pred arrays hold out-of-fold class probabilities for every
        training row, the *_test_pred arrays hold test probabilities averaged
        over the folds. All have NUM_CLASSES columns.
    """
    FEAT_CNT = 5               # number of CV folds
    NUM_WORDS = 10000          # vocabulary size handed to preprocessing_lstm
    embedding_dim = 200
    MODEL_P = 'lstm_model.h5'  # checkpoint file, overwritten by every fold
    NUM_CLASSES = 7

    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)
    # Plain array so the fold indices below are applied positionally, whatever
    # index the label column carries.
    train_y = np.asarray(train_y)
    # Fixed width: every fold gets exactly NUM_CLASSES one-hot columns.
    train_y_onehot = to_categorical(train_y, num_classes=NUM_CLASSES)

    # Output buffers are sized from the encoded inputs rather than from
    # notebook-level constants, so the function works for any frames.
    train_pred = np.zeros((len(train_x), NUM_CLASSES))
    test_pred = np.zeros((len(test_x), NUM_CLASSES))
    best_val_train_pred = np.zeros((len(train_x), NUM_CLASSES))
    best_val_test_pred = np.zeros((len(test_x), NUM_CLASSES))

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)

    for train_index, test_index in skf.split(train_x,train_y):

        # +2 accounts for the 'unk' (0) and 'pad' (1) ids.
        model = Sequential([Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len),
                tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
                tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
                tf.keras.layers.Bidirectional(LSTM(units = 64)),
                Dense(NUM_CLASSES, activation='softmax')
            ])

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        model.summary()

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        model.fit(train_x[train_index], train_y_onehot[train_index],
                  validation_data=(train_x[test_index], train_y_onehot[test_index]),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )

        # Feature set 1: predictions of the model as left by the last epoch.
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT

        # Feature set 2: predictions of the best-val_loss checkpoint.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT

        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [16]:
# LSTM stacking features: *1 = last-epoch model, *2 = best-val_loss checkpoint.
lstm_train1, lstm_test1, lstm_train2, lstm_test2 = get_lstm_feature(
    train_df=train,
    test_df=test,
)

단어 집합의 크기 : 30903
26
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 26, 200)           2000400   
_________________________________________________________________
bidirectional (Bidirectional (None, 26, 128)           135680    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 26, 128)           98816     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 7)                 903       
Total params: 2,334,615
Trainable params: 2,334,615
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.63882, saving model to lstm_model.


Epoch 00001: val_loss improved from inf to 0.66950, saving model to lstm_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.66950 to 0.56093, saving model to lstm_model.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.56093
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.56093
------------------


In [17]:
def get_dnn_feature(train_df, test_df,rnd=1):
    """Build stacking features from an averaged-embedding dense network with 5-fold CV.

    For every fold a fresh model (Embedding -> GlobalAveragePooling1D -> two
    Dense(128) layers -> Dropout -> softmax) is trained with early stopping on
    val_loss while the best epoch is checkpointed to MODEL_P. Two feature sets
    are produced: one from the model as it stands when training stops, and one
    from the checkpoint with the lowest val_loss.

    Args:
        train_df: training frame passed to preprocessing_lstm.
        test_df: test frame passed to preprocessing_lstm.
        rnd: multiplier for the StratifiedKFold random_state (2333 * rnd).

    Returns:
        (train_pred, test_pred, best_val_train_pred, best_val_test_pred):
        the *_train_pred arrays hold out-of-fold class probabilities for every
        training row, the *_test_pred arrays hold test probabilities averaged
        over the folds. All have NUM_CLASSES columns.
    """
    FEAT_CNT = 5              # number of CV folds
    NUM_WORDS = 10000         # vocabulary size handed to preprocessing_lstm
    embedding_dim = 200
    MODEL_P = 'dnn_model.h5'  # checkpoint file, overwritten by every fold
    NUM_CLASSES = 7

    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)
    # Plain array so the fold indices below are applied positionally, whatever
    # index the label column carries.
    train_y = np.asarray(train_y)
    # Fixed width: every fold gets exactly NUM_CLASSES one-hot columns.
    train_y_onehot = to_categorical(train_y, num_classes=NUM_CLASSES)

    # Output buffers are sized from the encoded inputs rather than from
    # notebook-level constants, so the function works for any frames.
    train_pred = np.zeros((len(train_x), NUM_CLASSES))
    test_pred = np.zeros((len(test_x), NUM_CLASSES))
    best_val_train_pred = np.zeros((len(train_x), NUM_CLASSES))
    best_val_test_pred = np.zeros((len(test_x), NUM_CLASSES))

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)

    for train_index, test_index in skf.split(train_x,train_y):

        # +2 accounts for the 'unk' (0) and 'pad' (1) ids.
        model = Sequential([Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len),
                tf.keras.layers.GlobalAveragePooling1D(),
                Dense(128,activation="relu"),
                Dense(128,activation="relu"),
                Dropout(0.2),
                Dense(NUM_CLASSES, activation='softmax')
            ])

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        model.fit(train_x[train_index], train_y_onehot[train_index],
                  validation_data=(train_x[test_index], train_y_onehot[test_index]),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )

        # Feature set 1: predictions of the model as left by the last epoch.
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT

        # Feature set 2: predictions of the best-val_loss checkpoint.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT

        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [18]:
# Dense-network stacking features: *1 = last-epoch model, *2 = best checkpoint.
dnn_train1, dnn_test1, dnn_train2, dnn_test2 = get_dnn_feature(
    train_df=train,
    test_df=test,
)

단어 집합의 크기 : 30903
26
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.75884, saving model to dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.75884 to 0.57059, saving model to dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.57059 to 0.54372, saving model to dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.54372
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.54372
------------------
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.64982, saving model to dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.64982 to 0.50349, saving model to dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.50349 to 0.50302, saving model to dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.50302
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.50302
------------------
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.83348, saving model to dnn_model.h5
Epoch 2/10

Epoch 0000

In [19]:
def get_cnn_feature(train_df, test_df,rnd=1):
    """Build stacking features from a small 1-D CNN with 5-fold CV.

    For every fold a fresh model (Embedding -> Conv1D(32, 3) ->
    GlobalAveragePooling1D -> Dense(32) -> Dropout -> softmax) is trained with
    early stopping on val_loss while the best epoch is checkpointed to MODEL_P.
    Two feature sets are produced: one from the model as it stands when
    training stops, and one from the checkpoint with the lowest val_loss.

    Args:
        train_df: training frame passed to preprocessing_lstm.
        test_df: test frame passed to preprocessing_lstm.
        rnd: multiplier for the StratifiedKFold random_state (2333 * rnd).

    Returns:
        (train_pred, test_pred, best_val_train_pred, best_val_test_pred):
        the *_train_pred arrays hold out-of-fold class probabilities for every
        training row, the *_test_pred arrays hold test probabilities averaged
        over the folds. All have NUM_CLASSES columns.
    """
    FEAT_CNT = 5              # number of CV folds
    NUM_WORDS = 10000         # vocabulary size handed to preprocessing_lstm
    embedding_dim = 200
    # Own checkpoint file so this function never overwrites the one written
    # by get_dnn_feature.
    MODEL_P = 'cnn_model.h5'
    NUM_CLASSES = 7

    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)
    # Plain array so the fold indices below are applied positionally, whatever
    # index the label column carries.
    train_y = np.asarray(train_y)
    # Fixed width: every fold gets exactly NUM_CLASSES one-hot columns.
    train_y_onehot = to_categorical(train_y, num_classes=NUM_CLASSES)

    # Output buffers are sized from the encoded inputs rather than from
    # notebook-level constants, so the function works for any frames.
    train_pred = np.zeros((len(train_x), NUM_CLASSES))
    test_pred = np.zeros((len(test_x), NUM_CLASSES))
    best_val_train_pred = np.zeros((len(train_x), NUM_CLASSES))
    best_val_test_pred = np.zeros((len(test_x), NUM_CLASSES))

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)

    for train_index, test_index in skf.split(train_x,train_y):

        model = Sequential()
        # +2 accounts for the 'unk' (0) and 'pad' (1) ids.
        model.add(Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len))
        model.add(Conv1D(32,
                         3,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        model.fit(train_x[train_index], train_y_onehot[train_index],
                  validation_data=(train_x[test_index], train_y_onehot[test_index]),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )

        # Feature set 1: predictions of the model as left by the last epoch.
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT

        # Feature set 2: predictions of the best-val_loss checkpoint.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT

        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [20]:
# CNN stacking features: *1 = last-epoch model, *2 = best-val_loss checkpoint.
cnn_train1, cnn_test1, cnn_train2, cnn_test2 = get_cnn_feature(
    train_df=train,
    test_df=test,
)

단어 집합의 크기 : 30903
26
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.05923, saving model to dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 1.05923 to 0.68043, saving model to dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.68043 to 0.59587, saving model to dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.59587 to 0.57316, saving model to dnn_model.h5
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.57316
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.57316
------------------
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.04626, saving model to dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 1.04626 to 0.66506, saving model to dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.66506 to 0.58472, saving model to dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.58472 to 0.55385, saving model to dnn_model.h5
Epoch 5/10

Epoch 00005: val_loss improved from 0.55385 to 0.54962

In [84]:
# Load the KoBERT prediction files that were written to results/ beforehand,
# in the order train / test / best-train / best-test.
(kobert_train1, kobert_test1,
 kobert_train2, kobert_test2) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/kobert_train_pred.csv",
        "results/kobert_test_pred.csv",
        "results/kobert_best_train_pred.csv",
        "results/kobert_best_test_pred.csv",
    )
)

In [85]:
# Display the loaded KoBERT test predictions (the bare expression is rendered
# as the cell output).
kobert_test1

Unnamed: 0,0,1,2,3,4,5,6
0,7.342314,-0.205254,1.046838,0.006421,-2.038419,-3.073816,-2.568758
1,-1.420760,-2.314045,-0.398929,8.449143,-1.850893,-1.624655,0.115052
2,0.071454,0.151517,7.751219,-1.214466,-3.102966,-3.573472,-0.161871
3,6.981777,-1.558864,1.980390,-0.964467,-1.347920,-3.438751,-1.385832
4,-0.993988,-1.011461,0.392555,8.746573,-2.075460,-2.360103,-1.962877
...,...,...,...,...,...,...,...
9126,-1.439479,-1.662237,0.433224,8.780240,-1.937048,-2.112880,-1.229745
9127,-1.659509,-1.497266,8.066822,-1.182720,-0.850516,-2.286092,-0.614393
9128,-2.300084,-1.230400,4.157038,6.802824,-1.551959,-2.549319,-2.754171
9129,-0.369820,-1.331334,7.999159,-0.094570,-2.301545,-2.433198,-1.426672


In [55]:
# kobert_train1[["0","1","2","3","4","5","6"]] = kobert_train1[["0","1","2","3","4","5","6"]].apply(pd.to_numeric)
# # kobert_test1 = pd.read_csv("results/kobert_test_pred.csv")
# # kobert_train2 = pd.read_csv("results/kobert_best_train_pred.csv")
# # kobert_test2 = pd.read_csv("results/kobert_best_test_pred.csv")

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [57]:
# kobert_test1 = kobert_test1.drop(["Unnamed: 0"],axis=1)
# kobert_train2 = kobert_train2.drop(["Unnamed: 0"],axis=1)
# kobert_test2 = kobert_test2.drop(['Unnamed: 0'],axis=1)

In [86]:
# Convert the KoBERT prediction frames to plain NumPy arrays. np.asarray also
# accepts an array, so re-running this cell is harmless (calling `.values` a
# second time would fail because an ndarray has no such attribute).
kobert_train1 = np.asarray(kobert_train1)
kobert_test1 = np.asarray(kobert_test1)
kobert_train2 = np.asarray(kobert_train2)
kobert_test2 = np.asarray(kobert_test2)

In [68]:
# kobert_train1 = np.array(kobert_train1).astype(np.float)
# kobert_test1 = np.array(kobert_test1).astype(np.float)
# kobert_train2 = np.array(kobert_train2).astype(np.float)
# kobert_test2 = np.array(kobert_test2).astype(np.float)

ValueError: could not convert string to float: "tensor(-2.4922, device='cuda:0')"

In [87]:
# Column-wise concatenation of every model's prediction block; train and test
# use the same block order so the feature columns line up.
train_feature_blocks = [
    lstm_train1, lstm_train2,
    dnn_train1, dnn_train2,
    cnn_train1, cnn_train2,
    kobert_train1, kobert_train2,
]
all_nn_train = np.hstack(train_feature_blocks)

test_feature_blocks = [
    lstm_test1, lstm_test2,
    dnn_test1, dnn_test2,
    cnn_test1, cnn_test2,
    kobert_test1, kobert_test2,
]
all_nn_test = np.hstack(test_feature_blocks)

In [88]:
# Remove the id, text and token columns (plus the label for train) and keep
# whatever columns remain as the base feature matrices.
cols_to_drop = ['index', 'title','tokenized']
train_X = train.drop(columns=cols_to_drop + ['topic_idx']).values
test_X = test.drop(columns=cols_to_drop).values


# print(f_train_X.shape, f_test_X.shape)

In [89]:
# Append the stacked model predictions to the base matrices. The names are
# rebound, so running this cell twice appends the columns twice.
train_X, test_X = (
    np.hstack((train_X, all_nn_train)),
    np.hstack((test_X, all_nn_test)),
)

In [90]:
# Sanity check: both matrices must have the same number of feature columns.
print(*(matrix.shape for matrix in (train_X, test_X)))

(45654, 56) (9131, 56)


In [91]:
# Second-level model: 5-fold XGBoost on the stacked features. Three
# submissions are written: the plain fold average, an average weighted by
# 1 / validation log-loss, and the single fold with the lowest validation loss.
rnd = 42
k_cnt = 5

kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd)

test_pred = None
weighted_test_pred = None
org_train_pred = None
avg_k_score = 0
reverse_score = 0
best_loss = 100
best_single_pred = None

# Plain array so the fold indices are applied positionally.
train_Y = np.asarray(train_y)

params = {
        'colsample_bytree': 0.7,
        'subsample': 0.8,
        'eta': 0.04,
        'max_depth': 3,
        'eval_metric':'mlogloss',
        'objective':'multi:softprob',
        'num_class':7,
        'tree_method':'gpu_hist'
}

# These matrices do not depend on the fold, so they are built once.
d_test = xgb.DMatrix(test_X)
d_full_train = xgb.DMatrix(train_X)


def _write_submission(pred, path):
    """Fill class columns '0'..'6' of the sample submission with `pred` and save it to `path`."""
    frame = pd.read_csv("data/sample_submission.csv")
    for class_idx in range(7):
        frame[str(class_idx)] = pred[:, class_idx]
    frame.to_csv(path, index=False)
    return frame


for train_index, test_index in kf.split(train_X,train_Y):
    X_train, X_test = train_X[train_index], train_X[test_index]
    y_train, y_test = train_Y[train_index], train_Y[test_index]

    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_test, y_test)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    m = xgb.train(params, d_train, 2000, watchlist,
                    early_stopping_rounds=50,
                    verbose_eval=200)

    train_pred = m.predict(d_train)
    valid_pred = m.predict(d_valid)
    tmp_train_pred = m.predict(d_full_train)

    train_score = log_loss(y_train,train_pred)
    valid_score = log_loss(y_test,valid_pred)
    print('train log loss',train_score,'valid log loss',valid_score)
    avg_k_score += valid_score
    rev_valid_score = 1.0/valid_score
    reverse_score += rev_valid_score
    print('rev',rev_valid_score)

    curr_pred = m.predict(d_test)
    if test_pred is None:
        # Copy: test_pred is accumulated in place below and must not share
        # memory with best_single_pred.
        test_pred = curr_pred.copy()
        weighted_test_pred = curr_pred*rev_valid_score
        org_train_pred = tmp_train_pred
        best_loss = valid_score
        best_single_pred = curr_pred
    else:
        test_pred += curr_pred
        weighted_test_pred += curr_pred*rev_valid_score
        org_train_pred += tmp_train_pred

        if valid_score < best_loss:
            print('BETTER')
            best_loss = valid_score
            best_single_pred = curr_pred

test_pred = test_pred / k_cnt
test_pred = np.round(test_pred,4)
org_train_pred = org_train_pred / k_cnt
avg_k_score = avg_k_score/k_cnt

# plain average over the folds
submiss = _write_submission(test_pred, "results/xgb_{}_3.csv".format(k_cnt))
print(reverse_score)

# weighted average (weights = 1 / validation log-loss, normalised by their sum)
weighted_test_pred = weighted_test_pred / reverse_score
weighted_test_pred = np.round(weighted_test_pred,4)
submiss = _write_submission(weighted_test_pred, "results/weighted_{}_3.csv".format(k_cnt))

# best single fold; every class column '0'..'6' receives its own probabilities
best_single_rounded = np.round(best_single_pred,4)
submiss = _write_submission(best_single_rounded, "results/single_{}_3.csv".format(k_cnt))

# train log loss: org_train_pred averages each fold model's predictions on the
# full training matrix, so it includes rows the models were fitted on
print('local average valid loss',avg_k_score)
print('train log loss', log_loss(train_Y,org_train_pred))


[0]	train-mlogloss:1.81084	valid-mlogloss:1.81103
[200]	train-mlogloss:0.03099	valid-mlogloss:0.05008
[257]	train-mlogloss:0.02579	valid-mlogloss:0.05039
train log loss 0.025787835209833437 valid log loss 0.0503856437130458
rev 19.846923177069208
[0]	train-mlogloss:1.81082	valid-mlogloss:1.81087
[200]	train-mlogloss:0.03140	valid-mlogloss:0.04789
[266]	train-mlogloss:0.02546	valid-mlogloss:0.04799
train log loss 0.025461232586177083 valid log loss 0.04799155942389039
rev 20.83699742213828
BETTER
[0]	train-mlogloss:1.81089	valid-mlogloss:1.81098
[200]	train-mlogloss:0.03325	valid-mlogloss:0.04202
[292]	train-mlogloss:0.02502	valid-mlogloss:0.04184
train log loss 0.02502236755339625 valid log loss 0.0418393163584838
rev 23.900964141763016
BETTER
[0]	train-mlogloss:1.81090	valid-mlogloss:1.81103
[200]	train-mlogloss:0.03234	valid-mlogloss:0.04508
[276]	train-mlogloss:0.02539	valid-mlogloss:0.04531
train log loss 0.025392708434050164 valid log loss 0.045310528874580105
rev 22.0699255744290

In [92]:
# Read back previously written probability files.
# NOTE(review): these paths end in `_2`, while the cross-validation cell in
# this notebook writes files ending in `_3` — confirm which run is intended.
single, weighted, xgboosted = map(
    pd.read_csv,
    (
        "results/single_5_2.csv",
        "results/weighted_5_2.csv",
        "results/xgb_5_2.csv",
    ),
)

In [93]:
# Predicted class per row = position of the largest probability among the
# class columns "0".."6" (computed for all rows at once).
class_cols = ["0","1","2","3","4","5","6"]
idx_list = xgboosted[class_cols].values.argmax(axis=1).tolist()


In [94]:
# Start from the sample submission and fill in the predicted class ids.
submiss = pd.read_csv("data/sample_submission.csv").assign(topic_idx=idx_list)

In [95]:
# Preview the first five rows of the submission frame.
submiss.head(n=5)

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [96]:
# Persist the submission without the DataFrame index column.
submiss.to_csv(
    path_or_buf="results/xgboost_3.csv",
    index=False,
)

In [80]:
# Predicted class per row = position of the largest probability among the
# class columns "0".."6" (computed for all rows at once).
class_cols = ["0","1","2","3","4","5","6"]
idx_list = weighted[class_cols].values.argmax(axis=1).tolist()


In [81]:
# Start from the sample submission and fill in the predicted class ids.
submiss = pd.read_csv("data/sample_submission.csv").assign(topic_idx=idx_list)

In [82]:
# Preview the first five rows of the submission frame.
submiss.head(n=5)

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [83]:
# Persist the submission without the DataFrame index column.
submiss.to_csv(
    path_or_buf="results/weighted_2.csv",
    index=False,
)