In [62]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional, Conv1D, GlobalAveragePooling1D
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam

from keras.utils import np_utils

import warnings 
warnings.filterwarnings(action='ignore')

import gc
import xgboost as xgb

In [2]:
# Load the labelled training set, the unlabelled test set and the sample
# submission template from the local data/ directory.
train = pd.read_csv("data/train_data.csv",encoding="utf-8",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)
submission = pd.read_csv("data/sample_submission.csv")

In [3]:
from konlpy.tag import Mecab

# Split every headline into morphemes with the Mecab analyzer and store the
# resulting token lists in a new "tokenized" column of each frame.
tokenizer = Mecab()
train["tokenized"] = list(map(tokenizer.morphs, train["title"]))
test["tokenized"] = list(map(tokenizer.morphs, test["title"]))

In [4]:
# Stack the train and test rows and keep only their token lists; this is the
# corpus the vocabulary below is counted from.
vocab_list = pd.concat([train, test])["tokenized"]
vocab_list.head(20)

0                [인천, →, 핀란드, 항공기, 결항, …, 휴가철, 여행객, 분통]
1     [실리콘밸리, 넘어서, 겠, 다, …, 구글, 15, 조, 원, 들여, 美, 전역,...
2     [이란, 외무, 긴장, 완화, 해결책, 은, 미국, 이, 경제, 전쟁, 멈추, 는, 것]
3     [NYT, 클린턴, 측근, 韓, 기업, 특수, 관계, 조명, …, 공과, 사, 맞물...
4                 [시진핑, 트럼프, 에, 중미, 무역, 협상, 조속, 타결, 희망]
5     [팔레스타인, 가, 자, 지, 구서, 16, 세, 소년, 이스라엘, 군, 총격, 에...
6     [인도, 48, 년, 만, 에, 파키스탄, 공습, …, 테러, 캠프, 폭격, 종합,...
7     [美, 대선, TV, 토론, 음담패설, 만회, 실패, 트럼프, …, 사과, 대신, ...
8            [푸틴, 한반도, 상황, 진전, 위한, 방안, 김정은, 위원장, 과, 논의]
9     [특검, 면죄부, 받, 은, 트럼프, 스캔들, 보도, 언론, 맹공, …, 국민, 의...
10                   [日, 오키, 나와서, 열린, 강제, 징용, 노동자, 추도식]
11      [이란, 서, 최고, 지도자, 모욕, 혐의, 미국인, 에, 징역, 10, 년, 선고]
12    [카니발, 축제, 보, 러, 가, 자, …, 브라질, 리우, 에, 대형, 유람선, 행렬]
13         [美, 올랜도, 병원, 최악, 총기, 테러, 부상자, 치료비, 안, 받, 는다]
14              [日, 대, 기업, 올해, 평균, 2, ., 46, %, 임금, 인상]
15           [WMO, 엘니뇨, 여전히, 강력, …, 2, 분기, 엔, 소멸, 될, 듯]
16          [이스라엘, 네타냐후, 유대교, 도, 병역, 문제, 로, 연정, 협상, 진통]
17         [UAE, 사우디, 이, 어, 美, 호르무즈, 호위, 연합, 에, 

In [5]:
from nltk import FreqDist
# Flatten all token lists into one array and count how often each token occurs.
vocab = FreqDist(np.hstack(vocab_list))
# Prints the number of distinct tokens (the Korean label reads "vocabulary size").
print('단어 집합의 크기 : {}'.format(len(vocab)))

단어 집합의 크기 : 31861


In [6]:
vocab_size = 5000
# Keep only the vocab_size most frequent tokens.
# NOTE(review): `vocab` is rebound here from the frequency counter to the
# result of most_common(); presumably this cell cannot be re-run on its own
# without first re-running the cell that builds the counter — confirm.
vocab = vocab.most_common(vocab_size)
# Prints the number of retained tokens (the Korean label reads "vocabulary size").
print('단어 집합의 크기 : {}'.format(len(vocab)))

단어 집합의 크기 : 5000


In [7]:
# Give each retained token an integer id by frequency rank, starting at 2
# because ids 1 and 0 are reserved for the 'pad' and 'unk' entries.
word_to_index = {token: rank for rank, (token, *_) in enumerate(vocab, start=2)}
word_to_index.update({'pad': 1, 'unk': 0})

In [8]:
# Turn every token list into a list of vocabulary ids. Tokens that have no
# entry in word_to_index fall back to the id stored under 'unk'.
train_x = [
    [word_to_index.get(token, word_to_index['unk']) for token in tokens]
    for tokens in train["tokenized"]
]

test_x = [
    [word_to_index.get(token, word_to_index['unk']) for token in tokens]
    for tokens in test["tokenized"]
]

In [9]:
# Longest encoded sequence across BOTH splits. Measuring only the test split
# would leave any longer training row unpadded, so the rows would no longer
# share one length when they are converted to an array.
max_len = max(len(seq) for seq in [*train_x, *test_x])
print(max_len)

26


In [10]:
# Right-pad every sequence that is shorter than max_len with the 'pad' id.
# The inner lists are extended in place.
for split in (train_x, test_x):
    for seq in split:
        missing = max_len - len(seq)
        if missing > 0:
            seq.extend([word_to_index['pad']] * missing)

In [11]:
# One-hot encode the topic labels. Uses the to_categorical helper already
# imported from tensorflow.keras.utils, the same one the model functions in
# this notebook call, instead of the separate keras.utils.np_utils module.
train_y = to_categorical(train["topic_idx"])

In [12]:
# Convert the padded id lists of both splits into NumPy arrays.
train_x, test_x = np.array(train_x), np.array(test_x)

In [13]:
# Hyperparameters for an embedding-based model.
# NOTE(review): vocab_size is reassigned to 5003 here after being 5000 above,
# and none of these four names is read by the later cells visible in this
# notebook (the feature functions define their own local values) — confirm
# whether this cell is still needed.
vocab_size = 5003
embedding_dim = 200  
max_length = max_len
padding_type='post'

In [18]:
# Shared problem-size constants.
NUM_CLASSES = 7                # width of the softmax output used by the models below
# The get_*_feature functions size their prediction buffers with NUM_LABELS,
# which was not defined anywhere in the notebook; define it next to
# NUM_CLASSES so a fresh kernel can run them.
NUM_LABELS = NUM_CLASSES
NUM_TRAIN_DATA = len(train)    # number of training rows
NUM_TEST_DATA = len(test)      # number of test rows
VOCAB_SIZE = 5000

In [39]:
def preprocessing_lstm(train_df, test_df, vocab_size):
    """Tokenize, integer-encode and pad the headlines of both splits.

    The vocabulary is counted on the training titles only. Token ids are
    assigned by frequency rank starting at 2; id 1 is the padding id and id 0
    is the id for tokens outside the vocabulary.

    Args:
        train_df: DataFrame with a "title" and a "topic_idx" column. A
            "tokenized" column is added (or overwritten) in place.
        test_df: DataFrame with a "title" column. A "tokenized" column is
            added (or overwritten) in place.
        vocab_size: Number of most frequent training tokens that receive
            their own id.

    Returns:
        Tuple ``(train_x, train_y, test_x, max_len)`` where ``train_x`` and
        ``test_x`` are integer arrays right-padded to ``max_len`` columns,
        ``train_y`` is the "topic_idx" column of ``train_df`` re-indexed
        0..n-1, and ``max_len`` is the longest sequence over both splits.
    """
    tokenizer = Mecab()
    train_df["tokenized"] = [tokenizer.morphs(sentence) for sentence in train_df["title"]]
    test_df["tokenized"] = [tokenizer.morphs(sentence) for sentence in test_df["title"]]

    vocab = FreqDist(np.hstack(train_df["tokenized"]))
    print('단어 집합의 크기 : {}'.format(len(vocab)))

    vocab = vocab.most_common(vocab_size)

    word_to_index = {word[0]: index + 2 for index, word in enumerate(vocab)}
    word_to_index['pad'] = 1
    word_to_index['unk'] = 0
    pad_id = word_to_index['pad']
    unk_id = word_to_index['unk']

    def _encode(token_lists):
        # Map each token to its id; unknown tokens become unk_id.
        return [[word_to_index.get(w, unk_id) for w in line] for line in token_lists]

    # Encode the frames that were passed in, not the notebook-level globals.
    train_x = _encode(train_df["tokenized"])
    test_x = _encode(test_df["tokenized"])

    # The padding target must cover the longest row of BOTH splits; otherwise
    # longer rows of the other split stay ragged.
    max_len = max(len(l) for l in [*train_x, *test_x])
    print(max_len)

    for line in [*train_x, *test_x]:
        # A non-positive difference multiplies to an empty list, so rows that
        # are already max_len long are left unchanged.
        line.extend([pad_id] * (max_len - len(line)))

    # Labels are returned as class ids (not one-hot). Re-index so positional
    # fold indices used by callers line up even if train_df has a custom index.
    train_y = train_df["topic_idx"].reset_index(drop=True)

    train_x = np.array(train_x)
    test_x = np.array(test_x)

    return train_x, train_y, test_x, max_len
    

In [40]:
# Rebuild the encoded arrays with preprocessing_lstm. This rebinds train_x,
# train_y, test_x and max_len; train_y is now the raw topic_idx column rather
# than the one-hot matrix created earlier.
train_x, train_y, test_x, max_len = preprocessing_lstm(train, test, 5000)

단어 집합의 크기 : 30903
26


In [47]:
def get_lstm_feature(train_df, test_df,rnd=1):
    """Build stacking features from a stacked bidirectional-LSTM classifier.

    Runs stratified K-fold cross-validation. In every fold a fresh model is
    trained on the training part and used to predict the held-out part and
    the test set, once with the weights it has when training stops and once
    with the checkpoint that reached the lowest validation loss.

    Args:
        train_df: Training frame, forwarded to preprocessing_lstm.
        test_df: Test frame, forwarded to preprocessing_lstm.
        rnd: Multiplier for the fold-shuffling seed (seed is 2333 * rnd).

    Returns:
        Tuple ``(train_pred, test_pred, best_val_train_pred,
        best_val_test_pred)``. The ``train`` arrays hold out-of-fold class
        probabilities for every training row; the ``test`` arrays hold the
        fold-averaged class probabilities for every test row.
    """
    FEAT_CNT = 5                 # number of cross-validation folds
    NUM_WORDS = 10000            # vocabulary size passed to preprocessing_lstm
    embedding_dim = 200
    MODEL_P = 'lstm_model.h5'    # checkpoint file, overwritten in every fold
    NUM_CLASSES = 7

    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)
    # Plain array of class ids so the positional fold indices can index it directly.
    labels = np.asarray(train_y)

    # Size the buffers from the encoded data and the local class count rather
    # than from notebook-level globals (NUM_LABELS was never defined).
    train_pred = np.zeros((len(train_x), NUM_CLASSES))
    test_pred = np.zeros((len(test_x), NUM_CLASSES))
    best_val_train_pred = np.zeros((len(train_x), NUM_CLASSES))
    best_val_test_pred = np.zeros((len(test_x), NUM_CLASSES))

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)

    for train_index, test_index in skf.split(train_x, labels):

        # +2 because ids 0 and 1 are reserved in addition to the NUM_WORDS token ids.
        model = Sequential([Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len),
                tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
                tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
                tf.keras.layers.Bidirectional(LSTM(units = 64)),
                Dense(NUM_CLASSES, activation='softmax')
            ])

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        model.summary()

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        # num_classes pins the one-hot width so it always matches the softmax layer.
        model.fit(train_x[train_index], to_categorical(labels[train_index], num_classes=NUM_CLASSES),
                  validation_data=(train_x[test_index], to_categorical(labels[test_index], num_classes=NUM_CLASSES)),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )

        # Feature set 1: predictions of the model as it is when training stops.
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT

        # Feature set 2: predictions of the checkpoint with the lowest val_loss.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT

        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [48]:
# Out-of-fold / test features from the LSTM model: the *1 arrays come from the
# model as it stands when training stops, the *2 arrays from the checkpoint
# with the lowest validation loss.
lstm_train1, lstm_test1, lstm_train2, lstm_test2 = get_lstm_feature(train, test)

단어 집합의 크기 : 30903
26
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 26, 200)           2000400   
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 26, 128)           135680    
_________________________________________________________________
bidirectional_25 (Bidirectio (None, 26, 128)           98816     
_________________________________________________________________
bidirectional_26 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dense_7 (Dense)              (None, 7)                 903       
Total params: 2,334,615
Trainable params: 2,334,615
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.58175, saving model to lstm_mode


Epoch 00001: val_loss improved from inf to 0.63464, saving model to lstm_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.63464 to 0.54841, saving model to lstm_model.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.54841
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.54841
------------------


In [53]:
def get_dnn_feature(train_df, test_df,rnd=1):
    """Build stacking features from an embedding + dense-layer classifier.

    Runs stratified K-fold cross-validation. In every fold a fresh model
    (embedding, global average pooling, two dense layers, dropout, softmax)
    is trained on the training part and used to predict the held-out part
    and the test set, once with the weights it has when training stops and
    once with the checkpoint that reached the lowest validation loss.

    Args:
        train_df: Training frame, forwarded to preprocessing_lstm.
        test_df: Test frame, forwarded to preprocessing_lstm.
        rnd: Multiplier for the fold-shuffling seed (seed is 2333 * rnd).

    Returns:
        Tuple ``(train_pred, test_pred, best_val_train_pred,
        best_val_test_pred)``. The ``train`` arrays hold out-of-fold class
        probabilities for every training row; the ``test`` arrays hold the
        fold-averaged class probabilities for every test row.
    """
    FEAT_CNT = 5                 # number of cross-validation folds
    NUM_WORDS = 10000            # vocabulary size passed to preprocessing_lstm
    embedding_dim = 200
    MODEL_P = 'dnn_model.h5'     # checkpoint file, overwritten in every fold
    NUM_CLASSES = 7

    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)
    # Plain array of class ids so the positional fold indices can index it directly.
    labels = np.asarray(train_y)

    # Size the buffers from the encoded data and the local class count rather
    # than from notebook-level globals (NUM_LABELS was never defined).
    train_pred = np.zeros((len(train_x), NUM_CLASSES))
    test_pred = np.zeros((len(test_x), NUM_CLASSES))
    best_val_train_pred = np.zeros((len(train_x), NUM_CLASSES))
    best_val_test_pred = np.zeros((len(test_x), NUM_CLASSES))

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)

    for train_index, test_index in skf.split(train_x, labels):

        # +2 because ids 0 and 1 are reserved in addition to the NUM_WORDS token ids.
        model = Sequential([Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len),
                tf.keras.layers.GlobalAveragePooling1D(),
                Dense(128,activation="relu"),
                Dense(128,activation="relu"),
                Dropout(0.2),
                Dense(NUM_CLASSES, activation='softmax')
            ])

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        # num_classes pins the one-hot width so it always matches the softmax layer.
        model.fit(train_x[train_index], to_categorical(labels[train_index], num_classes=NUM_CLASSES),
                  validation_data=(train_x[test_index], to_categorical(labels[test_index], num_classes=NUM_CLASSES)),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )

        # Feature set 1: predictions of the model as it is when training stops.
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT

        # Feature set 2: predictions of the checkpoint with the lowest val_loss.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT

        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [54]:
# Out-of-fold / test features from the dense model: the *1 arrays come from the
# model as it stands when training stops, the *2 arrays from the checkpoint
# with the lowest validation loss.
dnn_train1, dnn_test1, dnn_train2, dnn_test2 = get_dnn_feature(train, test)

단어 집합의 크기 : 30903
26
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.69109, saving model to dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.69109 to 0.57535, saving model to dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.57535 to 0.56468, saving model to dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.56468
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.56468
------------------
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.71615, saving model to dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.71615 to 0.54140, saving model to dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.54140 to 0.53950, saving model to dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.53950
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.53950
------------------
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.72588, saving model to dnn_model.h5
Epoch 2/10

Epoch 0000

In [63]:
def get_cnn_feature(train_df, test_df,rnd=1):
    """Build stacking features from an embedding + 1-D convolution classifier.

    Runs stratified K-fold cross-validation. In every fold a fresh model
    (embedding, Conv1D, global average pooling, dense, dropout, softmax) is
    trained on the training part and used to predict the held-out part and
    the test set, once with the weights it has when training stops and once
    with the checkpoint that reached the lowest validation loss.

    Args:
        train_df: Training frame, forwarded to preprocessing_lstm.
        test_df: Test frame, forwarded to preprocessing_lstm.
        rnd: Multiplier for the fold-shuffling seed (seed is 2333 * rnd).

    Returns:
        Tuple ``(train_pred, test_pred, best_val_train_pred,
        best_val_test_pred)``. The ``train`` arrays hold out-of-fold class
        probabilities for every training row; the ``test`` arrays hold the
        fold-averaged class probabilities for every test row.
    """
    FEAT_CNT = 5                 # number of cross-validation folds
    NUM_WORDS = 10000            # vocabulary size passed to preprocessing_lstm
    embedding_dim = 200
    # Own checkpoint file: the previous value 'dnn_model.h5' made this function
    # overwrite the checkpoint written by get_dnn_feature.
    MODEL_P = 'cnn_model.h5'
    NUM_CLASSES = 7

    train_x, train_y, test_x, max_len = preprocessing_lstm(train_df, test_df, NUM_WORDS)
    # Plain array of class ids so the positional fold indices can index it directly.
    labels = np.asarray(train_y)

    # Size the buffers from the encoded data and the local class count rather
    # than from notebook-level globals (NUM_LABELS was never defined).
    train_pred = np.zeros((len(train_x), NUM_CLASSES))
    test_pred = np.zeros((len(test_x), NUM_CLASSES))
    best_val_train_pred = np.zeros((len(train_x), NUM_CLASSES))
    best_val_test_pred = np.zeros((len(test_x), NUM_CLASSES))

    skf = StratifiedKFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)

    for train_index, test_index in skf.split(train_x, labels):

        model = Sequential()
        # +2 because ids 0 and 1 are reserved in addition to the NUM_WORDS token ids.
        model.add(Embedding(NUM_WORDS+2, embedding_dim, input_length=max_len))
        model.add(Conv1D(32,
                         3,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        es = EarlyStopping(monitor='val_loss', patience=2)

        np.random.seed(42)
        # num_classes pins the one-hot width so it always matches the softmax layer.
        model.fit(train_x[train_index], to_categorical(labels[train_index], num_classes=NUM_CLASSES),
                  validation_data=(train_x[test_index], to_categorical(labels[test_index], num_classes=NUM_CLASSES)),
                  batch_size=256, epochs=10,
                  verbose=1,
                  callbacks=[mc,es],
                  shuffle=False
                 )

        # Feature set 1: predictions of the model as it is when training stops.
        train_pred[test_index] = model.predict(train_x[test_index])
        test_pred += model.predict(test_x)/FEAT_CNT

        # Feature set 2: predictions of the checkpoint with the lowest val_loss.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(train_x[test_index])
        best_val_test_pred += model.predict(test_x)/FEAT_CNT

        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

In [64]:
# Out-of-fold / test features from the convolutional model: the *1 arrays come
# from the model as it stands when training stops, the *2 arrays from the
# checkpoint with the lowest validation loss.
cnn_train1, cnn_test1, cnn_train2, cnn_test2 = get_cnn_feature(train, test)

단어 집합의 크기 : 30903
26
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.21708, saving model to dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 1.21708 to 0.72043, saving model to dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.72043 to 0.62686, saving model to dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.62686 to 0.60078, saving model to dnn_model.h5
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.60078
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.60078
------------------
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.22004, saving model to dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 1.22004 to 0.77753, saving model to dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.77753 to 0.66067, saving model to dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.66067 to 0.61270, saving model to dnn_model.h5
Epoch 5/10

Epoch 00005: val_loss improved from 0.61270 to 0.59724

In [65]:
# Place the six per-model probability blocks side by side (column-wise) so
# each row carries every neural-network feature; train and test use the same
# block order.
all_nn_train = np.concatenate(
    [lstm_train1, lstm_train2, dnn_train1, dnn_train2, cnn_train1, cnn_train2],
    axis=1,
)

all_nn_test = np.concatenate(
    [lstm_test1, lstm_test2, dnn_test1, dnn_test2, cnn_test1, cnn_test2],
    axis=1,
)

In [70]:
# Identifier and text columns are removed before the frames are turned into
# plain arrays; the label column is removed from the training frame as well.
cols_to_drop = ['index', 'title','tokenized']
train_X = train.drop(columns=cols_to_drop + ['topic_idx']).values
test_X = test.drop(columns=cols_to_drop).values


# print(f_train_X.shape, f_test_X.shape)

In [71]:
# Append the neural-network probability features to the remaining columns.
# NOTE(review): train_X / test_X are rebound from their own previous values,
# so running this cell twice appends the features twice — re-run the cell
# that creates train_X / test_X first.
train_X = np.hstack([train_X, all_nn_train])
test_X = np.hstack([test_X, all_nn_test])

In [72]:
# Sanity check: both matrices must have the same number of columns.
print(train_X.shape, test_X.shape)

(45654, 42) (9131, 42)


In [73]:
# Second-stage model: stratified K-fold XGBoost on the stacked features.
# Three probability submissions are written: the plain fold average, an
# average weighted by 1 / validation log loss, and the single best fold.
rnd = 42
k_cnt = 5


def _write_proba_submission(proba, path):
    """Write one probability column per class into the sample-submission layout.

    Column j of ``proba`` is stored under the column name ``str(j)``.
    """
    out = pd.read_csv("data/sample_submission.csv")
    for class_idx in range(proba.shape[1]):
        out[str(class_idx)] = proba[:, class_idx]
    out.to_csv(path, index=False)


kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd)

test_pred = None            # running sum of test probabilities over folds
weighted_test_pred = None   # running sum weighted by 1 / validation loss
org_train_pred = None       # running sum of predictions on the full training matrix
avg_k_score = 0
reverse_score = 0           # sum of the fold weights (1 / validation loss)
best_loss = 100
best_single_pred = None     # test probabilities of the fold with the lowest validation loss

train_Y = train_y

# Loop-invariant objects are built once instead of once per fold.
params = {
        'colsample_bytree': 0.7,
        'subsample': 0.8,
        'eta': 0.04,
        'max_depth': 3,
        'eval_metric':'mlogloss',
        'objective':'multi:softprob',
        'num_class':7,
        'tree_method':'gpu_hist'
}
d_test = xgb.DMatrix(test_X)
d_full_train = xgb.DMatrix(train_X)

for train_index, test_index in kf.split(train_X,train_Y):
    X_train, X_test = train_X[train_index], train_X[test_index]
    y_train, y_test = train_Y[train_index], train_Y[test_index]

    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_test, y_test)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    m = xgb.train(params, d_train, 2000, watchlist,
                    early_stopping_rounds=50,
                    verbose_eval=200)

    train_pred = m.predict(d_train)
    valid_pred = m.predict(d_valid)
    tmp_train_pred = m.predict(d_full_train)

    train_score = log_loss(y_train,train_pred)
    valid_score = log_loss(y_test,valid_pred)
    print('train log loss',train_score,'valid log loss',valid_score)
    avg_k_score += valid_score
    rev_valid_score = 1.0/valid_score
    reverse_score += rev_valid_score
    print('rev',rev_valid_score)

    if test_pred is None:
        test_pred = m.predict(d_test)
        weighted_test_pred = test_pred*rev_valid_score
        org_train_pred = tmp_train_pred
        best_loss = valid_score
        # Copy: test_pred is accumulated in place below, and without a copy
        # the "best single" prediction would silently become the fold sum.
        best_single_pred = test_pred.copy()
    else:
        curr_pred = m.predict(d_test)
        test_pred += curr_pred
        weighted_test_pred += curr_pred*rev_valid_score
        org_train_pred += tmp_train_pred

        if valid_score < best_loss:
            print('BETTER')
            best_loss = valid_score
            best_single_pred = curr_pred

test_pred = test_pred / k_cnt
test_pred = np.round(test_pred,4)
org_train_pred = org_train_pred / k_cnt
avg_k_score = avg_k_score/k_cnt

# plain fold average
_write_proba_submission(test_pred, "results/xgb_{}_2.csv".format(k_cnt))
print(reverse_score)

# weighted average (weights sum to reverse_score)
weighted_test_pred = weighted_test_pred / reverse_score
weighted_test_pred = np.round(weighted_test_pred,4)
_write_proba_submission(weighted_test_pred, "results/weighted_{}_2.csv".format(k_cnt))

# best single fold; every class column is written (previously classes 5 and 6
# were written into column '4' and columns '5' / '6' were left untouched)
best_single_pred = np.round(best_single_pred,4)
_write_proba_submission(best_single_pred, "results/single_{}_2.csv".format(k_cnt))

# Summary metrics. org_train_pred averages predictions of all fold models on
# the full training matrix, so most rows were seen during training.
print('local average valid loss',avg_k_score)
print('train log loss', log_loss(train_Y,org_train_pred))


[0]	train-mlogloss:1.84640	valid-mlogloss:1.84626
[200]	train-mlogloss:0.41434	valid-mlogloss:0.43447
[400]	train-mlogloss:0.37831	valid-mlogloss:0.42658
[600]	train-mlogloss:0.35116	valid-mlogloss:0.42510
[700]	train-mlogloss:0.33908	valid-mlogloss:0.42507
train log loss 0.33896605335426017 valid log loss 0.42508008223199656
rev 2.352497898158937
[0]	train-mlogloss:1.84580	valid-mlogloss:1.84683
[200]	train-mlogloss:0.40977	valid-mlogloss:0.45171
[400]	train-mlogloss:0.37392	valid-mlogloss:0.44345
[600]	train-mlogloss:0.34636	valid-mlogloss:0.44210
[706]	train-mlogloss:0.33387	valid-mlogloss:0.44219
train log loss 0.33375851362396747 valid log loss 0.4421902112723416
rev 2.2614702327368064
[0]	train-mlogloss:1.84581	valid-mlogloss:1.84672
[200]	train-mlogloss:0.40983	valid-mlogloss:0.45437
[400]	train-mlogloss:0.37321	valid-mlogloss:0.44742
[600]	train-mlogloss:0.34592	valid-mlogloss:0.44645
[668]	train-mlogloss:0.33783	valid-mlogloss:0.44667
train log loss 0.33783024450705385 valid l

In [74]:
# Reload the three probability submissions written by the XGBoost cell.
single = pd.read_csv("results/single_5_2.csv")
weighted = pd.read_csv("results/weighted_5_2.csv")
xgboosted = pd.read_csv("results/xgb_5_2.csv")

In [76]:
# Predicted class per row = position of the largest probability among the
# seven class columns. Computed in one vectorized call instead of a per-row
# loop; the unused val.to_list() call was dropped.
idx_list = xgboosted[["0","1","2","3","4","5","6"]].to_numpy().argmax(axis=1).tolist()


In [77]:
# Start from the sample submission and fill in the predicted class ids.
submiss=pd.read_csv("data/sample_submission.csv")
submiss["topic_idx"] = idx_list

In [78]:
# Preview the first rows of the submission frame.
submiss.head()

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [79]:
# Save the class-id submission built from the plain fold average.
submiss.to_csv("results/xgboost_2.csv",index=False)

In [80]:
# Predicted class per row = position of the largest probability among the
# seven class columns of the weighted average. Computed in one vectorized
# call instead of a per-row loop; the unused val.to_list() call was dropped.
idx_list = weighted[["0","1","2","3","4","5","6"]].to_numpy().argmax(axis=1).tolist()


In [81]:
# Start from the sample submission and fill in the predicted class ids.
submiss=pd.read_csv("data/sample_submission.csv")
submiss["topic_idx"] = idx_list

In [82]:
# Preview the first rows of the submission frame.
submiss.head()

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [83]:
# Save the class-id submission built from the loss-weighted fold average.
submiss.to_csv("results/weighted_2.csv",index=False)