In [2]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam

from keras.utils import np_utils

import warnings 
warnings.filterwarnings(action='ignore')

import gc
import xgboost as xgb

In [3]:
train = pd.read_csv("data/train_data.csv",encoding="utf-8",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)
submission = pd.read_csv("data/sample_submission.csv")

In [3]:
from konlpy.tag import Mecab  
tokenizer = Mecab()
train["tokenized"] = [tokenizer.morphs(sentence) for sentence in train["title"]]
test["tokenized"] = [tokenizer.morphs(sentence) for sentence in test["title"]]

In [4]:
train.head()

Unnamed: 0,index,title,topic_idx,tokenized
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4,"[인천, →, 핀란드, 항공기, 결항, …, 휴가철, 여행객, 분통]"
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4,"[실리콘밸리, 넘어서, 겠, 다, …, 구글, 15, 조, 원, 들여, 美, 전역,..."
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4,"[이란, 외무, 긴장, 완화, 해결책, 은, 미국, 이, 경제, 전쟁, 멈추, 는, 것]"
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4,"[NYT, 클린턴, 측근, 韓, 기업, 특수, 관계, 조명, …, 공과, 사, 맞물..."
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4,"[시진핑, 트럼프, 에, 중미, 무역, 협상, 조속, 타결, 희망]"


In [5]:
for tokenized in train["tokenized"]:
    for token in tokenized:
        if len(token) == 1:
            tokenized.remove(token)  
            
for tokenized in test["tokenized"]:
    for token in tokenized:
        if len(token) == 1:
            tokenized.remove(token)


In [1]:
vocab_list = pd.concat([train,test])
vocab_list = vocab_list["tokenized"]
vocab_list[:20]

NameError: name 'pd' is not defined

In [10]:
from nltk import FreqDist
vocab = FreqDist(np.hstack(vocab_list))
print('단어 집합의 크기 : {}'.format(len(vocab)))

단어 집합의 크기 : 31593


In [11]:
vocab_size = 5000
# 상위 vocab_size개의 단어만 보존
vocab = vocab.most_common(vocab_size)
print('단어 집합의 크기 : {}'.format(len(vocab)))

단어 집합의 크기 : 5000


In [12]:
word_to_index = {word[0] : index + 2 for index, word in enumerate(vocab)}
word_to_index['pad'] = 1
word_to_index['unk'] = 0

In [13]:
train_x = []
test_x = []

for line in train["tokenized"]: #입력 데이터에서 1줄씩 문장을 읽음
    temp = []
    for w in line: #각 줄에서 1개씩 글자를 읽음
        try:
            temp.append(word_to_index[w]) # 글자를 해당되는 정수로 변환
        except KeyError: # 단어 집합에 없는 단어일 경우 unk로 대체된다.
            temp.append(word_to_index['unk']) # unk의 인덱스로 변환

    train_x.append(temp)

for line in test["tokenized"]: #입력 데이터에서 1줄씩 문장을 읽음
    temp = []
    for w in line: #각 줄에서 1개씩 글자를 읽음
        try:
            temp.append(word_to_index[w]) # 글자를 해당되는 정수로 변환
        except KeyError: # 단어 집합에 없는 단어일 경우 unk로 대체된다.
            temp.append(word_to_index['unk']) # unk의 인덱스로 변환

    test_x.append(temp)

In [14]:
print(train_x[:20])
print(test_x[:20])

[[334, 0, 2535, 4771, 0, 2962, 0], [0, 0, 18, 295, 154, 9, 0, 1615, 2963], [54, 697, 985, 746, 0, 77, 84, 402, 4567, 125], [3582, 2657, 2964, 62, 1498, 450, 2247, 0, 0, 2], [550, 32, 0, 397, 202, 0, 2478, 793], [1649, 280, 0, 266, 2888, 254, 1207, 99], [477, 2061, 26, 2062, 1185, 241, 1262, 2889, 2, 15], [296, 335, 1475, 0, 0, 677, 32, 936, 538, 0, 174, 0], [812, 589, 766, 2353, 324, 960, 138, 145, 169], [710, 0, 13, 32, 3123, 339, 159, 3436, 85, 857], [0, 0, 1852, 1853, 3437, 551, 0], [54, 72, 961, 0, 455, 2536, 1762, 14, 1944], [0, 106, 678, 280, 803, 4171, 1291, 4772, 3215], [3836, 986, 1443, 1730, 241, 0, 0, 1499, 282], [95, 62, 68, 568, 16, 1894, 1071, 380], [0, 0, 1794, 643, 25, 47, 0, 215], [254, 1135, 0, 3708, 210, 1561, 202, 3124], [1263, 285, 30, 1650, 4172, 776, 393, 906], [285, 3583, 1035, 174, 987, 711, 804, 51, 1019], [0, 21, 0, 0, 0, 17, 320, 2658, 976]]
[[1399, 199, 57, 33, 0, 53, 1390, 333], [0, 1402, 0, 841, 414, 13, 3092], [133, 45, 166, 1097, 505, 2640, 4333, 2253, 

In [15]:
max_len = max(len(l) for l in train_x)
max_len = max(len(l) for l in test_x)
print(max_len)

18


In [16]:
for line in train_x:
    if len(line) < max_len: # 현재 샘플이 정해준 길이보다 짧으면
        line += [word_to_index['pad']] * (max_len - len(line)) # 나머지는 전부 'pad' 토큰으로 채운다.
        
for line in test_x:
    if len(line) < max_len: # 현재 샘플이 정해준 길이보다 짧으면
        line += [word_to_index['pad']] * (max_len - len(line)) # 나머지는 전부 'pad' 토큰으로 채운다.

In [17]:
print('리뷰의 최대 길이 : %d' % max(len(l) for l in train_x))
print('리뷰의 최소 길이 : %d' % min(len(l) for l in train_x))
print('리뷰의 평균 길이 : %f' % (sum(map(len, train_x))/len(train_x)))
      
print('리뷰의 최대 길이 : %d' % max(len(l) for l in test_x))
print('리뷰의 최소 길이 : %d' % min(len(l) for l in test_x))
print('리뷰의 평균 길이 : %f' % (sum(map(len, test_x))/len(test_x)))

리뷰의 최대 길이 : 18
리뷰의 최소 길이 : 18
리뷰의 평균 길이 : 18.000000
리뷰의 최대 길이 : 18
리뷰의 최소 길이 : 18
리뷰의 평균 길이 : 18.000000


In [18]:
# 종속변수 데이터 전처리
train_y = np_utils.to_categorical(train["topic_idx"]) # Y_train 에 원-핫 인코딩
print(train_y)
print(train_y.shape)

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
(45654, 7)


In [19]:
train_x = np.array(train_x)
test_x = np.array(test_x)
train_x

array([[ 334,    0, 2535, ...,    1,    1,    1],
       [   0,    0,   18, ...,    1,    1,    1],
       [  54,  697,  985, ...,    1,    1,    1],
       ...,
       [  23, 1880,  968, ...,    1,    1,    1],
       [2989,    4,    0, ...,    1,    1,    1],
       [ 968,    8,  244, ...,    1,    1,    1]])

In [20]:
#파라미터 설정
vocab_size = 5003 # 제일 많이 사용하는 사이즈
embedding_dim = 200  
max_length = 18    # 위에서 그래프 확인 후 정함
padding_type='post'
#oov_tok = "<OOV>"

In [21]:
# 양방향 LSTM 레이어를 사용한 모델 (model3) 정의
model = Sequential([Embedding(vocab_size, embedding_dim, input_length =max_length),
        tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
        tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
        tf.keras.layers.Bidirectional(LSTM(units = 64)),
        Dense(7, activation='softmax')    # 결과값이 0~4 이므로 Dense(5)
    ])
    
model.compile(loss= 'categorical_crossentropy', #여러개 정답 중 하나 맞추는 문제이므로 손실 함수는 categorical_crossentropy
              optimizer= 'adam',
              metrics = ['accuracy']) 
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 18, 200)           1000600   
_________________________________________________________________
bidirectional (Bidirectional (None, 18, 128)           135680    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 18, 128)           98816     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 7)                 903       
Total params: 1,334,815
Trainable params: 1,334,815
Non-trainable params: 0
_________________________________________________________________


In [23]:
train_y = train["topic_idx"]
len(train_x),len(train_y), len(test_x)

(45654, 45654, 9131)

In [29]:
train_pred_result, test_pred_result = np.zeros((45654,7)),np.zeros((9131,7))
best_val_train_pred, best_val_test_pred = np.zeros((45654,7)),np.zeros((9131,7))

In [33]:
# 계층 교차 검증
n_fold = 5  
seed = 42
MODEL_P = 'models/nn_model.h5'
FEAT_CNT = 5

cv = StratifiedKFold(n_splits = FEAT_CNT, shuffle=True, random_state=seed)

for i, (i_trn, i_val) in enumerate(cv.split(train_x, train_y), 1):
    print(f'training model for CV #{i}')
    
    model = Sequential([Embedding(vocab_size, embedding_dim, input_length =max_length),
            tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
            tf.keras.layers.Bidirectional(LSTM(units = 64, return_sequences = True)),
            tf.keras.layers.Bidirectional(LSTM(units = 64)),
            Dense(7, activation='softmax')
        ])

    model.compile(loss= 'categorical_crossentropy', 
                  optimizer= 'adam',
                  metrics = ['accuracy'])

    mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
    es=EarlyStopping(monitor='val_loss', patience=2)
    
    np.random.seed(42)
    model.fit(train_x[i_trn], 
        to_categorical(train_y[i_trn]),
        validation_data=(train_x[i_val], to_categorical(train_y[i_val])),
        epochs=10,
        batch_size=512,
        callbacks=[es, mc])
    # feature 생성 1
    train_pred_result[i_trn] = model.predict(train_x[i_trn])
    test_pred_result += model.predict(test_x)/FEAT_CNT

    # feature 생성 2
    model = load_model(MODEL_P)
    best_val_train_pred[i_val] = model.predict(train_x[i_val])
    best_val_test_pred += model.predict(test_x)/FEAT_CNT

    del model
    gc.collect()
    print('------------------')

training model for CV #1
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.61476, saving model to models/nn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.61476 to 0.54190, saving model to models/nn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.54190 to 0.53448, saving model to models/nn_model.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.53448
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.53448
------------------
training model for CV #2
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.66028, saving model to models/nn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.66028 to 0.56551, saving model to models/nn_model.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.56551
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.56551
------------------
training model for CV #3
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.61777, saving model to models/nn_model.h5
Epoch 2/10

Epoch 00002: val_

In [37]:
train_pred_result2, test_pred_result2 = np.zeros((45654,7)),np.zeros((9131,7))
best_val_train_pred2, best_val_test_pred2 = np.zeros((45654,7)),np.zeros((9131,7))

In [40]:
# 계층 교차 검증
n_fold = 5  
seed = 42
MODEL_P = 'models/dnn_model.h5'
FEAT_CNT = 5

cv = StratifiedKFold(n_splits = FEAT_CNT, shuffle=True, random_state=seed)

for i, (i_trn, i_val) in enumerate(cv.split(train_x, train_y), 1):
    print(f'training model for CV #{i}')
    
    model = Sequential([Embedding(vocab_size, embedding_dim, input_length =max_length),
            tf.keras.layers.GlobalAveragePooling1D(),
            Dense(128,activation="relu"),
            Dense(128,activation="relu"),
            Dropout(0.2),
            Dense(7, activation='softmax')
        ])

    model.compile(loss= 'categorical_crossentropy', 
                  optimizer= 'adam',
                  metrics = ['accuracy'])

    mc = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
    es=EarlyStopping(monitor='val_loss', patience=2)
    
    np.random.seed(42)
    model.fit(train_x[i_trn], 
        to_categorical(train_y[i_trn]),
        validation_data=(train_x[i_val], to_categorical(train_y[i_val])),
        epochs=10,
        batch_size=512,
        callbacks=[es, mc])
    # feature 생성 1
    train_pred_result2[i_trn] = model.predict(train_x[i_trn])
    test_pred_result2 += model.predict(test_x)/FEAT_CNT

    # feature 생성 2
    model = load_model(MODEL_P)
    best_val_train_pred2[i_val] = model.predict(train_x[i_val])
    best_val_test_pred2 += model.predict(test_x)/FEAT_CNT

    del model
    gc.collect()
    print('------------------')

training model for CV #1
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.00370, saving model to models/dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 1.00370 to 0.53219, saving model to models/dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.53219 to 0.51366, saving model to models/dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.51366
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.51366
------------------
training model for CV #2
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.95880, saving model to models/dnn_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.95880 to 0.54337, saving model to models/dnn_model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.54337 to 0.51876, saving model to models/dnn_model.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.51876
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.51876
------------------
training model for CV #3
Epoch 1/10

Epoch 0

In [41]:
all_nn_train = np.hstack([train_pred_result, best_val_train_pred, 
                        train_pred_result2, best_val_train_pred2
                        ])

all_nn_test = np.hstack([test_pred_result, best_val_test_pred, 
                        test_pred_result2, best_val_test_pred2
                        ])

In [44]:
train.head()

Unnamed: 0,index,title,topic_idx,tokenized
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4,"[인천, 핀란드, 항공기, 결항, 휴가철, 여행객, 분통]"
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4,"[실리콘밸리, 넘어서, 다, 구글, 15, 원, 들여, 전역, 거점]"
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4,"[이란, 외무, 긴장, 완화, 해결책, 미국, 경제, 전쟁, 멈추, 것]"
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4,"[NYT, 클린턴, 측근, 기업, 특수, 관계, 조명, 공과, 맞물려, 종합]"
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4,"[시진핑, 트럼프, 중미, 무역, 협상, 조속, 타결, 희망]"


In [45]:
test.head()

Unnamed: 0,index,title,tokenized
0,45654,유튜브 내달 2일까지 크리에이터 지원 공간 운영,"[유튜브, 내달, 일, 까지, 크리에이터, 지원, 공간, 운영]"
1,45655,어버이날 맑다가 흐려져…남부지방 옅은 황사,"[어버이날, 다가, 흐려져, 남부, 지방, 은, 황사]"
2,45656,내년부터 국가RD 평가 때 논문건수는 반영 않는다,"[내년, 부터, 국가, RD, 평가, 논문, 건수, 반영, 는다]"
3,45657,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것,"[김명자, 신임, 총, 회장, 원로, 젊, 과학자, 지혜, 모을]"
4,45658,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간,"[회색, 인간, 작가, 김동식, 심, 백, 새, 소설, 2, 출간]"


In [46]:
cols_to_drop = ['index', 'title','tokenized']
train_X = train.drop(cols_to_drop+['topic_idx'], axis=1).values
test_X = test.drop(cols_to_drop, axis=1).values


# print(f_train_X.shape, f_test_X.shape)

In [49]:
train_X = np.hstack([train_X, all_nn_train])
test_X = np.hstack([test_X, all_nn_test])

In [50]:
print(train_X.shape, test_X.shape)

(45654, 28) (9131, 28)


In [61]:
rnd = 42
k_cnt = FEAT_CNT

kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd)

test_pred = None
weighted_test_pred = None
org_train_pred = None
avg_k_score = 0
reverse_score = 0
best_loss = 100
best_single_pred = None

train_Y = train_y

for train_index, test_index in kf.split(train_X,train_Y):
    X_train, X_test = train_X[train_index], train_X[test_index]
    y_train, y_test = train_Y[train_index], train_Y[test_index]
    
    params = {
            'colsample_bytree': 0.7,
            'subsample': 0.8,
            'eta': 0.04,
            'max_depth': 3,
            'eval_metric':'mlogloss',
            'objective':'multi:softprob',
            'num_class':7,
            'tree_method':'gpu_hist'
    }
    
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_test, y_test)
    d_test = xgb.DMatrix(test_X)
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    m = xgb.train(params, d_train, 2000, watchlist, 
                    early_stopping_rounds=50,
                    verbose_eval=200)
    
    train_pred = m.predict(d_train)
    valid_pred = m.predict(d_valid)
    tmp_train_pred = m.predict(xgb.DMatrix(train_X))
    
    train_score = log_loss(y_train,train_pred)
    valid_score = log_loss(y_test,valid_pred)
    print('train log loss',train_score,'valid log loss',valid_score)
    avg_k_score += valid_score
    rev_valid_score = 1.0/valid_score
    reverse_score += rev_valid_score
    print('rev',rev_valid_score)
    
    if test_pred is None:
        test_pred = m.predict(d_test)
        weighted_test_pred = test_pred*rev_valid_score
        org_train_pred = tmp_train_pred
        best_loss = valid_score
        best_single_pred = test_pred
    else:
        curr_pred = m.predict(d_test)
        test_pred += curr_pred
        weighted_test_pred += curr_pred*rev_valid_score
        org_train_pred += tmp_train_pred

        if valid_score < best_loss:
            print('BETTER')
            best_loss = valid_score
            best_single_pred = curr_pred

test_pred = test_pred / k_cnt
test_pred = np.round(test_pred,4)
org_train_pred = org_train_pred / k_cnt
avg_k_score = avg_k_score/k_cnt

submiss=pd.read_csv("data/sample_submission.csv")
submiss['0']=test_pred[:,0]
submiss['1']=test_pred[:,1]
submiss['2']=test_pred[:,2]
submiss['3']=test_pred[:,3]
submiss['4']=test_pred[:,4]
submiss['5']=test_pred[:,5]
submiss['6']=test_pred[:,6]
submiss.to_csv("results/xgb_{}.csv".format(k_cnt),index=False)
print(reverse_score)

# weigthed
submiss=pd.read_csv("data/sample_submission.csv")
weighted_test_pred = weighted_test_pred / reverse_score
weighted_test_pred = np.round(weighted_test_pred,4)
submiss['0']=weighted_test_pred[:,0]
submiss['1']=weighted_test_pred[:,1]
submiss['2']=weighted_test_pred[:,2]
submiss['3']=weighted_test_pred[:,3]
submiss['4']=weighted_test_pred[:,4]
submiss['5']=weighted_test_pred[:,5]
submiss['6']=weighted_test_pred[:,6]
submiss.to_csv("results/weighted_{}.csv".format(k_cnt),index=False)

# best single
submiss=pd.read_csv("data/sample_submission.csv")
weighted_test_pred = np.round(best_single_pred,4)
submiss['0']=weighted_test_pred[:,0]
submiss['1']=weighted_test_pred[:,1]
submiss['2']=weighted_test_pred[:,2]
submiss['3']=weighted_test_pred[:,3]
submiss['4']=weighted_test_pred[:,4]
submiss['4']=weighted_test_pred[:,5]
submiss['4']=weighted_test_pred[:,6]
submiss.to_csv("results/single_{}.csv".format(k_cnt),index=False)

# train log loss
print('local average valid loss',avg_k_score)
print('train log loss', log_loss(train_Y,org_train_pred))


[0]	train-mlogloss:1.82537	valid-mlogloss:1.82500
[200]	train-mlogloss:0.15633	valid-mlogloss:0.17238
[400]	train-mlogloss:0.12645	valid-mlogloss:0.16221
[600]	train-mlogloss:0.10792	valid-mlogloss:0.16078
[643]	train-mlogloss:0.10457	valid-mlogloss:0.16079
train log loss 0.10457366169590804 valid log loss 0.16078699502335145
rev 6.219408477997662
[0]	train-mlogloss:1.82516	valid-mlogloss:1.82520
[200]	train-mlogloss:0.15516	valid-mlogloss:0.17783
[400]	train-mlogloss:0.12458	valid-mlogloss:0.16954
[600]	train-mlogloss:0.10568	valid-mlogloss:0.16918
[618]	train-mlogloss:0.10424	valid-mlogloss:0.16922
train log loss 0.10423648896318827 valid log loss 0.16922324570107186
rev 5.909353622530513
[0]	train-mlogloss:1.82513	valid-mlogloss:1.82502
[200]	train-mlogloss:0.15322	valid-mlogloss:0.18491
[400]	train-mlogloss:0.12326	valid-mlogloss:0.17714
[534]	train-mlogloss:0.11039	valid-mlogloss:0.17687
train log loss 0.1103092626503029 valid log loss 0.17688975450160221
rev 5.653238667313218
[0]

In [63]:
single = pd.read_csv("results/single_5.csv")
weighted = pd.read_csv("results/single_5.csv")
xgboosted = pd.read_csv("results/xgb_5.csv")

In [93]:
# xgboosted["topic_idx"] = [ np.argmax(li,axis=1) for li in xgboosted[["0","1","2","3","4","5","6"]] ]
idx_list = []
for i in xgboosted.index:
    val = xgboosted.loc[i,["0","1","2","3","4","5","6"]]
    val.to_list()
    idx = np.argmax(val)
    idx_list.append(idx)


In [94]:
idx_list[:10]

[2, 3, 6, 2, 3, 3, 5, 3, 4, 4]

In [95]:
submiss=pd.read_csv("data/sample_submission.csv")
submiss["topic_idx"] = idx_list

In [96]:
submiss.head()

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,6
3,45657,2
4,45658,3


In [98]:
submiss.to_csv("results/xgboost_1.csv",index=False)