http://blog.naver.com/PostView.nhn?blogId=hist0134&logNo=220944328300

In [13]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler

Using TensorFlow backend.


In [7]:
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'


In [37]:
# 댓글의 단어 최대 갯수
NUM_MODELS = 2
BATCH_SIZE = 512
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
MAX_LEN = 220
MAX_LEN = 220

In [3]:
train_df = pd.read_csv('./input/train.csv')
test_df = pd.read_csv('./input/test.csv')

In [9]:
# 댓글 리스트
x_train = train_df[TEXT_COLUMN].astype(str)

# 독성 리스트
y_train = train_df[TARGET_COLUMN].values

# 독성 종류 리스트
y_aux_train = train_df[AUX_COLUMNS].values

# test 댓글 리스트
x_test = test_df[TEXT_COLUMN].astype(str)


In [34]:
# 정체성과 독성을 0.5를 기준으로 True False 로 나누어 저장한다.

for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column]>=0.5, True, False)

In [35]:
train_df

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,False,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
1,59849,False,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
2,59852,False,This is such an urgent design problem; kudos t...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
3,59855,False,Is this something I'll be able to install on m...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
4,59856,True,haha you guys are a bunch of losers.,0.021277,0.000000,0.021277,0.872340,0.0,0.0,0.000000,...,2006,rejected,0,0,0,1,0,0.000000,4,47
5,59859,True,ur a sh*tty comment.,0.047619,0.638095,0.000000,0.333333,0.0,,,...,2006,rejected,0,0,0,0,0,0.009524,0,105
6,59861,False,hahahahahahahahhha suck it.,0.050847,0.305085,0.000000,0.254237,0.0,,,...,2006,rejected,0,0,0,0,0,0.220339,0,59
7,59863,False,FFFFUUUUUUUUUUUUUUU,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
8,239575,False,The ranchers seem motivated by mostly by greed...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,26662,approved,0,0,0,0,0,0.000000,0,4
9,239576,False,It was a great show. Not a combo I'd of expect...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,26650,approved,0,0,0,1,0,0.000000,0,4


In [14]:
tokenizer = text.Tokenizer(filters = CHARS_TO_REMOVE)

In [16]:
tokenizer.fit_on_texts(list(x_train) + list(x_test))

In [20]:
# 단어에 인덱스를 붙인다.
tokenizer.word_index

{'the': 1,
 'to': 2,
 'and': 3,
 'of': 4,
 'a': 5,
 'is': 6,
 'in': 7,
 'that': 8,
 'it': 9,
 'i': 10,
 'you': 11,
 'for': 12,
 's': 13,
 'are': 14,
 'not': 15,
 'be': 16,
 't': 17,
 'on': 18,
 'have': 19,
 'they': 20,
 'this': 21,
 'with': 22,
 'as': 23,
 'was': 24,
 'or': 25,
 'we': 26,
 'he': 27,
 'but': 28,
 'if': 29,
 'all': 30,
 'what': 31,
 'will': 32,
 'their': 33,
 'by': 34,
 'who': 35,
 'so': 36,
 'from': 37,
 'your': 38,
 'at': 39,
 'no': 40,
 'can': 41,
 'do': 42,
 'people': 43,
 'would': 44,
 'about': 45,
 'has': 46,
 'there': 47,
 'an': 48,
 'more': 49,
 'his': 50,
 'one': 51,
 'just': 52,
 'like': 53,
 'trump': 54,
 'out': 55,
 'when': 56,
 'up': 57,
 'don': 58,
 'our': 59,
 'how': 60,
 'them': 61,
 'my': 62,
 'get': 63,
 'should': 64,
 'than': 65,
 'us': 66,
 'been': 67,
 'were': 68,
 'only': 69,
 'time': 70,
 'any': 71,
 'some': 72,
 'other': 73,
 'because': 74,
 'now': 75,
 'think': 76,
 'those': 77,
 'why': 78,
 'many': 79,
 'know': 80,
 'good': 81,
 'then': 82,
 'wh

In [18]:
# 인덱스로 문장을 변환시킨다.

x_train = tokenizer.texts_to_sequences(x_train)

In [25]:
x_train[0]

[21,
 6,
 36,
 2225,
 9,
 13,
 53,
 44,
 11,
 107,
 38,
 1059,
 2,
 194,
 21,
 126,
 167,
 365,
 96,
 226]

In [None]:
x_test = tokenizer.texts_to_sequences(x_test)

In [30]:
# 가장 긴 문장을 기준으로 인덱스의 갯수를 정하고, 부족한 인덱스는 0으로 채운다

x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [32]:
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [38]:

sample_weights = np.ones(len(x_train), dtype=np.float32)
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
sample_weights /= sample_weights.mean()

embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

checkpoint_predictions = []
weights = []

for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix, y_aux_train.shape[-1])
    for global_epoch in range(EPOCHS):
        model.fit(
            x_train,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=2,
            sample_weight=[sample_weights.values, np.ones_like(sample_weights)],
            callbacks=[
                LearningRateScheduler(lambda _: 1e-3 * (0.55 ** global_epoch))
            ]
        )
        checkpoint_predictions.append(model.predict(x_test, batch_size=2048)[0].flatten())
        weights.append(2 ** global_epoch)

predictions = np.average(checkpoint_predictions, weights=weights, axis=0)

submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': predictions
})
submission.to_csv('submission.csv', index=False)


NameError: name 'EMBEDDING_FILES' is not defined