In [1]:
import pandas as pd
import numpy as np
import os 

from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [2]:
# Expose only GPU index 3 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [3]:
# Load only the sentence id, token and label columns of the training split.
df_train = pd.read_csv("train.csv", usecols=["Sent_ID", "Word", "tag"])
# Replace missing tokens with the literal placeholder "NAN".
df_train["Word"] = df_train["Word"].fillna("NAN")

In [4]:
df_train.tail()

Unnamed: 0,Sent_ID,Word,tag
4543828,191282,translational,O
4543829,191282,or,O
4543830,191282,post-translational,O
4543831,191282,level,O
4543832,191282,.,O


In [5]:
# Import the function directly instead of rebinding the `class_weight` module
# name to its own result; the cell can now be re-run without an AttributeError.
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights, one per label, in the order of np.unique(df_train.tag.values).
# Keyword arguments are used because recent scikit-learn releases do not accept
# `classes` and `y` positionally.
class_weight = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df_train.tag.values),
    y=df_train.tag.values,
)

In [6]:
 np.unique(df_train.tag.values)

array(['B-indications', 'I-indications', 'O'], dtype=object)

In [7]:
# Label -> balanced class weight, built from the computed `class_weight` array
# (which is ordered like np.unique of the labels) instead of hand-copied numbers.
dict_w = {
    label: float(weight)
    for label, weight in zip(np.unique(df_train.tag.values), class_weight)
}

In [8]:
class_weight

array([28.57594853, 33.94162334,  0.34065246])

In [9]:
# Load the unlabeled test split (sentence id and token only).
df_test = pd.read_csv("test.csv", usecols=["Sent_ID", "Word"])
# Replace missing tokens with the literal placeholder "NAN", as for the training split.
df_test["Word"] = df_test.Word.fillna("NAN")
# Stack train and test; test rows get NaN in `tag`. Each frame keeps its own
# row index (no ignore_index). sort=False makes the column order explicit and
# avoids the pandas FutureWarning about the changing default.
df = pd.concat([df_train, df_test], sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
df_test.shape

(2994463, 2)

In [11]:
df.tail()

Unnamed: 0,Sent_ID,Word,tag
2994458,317122,/a,
2994459,317122,>,
2994460,317122,<,
2994461,317122,/p,
2994462,317122,>,


In [12]:
print(df_train['Sent_ID'].nunique(), df_train.Word.nunique(), df_train.tag.nunique(),  df_train.Word.count())

191282 184506 3 4543833


In [13]:
print(df_test['Sent_ID'].nunique(), df_test.Word.nunique(), df_test.Word.count())

125840 139892 2994463


In [14]:
print(df['Sent_ID'].nunique(), df.Word.nunique(), df.tag.nunique(),  df.Word.count())

317122 257202 3 7538296


In [15]:
class SentenceGetter(object):
    """Group a token-level frame into lists of ``(word, tag)`` pairs.

    Rows are grouped by ``Sent_ID`` and additionally by the row index rounded
    down to a multiple of 100, so a single sentence id can produce more than
    one chunk.
    """

    def __init__(self, data):
        """Build the grouped chunks from ``data``.

        Args:
            data: DataFrame with ``Sent_ID``, ``Word`` and ``tag`` columns.
        """
        # Key that the next get_next() call looks up in `self.grouped`.
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every token of the group with its tag, preserving row order.
            return list(zip(s['Word'].values.tolist(), s['tag'].values.tolist()))

        # Second grouping key buckets rows by their index value in steps of 100.
        self.grouped = self.data.groupby(["Sent_ID", data.index // 100 * 100]).apply(agg_func)
        # Flat list of chunks in group-key order.
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return ``self.grouped[self.n_sent]`` and advance the counter.

        Returns:
            The looked-up entry, or ``None`` when the key is not present.
            The counter is only advanced on a successful lookup.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            # Only a failed lookup means "no more data"; other errors
            # (including KeyboardInterrupt) are allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [16]:
getter_train = SentenceGetter(df_train)

sentences_train = getter_train.sentences

In [17]:
len(sentences_train)

234843

In [18]:
getter = SentenceGetter(df)

sentences = getter.sentences

In [19]:
maxlen = max([len(s) for s in sentences])
print ('Maximum sequence length:', maxlen)

Maximum sequence length: 100


In [20]:
len(sentences)

389383

In [21]:
sentences[1]

[('We', 'O'),
 ('have', 'O'),
 ('reviewed', 'O'),
 ('the', 'O'),
 ('distinctive', 'O'),
 ('features', 'O'),
 ('of', 'O'),
 ('excess', 'O'),
 ('weight', 'O'),
 (',', 'O'),
 ('its', 'O'),
 ('causes', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('related', 'O'),
 ('prevention', 'O'),
 ('and', 'O'),
 ('management', 'O'),
 ('efforts', 'O'),
 (',', 'O'),
 ('as', 'O'),
 ('well', 'O'),
 ('as', 'O'),
 ('data', 'O'),
 ('gaps', 'O'),
 ('and', 'O'),
 ('recommendations', 'O'),
 ('for', 'O'),
 ('future', 'O'),
 ('research', 'O'),
 ('in', 'O'),
 ('low-', 'O'),
 ('and', 'O'),
 ('middle-income', 'O'),
 ('countries', 'O'),
 ('(', 'O'),
 ('LMICs', 'O'),
 (')', 'O'),
 ('.', 'O')]

In [22]:
# Everything after the training chunks belongs to the test split. The offset is
# taken from `sentences_train` instead of a hard-coded count.
# NOTE(review): assumes all training Sent_IDs sort before the test ones, so the
# combined grouping lists training chunks first - confirm against the data.
sentences_test = sentences[len(sentences_train):]

In [23]:
len(sentences_test)

154540

In [24]:
# Vocabulary: every distinct token in train + test, followed by the padding token.
words = [*set(df["Word"].values), "ENDPAD"]
n_words = len(words)
n_words

257203

In [25]:
n_words = len(words); n_words

257203

In [26]:
tags = list(set(df_train["tag"].values))
n_tags = len(tags); n_tags

3

In [27]:
# Distinct labels of the training split.
# NOTE: set iteration order for strings can differ between interpreter
# sessions, so the indices below are only meaningful within one kernel session.
tags = [*set(df_train["tag"].values)]
# Token -> integer id and label -> integer id, numbered by list position.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))
print(tag2idx)

{'I-indications': 0, 'O': 1, 'B-indications': 2}


In [28]:
dict_w

{'B-indications': 28.57594853, 'I-indications': 33.94162334, 'O': 0.34065246}

In [29]:
# Class weights listed in the same order as `tags` (and therefore `tag2idx`),
# looked up by label so they stay aligned even when the set order changes.
cweights = [dict_w[tag] for tag in tags]

In [30]:
from keras.preprocessing.sequence import pad_sequences
# Encode each training chunk as vocabulary ids, then right-pad to 100 positions
# with the last vocabulary index (n_words - 1).
X_train = pad_sequences(
    sequences=[[word2idx[word] for word, _tag in sent] for sent in sentences_train],
    maxlen=100,
    padding="post",
    value=n_words - 1,
)

In [31]:
# Encode each test chunk as vocabulary ids, then right-pad to 100 positions
# with the last vocabulary index (n_words - 1).
X_test = pad_sequences(
    sequences=[[word2idx[word] for word, _tag in sent] for sent in sentences_test],
    maxlen=100,
    padding="post",
    value=n_words - 1,
)

In [32]:
# Encode each training chunk's labels as tag ids; padded positions are filled
# with the id of the "O" label.
y_train = pad_sequences(
    sequences=[[tag2idx[tag] for _word, tag in sent] for sent in sentences_train],
    maxlen=100,
    padding="post",
    value=tag2idx["O"],
)

In [33]:
from keras.utils import to_categorical
y_train = [to_categorical(i, num_classes=n_tags) for i in y_train]

In [34]:
# Token ids for one padded chunk of 100 positions.
input = Input(shape=(100,))
# 100-dimensional embedding per token, followed by light dropout.
embedded = Embedding(input_dim=n_words, output_dim=100, input_length=100)(input)
embedded = Dropout(0.1)(embedded)
# Bidirectional LSTM that emits one vector per position.
model = Bidirectional(
    LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)
)(embedded)
# Per-position softmax over the tag set.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)

In [38]:
from keras import backend as K
import tensorflow as tf 

def f1(y_true, y_pred):
    """Batch-wise F1 score built from rounded, clipped tensors.

    Both inputs are converted to float32 tensors. Precision and recall are
    each computed over the whole batch as a single ratio of sums, and every
    denominator is stabilised with ``K.epsilon()``.

    Args:
        y_true: target values.
        y_pred: predicted values.

    Returns:
        Scalar tensor ``2 * p * r / (p + r + eps)``.
    """
    truth = tf.convert_to_tensor(y_true, np.float32)
    guess = tf.convert_to_tensor(y_pred, np.float32)
    eps = K.epsilon()

    # Count of entries where the clipped product of target and prediction rounds to 1.
    hits = K.sum(K.round(K.clip(truth * guess, 0, 1)))
    # Count of target entries that round to 1.
    actual_total = K.sum(K.round(K.clip(truth, 0, 1)))
    # Count of predicted entries that round to 1.
    predicted_total = K.sum(K.round(K.clip(guess, 0, 1)))

    precision = hits / (predicted_total + eps)
    recall = hits / (actual_total + eps)
    return 2 * ((precision * recall) / (precision + recall + eps))

In [39]:
# Checkpoint the best weights (highest validation F1) so they can be restored
# for prediction. ModelCheckpoint is the one imported from `keras.callbacks` in
# the first cell; re-importing it from the private `tensorflow.python.keras`
# path would mix two Keras implementations with the `keras` Model used here.
# The target directory is created up front so saving cannot fail on a missing path.
os.makedirs("./working", exist_ok=True)

cb_checkpointer = ModelCheckpoint(
    filepath='./working/best_w3.hdf5',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
    monitor='val_f1',
    mode='max',
)

In [40]:
model = Model(input, out)

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=[f1])

In [41]:
# Train with the last 20% of the samples held out for validation; the
# checkpoint callback stores the best weights seen so far.
# NOTE(review): `cweights` is a plain list. Keras documents `class_weight` as a
# dict mapping class index -> weight, and for per-timestep (3-D) targets
# weighting is normally done with `sample_weight` together with
# `sample_weight_mode="temporal"` at compile time. Confirm that this list is
# actually applied to the loss - TODO.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=256,
    epochs=10,
    validation_split=0.2,
    verbose=1,
    class_weight=cweights,
    callbacks=[cb_checkpointer],
)

Train on 187874 samples, validate on 46969 samples
Epoch 1/10

Epoch 00001: val_f1 improved from -inf to 0.99747, saving model to ./working/best_w3.hdf5
Epoch 2/10

Epoch 00002: val_f1 improved from 0.99747 to 0.99798, saving model to ./working/best_w3.hdf5
Epoch 3/10

Epoch 00003: val_f1 improved from 0.99798 to 0.99810, saving model to ./working/best_w3.hdf5
Epoch 4/10

Epoch 00004: val_f1 did not improve from 0.99810
Epoch 5/10

Epoch 00005: val_f1 did not improve from 0.99810
Epoch 6/10

Epoch 00006: val_f1 did not improve from 0.99810
Epoch 7/10
  5120/187874 [..............................] - ETA: 3:49 - loss: 0.0018 - f1: 0.9994

KeyboardInterrupt: 

In [42]:
model.load_weights("./working/best_w3.hdf5")

In [43]:
pred1 = model.predict(X_test, batch_size=256, verbose=1)



In [44]:
len(X_test)

154540

In [45]:
np.array(pred1).shape

(154540, 100, 3)

In [46]:
# Flatten the predictions to one row per token position and keep the index of
# the most probable class. The row count is inferred from `pred1` rather than
# hard-coded, so the cell works for any number of test chunks.
df_pred = pd.DataFrame(
    np.argmax(pred1.reshape(-1, pred1.shape[-1]), axis=1),
    columns=["target"],
)

In [47]:
df_pred.head()

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1


In [48]:
df_pred.shape

(15454000, 1)

In [49]:
df_pred.reset_index().groupby("target").count()

Unnamed: 0_level_0,index
target,Unnamed: 1_level_1
0,25217
1,15398446
2,30337


In [50]:
X_test[0]

array([173215,  78175,  15339,  78175, 103683,  55725,  94024, 126977,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202], dtype=int32)

In [51]:
word2idx["ENDPAD"]

257202

In [52]:
# Attach the token id of every position as a flat 1-D column, in the same
# row-major order as the flattened predictions. The length is inferred from
# `X_test` instead of being hard-coded.
df_pred["word"] = X_test.reshape(-1)

In [53]:
df_pred.head(10)

Unnamed: 0,target,word
0,1,173215
1,1,78175
2,1,15339
3,1,78175
4,1,103683
5,1,55725
6,1,94024
7,1,126977
8,1,257202
9,1,257202


In [54]:
# Drop padded positions: keep rows whose token id is not the padding token's id.
# The id is looked up from the vocabulary instead of being hard-coded.
df_pred_last = df_pred[df_pred["word"] != word2idx["ENDPAD"]]

In [55]:
df_pred_last.shape

(2994463, 2)

In [56]:
df_test2 = pd.read_csv("test.csv", usecols=["id","Sent_ID"])

In [57]:
df_test2.head()

Unnamed: 0,id,Sent_ID
0,4543834,191283
1,4543835,191283
2,4543836,191283
3,4543837,191283
4,4543838,191283


In [58]:
tag2idx

{'I-indications': 0, 'O': 1, 'B-indications': 2}

In [59]:
# Inverse of `tag2idx`, derived from it so the two mappings cannot drift apart
# when the label order changes between sessions.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [60]:
df_pred_last2 = df_pred_last.replace({"target": idx2tag})

In [61]:
df_pred_last2.head()

Unnamed: 0,target,word
0,O,173215
1,O,78175
2,O,15339
3,O,78175
4,O,103683


In [62]:
df_test2["tag"] = df_pred_last2.target.values

In [63]:
df_test2.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O


In [64]:
df_test2.to_csv("submission3.csv", index=False)