In [1]:
import pandas as pd
import numpy as np

from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [3]:
df_train = pd.read_csv("train.csv", usecols=["Sent_ID","Word","tag"])
df_train["Word"] = df_train.Word.fillna("NAN")

In [4]:
df_train.tail()

Unnamed: 0,Sent_ID,Word,tag
4543828,191282,translational,O
4543829,191282,or,O
4543830,191282,post-translational,O
4543831,191282,level,O
4543832,191282,.,O


In [73]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced',
                                             np.unique(df_train.tag.values),
                                             df_train.tag.values)

In [74]:
class_weight

array([28.57594853, 33.94162334,  0.34065246])

In [6]:
df_test = pd.read_csv("test.csv", usecols=["Sent_ID","Word"])
df_test["Word"] = df_test.Word.fillna("NAN")
df = pd.concat([df_train, df_test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
df_test.shape

(2994463, 2)

In [7]:
df.tail()

Unnamed: 0,Sent_ID,Word,tag
2994458,317122,/a,
2994459,317122,>,
2994460,317122,<,
2994461,317122,/p,
2994462,317122,>,


In [50]:
print(df_train['Sent_ID'].nunique(), df_train.Word.nunique(), df_train.tag.nunique(),  df_train.Word.count())

191282 184506 3 4543833


In [54]:
print(df_test['Sent_ID'].nunique(), df_test.Word.nunique(), df_test.Word.count())

125840 139892 2994463


In [51]:
print(df['Sent_ID'].nunique(), df.Word.nunique(), df.tag.nunique(),  df.Word.count())

317122 257202 3 7538296


In [11]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(),  
                                                           s['tag'].values.tolist())]
        self.grouped = self.data.groupby(["Sent_ID",data.index // 100 * 100]).apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s 
        except:
            return None

In [12]:
getter_train = SentenceGetter(df_train)

sentences_train = getter_train.sentences

In [13]:
len(sentences_train)

234843

In [14]:
getter = SentenceGetter(df)

sentences = getter.sentences

In [15]:
maxlen = max([len(s) for s in sentences])
print ('Maximum sequence length:', maxlen)

Maximum sequence length: 100


In [16]:
len(sentences)

389383

In [17]:
sentences[1]

[('We', 'O'),
 ('have', 'O'),
 ('reviewed', 'O'),
 ('the', 'O'),
 ('distinctive', 'O'),
 ('features', 'O'),
 ('of', 'O'),
 ('excess', 'O'),
 ('weight', 'O'),
 (',', 'O'),
 ('its', 'O'),
 ('causes', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('related', 'O'),
 ('prevention', 'O'),
 ('and', 'O'),
 ('management', 'O'),
 ('efforts', 'O'),
 (',', 'O'),
 ('as', 'O'),
 ('well', 'O'),
 ('as', 'O'),
 ('data', 'O'),
 ('gaps', 'O'),
 ('and', 'O'),
 ('recommendations', 'O'),
 ('for', 'O'),
 ('future', 'O'),
 ('research', 'O'),
 ('in', 'O'),
 ('low-', 'O'),
 ('and', 'O'),
 ('middle-income', 'O'),
 ('countries', 'O'),
 ('(', 'O'),
 ('LMICs', 'O'),
 (')', 'O'),
 ('.', 'O')]

In [18]:
sentences_test = sentences[234843:]

In [52]:
len(sentences_test)

154540

In [19]:
words = list(set(df["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

257203

In [20]:
n_words = len(words); n_words

257203

In [21]:
tags = list(set(df_train["tag"].values))
n_tags = len(tags); n_tags

3

In [22]:
tags = list(set(df_train["tag"].values))
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
#tag2idx.pop(np.nan)
print(tag2idx)

{'B-indications': 0, 'O': 1, 'I-indications': 2}


In [24]:
from keras.preprocessing.sequence import pad_sequences
X_train = [[word2idx[w[0]] for w in s] for s in sentences_train]
X_train = pad_sequences(maxlen=100, sequences=X_train, padding="post", value=n_words - 1)

In [25]:
X_test = [[word2idx[w[0]] for w in s] for s in sentences_test]
X_test = pad_sequences(maxlen=100, sequences=X_test, padding="post", value=n_words - 1)

In [33]:
y_train = [[tag2idx[w[1]] for w in s] for s in sentences_train]
y_train = pad_sequences(maxlen=100, sequences=y_train, padding="post", value=tag2idx["O"])

In [34]:
from keras.utils import to_categorical
y_train = [to_categorical(i, num_classes=n_tags) for i in y_train]

In [29]:
input = Input(shape=(100,))
model = Embedding(input_dim=n_words, output_dim=100, input_length=100)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer

In [30]:
model = Model(input, out)

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [35]:
history = model.fit(X_train, np.array(y_train), batch_size=256, epochs=1, validation_split=0.2, verbose=1)

Train on 187874 samples, validate on 46969 samples
Epoch 1/1


In [37]:
pred1 = model.predict(X_test, batch_size=256, verbose=1)



In [48]:
len(X_test)

154540

In [38]:
np.array(pred1).shape

(154540, 100, 3)

In [39]:
df_pred = pd.DataFrame(np.argmax(pred1.reshape(15454000,-1), axis=1), columns=["pred"])

In [40]:
df_pred.head()

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1


In [45]:
df_pred.shape

(15454000, 1)

In [43]:
df_pred.reset_index().groupby("target").count()

Unnamed: 0_level_0,index
target,Unnamed: 1_level_1
0,26202
1,15407255
2,20543


In [57]:
X_test[0]

array([ 27763, 223283,  33865, 223283, 187355,  76006,  78082, 148651,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202, 257202, 257202, 257202, 257202,
       257202, 257202, 257202, 257202], dtype=int32)

In [55]:
word2idx["ENDPAD"]

257202

In [58]:
df_pred["word"] = X_test.reshape(15454000,-1)

In [59]:
df_pred.head(10)

Unnamed: 0,target,word
0,1,27763
1,1,223283
2,1,33865
3,1,223283
4,1,187355
5,1,76006
6,1,78082
7,1,148651
8,1,257202
9,1,257202


In [61]:
df_pred_last = df_pred[df_pred["word"] != 257202]

In [80]:
df_test2 = pd.read_csv("test.csv", usecols=["id","Sent_ID"])

In [88]:
df_test2.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O


In [87]:
df_test2["tag"] = df_pred_last2.target.values

In [83]:
tag2idx

{'B-indications': 0, 'O': 1, 'I-indications': 2}

In [84]:
idx2tag = {0:'B-indications',1:'O', 2:'I-indications'}

In [85]:
df_pred_last2 = df_pred_last.replace({"target": idx2tag})

In [86]:
df_pred_last2.head()

Unnamed: 0,target,word
0,O,27763
1,O,223283
2,O,33865
3,O,223283
4,O,187355


In [82]:
df_pred_last.head()

Unnamed: 0,target,word
0,1,27763
1,1,223283
2,1,33865
3,1,223283
4,1,187355


In [62]:
df_pred_last.groupby("target").count()

Unnamed: 0_level_0,word
target,Unnamed: 1_level_1
0,26202
1,2947718
2,20543


In [89]:
df_test2.to_csv("submission.csv", index=False)

In [90]:
df_test2.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O
