In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import os

In [2]:
from keras import backend as K
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, LSTM, Bidirectional, Permute, Input, Lambda, RepeatVector, Multiply, Flatten
from keras.utils import to_categorical
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# hypers
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 60
LSTM_SIZE = 300
DROP_RATE = 0.3
L_RATE = 1e-3
BATCH_SIZE = 64
NUM_EPOCH = 4



In [4]:
df_train = pd.read_csv( "./olid-training-v1.0.tsv", sep="\t" )

In [5]:
df_train.shape

(13240, 5)

In [6]:
df_train.loc[0, :]

id                                                       86426
tweet        @USER She should ask a few native Americans wh...
subtask_a                                                  OFF
subtask_b                                                  UNT
subtask_c                                                  NaN
Name: 0, dtype: object

In [7]:
df_test_a = pd.read_csv( "./testset_a.tsv", sep="\t" )
df_test_b = pd.read_csv( "./testset_b.tsv", sep="\t" )
df_test_c = pd.read_csv( "./testset_c.tsv", sep="\t" )

In [8]:
df_test_a.loc[0 ,:]

id                                                   15923
tweet    #WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...
label                                                  OFF
Name: 0, dtype: object

In [9]:
[ a.shape for a in [ df_test_a, df_test_b, df_test_c ] ]

[(860, 3), (240, 3), (213, 3)]

In [10]:
GLOVE_DIR = "/nfs/nas-7.1/cflin/glove.6B/"

In [11]:
texts = df_train["tweet"].tolist() + df_test_a['tweet'].tolist()

In [12]:
len(texts)

14100

In [13]:
ls_len = [ len( w.split() ) for w in texts ]

In [14]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
# data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)



In [None]:
# data.shape

In [15]:
def sentence_to_padded_seq(sentence):
        return pad_sequences( tokenizer.texts_to_sequences([sentence]), maxlen=MAX_SEQUENCE_LENGTH )[0]

In [16]:
df_train['seq'] = df_train['tweet'].map( lambda x: sentence_to_padded_seq(x) )
df_test_a['seq'] = df_test_a['tweet'].map( lambda x: sentence_to_padded_seq(x) )
df_test_b['seq'] = df_test_b['tweet'].map( lambda x: sentence_to_padded_seq(x) )
df_test_c['seq'] = df_test_c['tweet'].map( lambda x: sentence_to_padded_seq(x) )

In [17]:
# df_train['seq'][0], df_train['tweet'][0]
# df_train['seq'].tolist()

idx_task_b = df_train.loc[ df_train['subtask_b'].notna(), : ].index
idx_task_c = df_train.loc[ df_train['subtask_c'].notna(), : ].index

In [18]:
y_train = df_train['subtask_a']
y_train = [ 0 if w == "NOT" else 1 for w in y_train]
# print( Counter(y_train) )
y_train = to_categorical(y_train)

In [19]:
y_train_b = df_train['subtask_b'][idx_task_b]
y_train_b = [ 0 if w == "UNT" else 1 for w in y_train_b]
# print( Counter(y_train) )
y_train_b = to_categorical(y_train_b)

In [20]:
y_train_c = df_train['subtask_c'][idx_task_c]
# y_train_b = [ 0 if w == "UNT" else 1 for w in y_train_b]

ls_tmp = []
for w in y_train_c:
    if w == "IND":
        ls_tmp.append( 0 )
    elif w == "GRP":
        ls_tmp.append( 1 )
    else:
        ls_tmp.append( 2 )
y_train_c = ls_tmp
print( Counter( y_train_c ) )

y_train_c = to_categorical(y_train_c)

Counter({0: 2407, 1: 1074, 2: 395})


In [21]:
y_train_b.shape

(4400, 2)

In [22]:
Counter( df_train['subtask_b'] )

Counter({'UNT': 524, 'TIN': 3876, nan: 8840})

In [23]:
Counter(df_test_c['label'])

Counter({'OTH': 35, 'GRP': 78, 'IND': 100})

In [24]:
y_test_a = df_test_a['label']
y_test_a = [ 0 if w == "NOT" else 1 for w in y_test_a]
y_test_a = to_categorical(y_test_a)

In [25]:
y_test_b = df_test_b['label']
y_test_b = [ 0 if w == "UNT" else 1 for w in y_test_b]
print( Counter( y_test_b ) )
y_test_b = to_categorical(y_test_b)

Counter({1: 213, 0: 27})


In [26]:
y_test_c = df_test_c['label']
ls_tmp = []
for w in y_test_c:
    if w == "IND":
        ls_tmp.append( 0 )
    elif w == "GRP":
        ls_tmp.append( 1 )
    else:
        ls_tmp.append( 2 )
y_test_c = ls_tmp
print( Counter( y_test_c ) )
y_test_c = to_categorical(y_test_c)

Counter({0: 100, 1: 78, 2: 35})


In [27]:
y_train

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [28]:
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt' ), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [29]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
        

In [30]:
inputs = Input( shape=(MAX_SEQUENCE_LENGTH,) )

m = Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False,
                            name="embedding_layer")(inputs)

m = Bidirectional(LSTM(LSTM_SIZE, return_sequences=False), name="biLSTM")(m)
# m = Dropout(DROP_RATE)(m)
# output = Dense(2, activation='softmax')(m) # subtask a&b
output = Dense(3, activation='softmax')(m) # subtask c
model = Model(input=inputs, output=output)

  


In [None]:
model.summary()

In [31]:
adam = optimizers.Adam(lr=L_RATE)

In [32]:
model.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

In [33]:
df_train.loc[ idx_task_b, :].shape

(4400, 6)

In [34]:
Counter( df_train['subtask_b'] )

Counter({'UNT': 524, 'TIN': 3876, nan: 8840})

In [35]:
X_train = np.array(df_train['seq'].tolist())
X_train_b = X_train[ idx_task_b ]
X_train_c = X_train[ idx_task_c ]

In [36]:
len(X_train), len(X_train_b), len(X_train_c)

(13240, 4400, 3876)

In [37]:
model.fit(X_train_c, y_train_c,
          batch_size=BATCH_SIZE,
          epochs=NUM_EPOCH,
#           validation_data=[x_val, y_val]
         )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7feb381c9908>

In [38]:
pred = model.predict( np.array( df_test_c['seq'].tolist() ) )

In [39]:
pred.shape

(213, 3)

In [40]:
pred_max = np.argmax( pred, axis=1 )

In [41]:
pred_max.shape

(213,)

In [42]:
from sklearn.metrics import f1_score



f1_score(y_true= np.argmax(y_test_c, axis=1), y_pred=pred_max, average='macro')


0.48148148148148145