In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install tensorflow-gpu==1.15.0



In [3]:
pip install keras==2.2.4



In [4]:
pip install git+https://www.github.com/keras-team/keras-contrib.git


Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-miffmwcr
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-miffmwcr


In [5]:
import numpy as np 
import pandas as pd
from ast import literal_eval
from nltk.tokenize import TweetTokenizer
from copy import deepcopy
import spacy


In [6]:
# Maximum length of comment
max_len = 128 
# Dimension of embedding vector
embedding_dim = 25 
# Max feature
max_feature = 10000

In [7]:
data = pd.read_csv('/content/drive/MyDrive//Group-18 Toxic Span Identification/tsd_train.csv')
dev = pd.read_csv('/content/drive/MyDrive//Group-18 Toxic Span Identification/tsd_trial.csv')
testd = pd.read_csv('/content/drive/MyDrive//Group-18 Toxic Span Identification/tsd_test.csv')
data_new = data
data.spans = data.spans.apply(literal_eval)
text_data = data['text'].values
lbl = [1 if len(s) > 0 else 0 for s in data.spans]

In [8]:
testd.describe()

Unnamed: 0,spans,text
count,2000,2000
unique,1034,2000
top,[],That's right. They are not normal. And I am st...
freq,394,1


In [9]:
data.head()

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7939 entries, 0 to 7938
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   spans   7939 non-null   object
 1   text    7939 non-null   object
dtypes: object(2)
memory usage: 124.2+ KB


In [11]:
data.describe()

Unnamed: 0,spans,text
count,7939,7939
unique,4438,7939
top,[],Another violent and aggressive immigrant killi...
freq,485,1


In [12]:
data['spans'].describe()

count     7939
unique    4438
top         []
freq       485
Name: spans, dtype: object

In [13]:
data['text'].describe()

count                                                  7939
unique                                                 7939
top       Another violent and aggressive immigrant killi...
freq                                                      1
Name: text, dtype: object

#Data Preprocessing and Data Cleaning


Renaming label of a column

In [14]:
col1 = data.rename(columns={'spans': 'Col1'})
col1

Unnamed: 0,Col1,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."
...,...,...
7934,"[8, 9, 10, 11]",Another fool pipes in.
7935,"[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 5...",So if a restaurant owner puts up a sign saying...
7936,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",Any faith that can't stand up to logic and rea...
7937,"[5, 6, 7, 8, 9, 10, 11]",This idiotic. Use the surplus to pay down the ...


Checking null and duplicate values if any


In [15]:
print(data.shape)
print(data.isnull().values.any())
data.dropna(axis=0,inplace=True)
print(data.shape)
#no null values

(7939, 2)
False
(7939, 2)


In [16]:
data.drop_duplicates(subset=['text'],keep='first',inplace=True)
print(data.shape)
#no duplicates

(7939, 2)


In [17]:
tknzr2 = TweetTokenizer()

def custom_tokenizer(text_data):
    return tknzr2.tokenize(text_data)

def retrieve_word_from_span(lst_span, text):
    i = 0
    token = []
    a = 0

    word = []

    while (i < (len(lst_span) - 1)):
        if (lst_span[i] != (lst_span[i+1]-1)):
            token.append(lst_span[a:(i+1)])
            a = i + 1
        elif i == (len(lst_span) - 2):
            token.append(lst_span[a:i+2])

        i = i + 1

    for t in token:
        word.append(text[t[0]:(t[len(t)-1])+1])

    return word

def span_retrived(text_data, spans):
    token_labels = []

    for i in range(0, len(text_data)):
        token_labels.append(retrieve_word_from_span(spans[i], text_data[i]))
    
    return token_labels

def span_convert(text_data, spans):
    MAX_LEN = 0
    token_labels = []

    for i in range(0, len(text_data)):
        token_labels.append(retrieve_word_from_span(spans[i], text_data[i]))

    lst_seq = []
    for i in range(0, len(text_data)):
        # token = tknzr.tokenize(text_data[i])
        token = custom_tokenizer(text_data[i])
        if len(token) > MAX_LEN:
            MAX_LEN = len(token)
        seq = np.zeros(len(token), dtype=int)
        for j in range(0, len(token)):
            for t in token_labels[i]:
                # if token[j] in tknzr.tokenize(t):
                if token[j] in custom_tokenizer(t):
                    seq[j] = 1
        lst_seq.append(seq)     

    return (token_labels, lst_seq)

In [18]:
# convert data
data['token'], data['seq'] = span_convert(text_data, data.spans)
train = deepcopy(data)


In [19]:
data

Unnamed: 0,spans,text,token,seq
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...,[violent and aggressive immigrant],"[0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio...",[fucking],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed.",[Damn],"[1, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...,[knucklehead],"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA...",[killing],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
7934,"[8, 9, 10, 11]",Another fool pipes in.,[fool],"[0, 1, 0, 0, 0]"
7935,"[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 5...",So if a restaurant owner puts up a sign saying...,[No Blacks Allowed],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ..."
7936,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",Any faith that can't stand up to logic and rea...,[Any faith that can't stand up to logic and re...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]"
7937,"[5, 6, 7, 8, 9, 10, 11]",This idiotic. Use the surplus to pay down the ...,[idiotic],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#Word Embbeding

In [20]:
# Read embedding
word_dict = []
embeddings_index = {}
f = open('/content/drive/MyDrive//Group-18 Toxic Span Identification/glove.twitter.27B.25d.txt')
for line in f:
    values = line.split(' ')
    word = values[0] 
    word_dict.append(word)
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('GloVe data loaded')

GloVe data loaded


In [21]:
words = word_dict
num_words = len(words)

# Dictionary word:index pair
# word is key and its value is corresponding index
word_to_index = {w : i + 2 for i, w in enumerate(words)}
word_to_index["UNK"] = 1
word_to_index["PAD"] = 0

# Dictionary lable:index pair
idx2word = {i: w for w, i in word_to_index.items()}

In [22]:
# first create a matrix of zeros, this is our embedding matrix
embedding_matrix = np.zeros((num_words, embedding_dim))

# for each word in out tokenizer lets try to find that work in our w2v model
for word, i in word_to_index.items():
    if i > max_feature:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # we found the word - add that words vector to the matrix
        embedding_matrix[i] = embedding_vector
    else:
        # doesn't exist, assign a random vector
        embedding_matrix[i] = np.random.randn(embedding_dim)

In [23]:
 # mapping for token cases
case2Idx = {'1': 1, '0': 0}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')  # identity matrix used 

char2Idx = {"PADDING": 0, "UNKNOWN": 1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>":
    char2Idx[c] = len(char2Idx)

Encoding and padding

In [24]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.initializers import Constant
from nltk.corpus import stopwords
import re
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using TensorFlow backend.


In [25]:
def encoding(X, y, isTest = True):
    sentences = []
    
    for t in X:
        sentences.append(custom_tokenizer(t))

    X = []
    for s in sentences:
        sent = []
        for w in s:
            try:
                w = w.lower()
                sent.append(word_to_index[w])
            except:
                sent.append(word_to_index["UNK"])
        X.append(sent)
           
    X = pad_sequences(maxlen = max_len, sequences = X, padding = "post", value = word_to_index["PAD"])

    if isTest:
        y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=word_to_index["PAD"])
        y = to_categorical(y, num_classes=2)
    else:
        y = None

    return (X,y)

In [26]:
def decoding(text_data, encoding_text, prediction):
    test = [[idx2word[i] for i in row] for row in encoding_text]

    lst_token = []

    for t in range(0, len(test)):
        yy_pred = []
        for i in range(0, len(test[t])):
            if prediction[t][i] == 1:
                yy_pred.append(test[t][i])
        lst_token.append(yy_pred)

    lis_idx = []
    for i in range(0, len(text_data)):
        idx = []
        for t in lst_token[i]:
            index = text_data[i].find(t)
            idx.append(index)
            for j in range(1, len(t)):
                index = index + 1
                idx.append(index)
        lis_idx.append(idx)

    return lis_idx


In [27]:
y = data['seq']
X = data['text']

In [28]:
X1, y1 = encoding(X, y)


In [29]:
max_len = 128 
a, b = encoding(X, y)

In [30]:
print(X[0],"\n",a[0],"\n",y[0],"\n",b[0])


Another violent and aggressive immigrant killing a innocent and intelligent US Citizen.... Sarcasm 
 [  600 15659    28 18798 56932  2643    13  7866    28 11642   293 19565
     1  8053     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0] 
 [0 1 1 1 1 0 0 0 1 0 0 0 0 0] 
 [[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1.

# Bi-LSTM

Aplying Bi-LSTM

In [31]:
# BiLSTM - CRF 
from keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional, Flatten, Dropout
from keras.models import Model, Input
from keras_contrib.layers import CRF
from keras.utils import plot_model

import warnings
warnings.filterwarnings("ignore")

# from keras.metrics import BinaryAccuracy, Precision, Recall, AUC

input = Input(shape = (max_len,))
model = Embedding(input_dim=num_words+2,
                    output_dim=embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=max_len,
                    trainable=True)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units = max_len, return_sequences=True, recurrent_dropout=0.1))(model)
model = TimeDistributed(Dense(2, activation="sigmoid"))(model)
model = Model(input, model)
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

model.summary()
plot_model(model)
plot_model(model,to_file="bilstm-crf.pdf",show_shapes=True,show_layer_names=True)




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 25)           29837900  
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 25)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128, 256)          157696    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 128, 2)            514       
Total params: 29,996,110
Trainable params: 29,996,110
Non-t

In [41]:
from keras.callbacks import ModelCheckpoint
import warnings
warnings.filterwarnings("ignore")

model.fit(X1, np.array(y1), batch_size=64, epochs=15)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f61f7144d50>

# Evaluation Matric

In [42]:
# Evaluation metric

import sys
import os
import os.path
from scipy.stats import sem
import numpy as np
from ast import literal_eval

def f1(predictions, gold):
    """
    F1 (a.k.a. DICE) operating on two lists of offsets (e.g., character).
    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
    :param predictions: a list of predicted offsets
    :param gold: a list of offsets serving as the ground truth
    :return: a score between 0 and 1
    """
    if len(gold) == 0:
        return 1. if len(predictions) == 0 else 0.
    if len(predictions) == 0:
        return 0.
    predictions_set = set(predictions)
    gold_set = set(gold)
    nom = 2 * len(predictions_set.intersection(gold_set))
    denom = len(predictions_set) + len(gold_set)
    return float(nom)/float(denom)


def evaluate(pred, gold):
    """
    Based on https://github.com/felipebravom/EmoInt/blob/master/codalab/scoring_program/evaluation.py
    :param pred: file with predictions
    :param gold: file with ground truth
    :return:
    """
    # # read the predictions
    # pred_lines = pred.readlines()
    # # read the ground truth
    # gold_lines = gold.readlines()

    pred_lines = pred
    gold_lines = gold

    # only when the same number of lines exists
    if (len(pred_lines) == len(gold_lines)):
        data_dic = {}
        for n, line in enumerate(gold_lines):
            parts = line.split('\t')
            if len(parts) == 2:
                data_dic[int(parts[0])] = [literal_eval(parts[1])]
            else:
                raise ValueError('Format problem for gold line %d.', n)

        for n, line in enumerate(pred_lines):
            parts = line.split('\t')
            if len(parts) == 2:
                if int(parts[0]) in data_dic:
                    try:
                        data_dic[int(parts[0])].append(literal_eval(parts[1]))
                    except ValueError:
                        # Invalid predictions are replaced by a default value
                        data_dic[int(parts[0])].append([])
                else:
                    raise ValueError('Invalid text id for pred line %d.', n)
            else:
                raise ValueError('Format problem for pred line %d.', n)

        # lists storing gold and prediction scores
        scores = []
        for id in data_dic:
            if len(data_dic[id]) == 2:
                gold_spans = data_dic[id][0]
                pred_spans = data_dic[id][1]
                scores.append(f1(pred_spans, gold_spans))
            else:
                sys.exit('Repeated id in test data.')

        return (np.mean(scores), sem(scores))

Evatuation for Bi-LSTM

In [43]:
text_data_test = testd['text'].values
spans_test = testd['spans'].apply(literal_eval)
test_id = testd.index

In [45]:
testd['token'], testd['seq'] = span_convert(text_data_test, spans_test)
testd

Unnamed: 0,spans,text,token,seq
0,"[84, 85, 86, 87, 88, 89, 90, 91, 133, 134, 135...",That's right. They are not normal. And I am st...,"[ABNORMAL, sexist rubbish]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[81, 82, 83, 84, 85, 86]","""Watch people die from taking away their healt...",[stupid],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[],tens years ago i contacted the PDR and suggest...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,[],The parallels between the ANC and the Sicilian...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,[],Intel Community: ‘How can we work for a Presid...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
1995,"[4, 5, 6, 7, 8, 70, 71, 72, 73, 74, 75, 76, 77...",hey loser change your name to something more a...,"[loser, ignorant, loser]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1996,"[23, 24, 25, 26, 27]",And you are a complete moron who obviously doe...,[moron],"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1997,"[157, 158, 159, 160, 161, 162, 163, 164, 165, ...",Such vitriol from the left. Who would have th...,[hypocrites],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1998,[],It is now time for most of you to expand your ...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [44]:
from keras.models import load_model
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy

# # load model
# model = load_model('drive/My Drive/CODE/SemVal/model/model_detection_4.h5', custom_objects={'CRF':CRF,'crf_loss':crf_loss,'crf_accuracy':crf_accuracy})
y_test = testd['seq']
X_test = testd['text']
a, b = encoding(X_test, y_test)
y_pred = model.predict(a)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(b, -1)


In [46]:
test = [[idx2word[i] for i in row] for row in a]


In [47]:
yy_pred = []
yy_test = []

for i in range(0, len(test[0])):
    if y_pred[0][i] == 1:
        yy_pred.append(test[0][i])

for i in range(0, len(test[0])):
    if y_test_true[0][i] == 1:
        yy_test.append(test[0][i])

print(yy_pred)
print(yy_test)

['sexist', 'rubbish']
['abnormal', 'sexist', 'rubbish']


In [48]:
raw_y = decoding(X_test, a, y_pred)


In [50]:
f1(raw_y[0], spans_test[0])

acc = []
for i in range(0, len(spans_test)):
    acc.append(f1(raw_y[i], spans_test[i]))

print(np.mean(acc)*100)

57.77725476087956
