In [285]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, Dense, SimpleRNN

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from pprint import pprint

In [286]:
## Getting data.
## Reads the training table from a parquet file hosted on GitHub. The
## commented-out lines point at other parquet files of the same project
## (raw training, preprocessed, dev and unlabeled validation sets).
# df_train = pd.read_parquet('https://github.com/BhardwajAnshul/NER-Project/blob/main/data/WIESP2022-NER-TRAINING.parquet?raw=true')
# df_train = pd.read_parquet('https://github.com/BhardwajAnshul/NER-Project/blob/main/WIESP_TRAINING_PREPROCESSED.parquet?raw=true')
df_train = pd.read_parquet('https://github.com/BhardwajAnshul/NER-Project/blob/main/WIESP_TRAINING_SPLITTED.parquet?raw=true')
# df_val = pd.read_parquet('https://github.com/BhardwajAnshul/NER-Project/blob/main/data/WIESP2022-NER-DEV.parquet?raw=true')
# df_test = pd.read_parquet('https://github.com/BhardwajAnshul/NER-Project/blob/main/data/WIESP2022-NER-VALIDATION-NO-LABELS.parquet?raw=true')
print(df_train.head())

In [287]:
df_train.columns

In [288]:
print('Training Data has shape: ', df_train.shape)

In [289]:
## Drop every row whose `tokens` value is exactly one of these single
## punctuation characters, then report the reduced shape.
## NOTE(review): presumably the table holds one token per row -- confirm.
df_train = df_train[~df_train['tokens'].isin(list('./|:(),;[]{}-'))]
print('Training Data has shape: ', df_train.shape)

In [290]:
## Creating an index mapping for easier understanding.
def index_mapper(df, train=False):
  """Return a copy of ``df`` indexed by a per-document integer id.

  Args:
    df: DataFrame holding a ``bibcode`` column (``train=False``) or a
      ``sentence_id`` column (``train=True``).
    train: When False, every distinct ``bibcode`` is numbered 1..N in order
      of first appearance and that number becomes the index (named
      ``index``). When True, the existing ``sentence_id`` column becomes
      the index.

  Returns:
    A new DataFrame. The frame passed in is left untouched, so the function
    is safe to call on a filtered slice and safe to re-run.
  """
  if train:
      return df.set_index('sentence_id')

  # factorize numbers the values 0..N-1 in order of first appearance, which is
  # the same order `unique()` yields; +1 keeps the ids 1-based.
  codes, _ = pd.factorize(df['bibcode'])
  indexed = df.copy()
  indexed['index'] = codes + 1
  return indexed.set_index('index')

df_train = index_mapper(df_train, train=False)
# df_val = index_mapper(df_val)
# df_test = index_mapper(df_test)

In [291]:
## Choose only necessary columns.
df_train_sel = df_train[['tokens', 'ner_tags']]
# df_val_sel = df_val[['tokens', 'ner_tags']]
# df_test_sel = df_test[['tokens']]
df_train_sel.head()

In [292]:
# Distinct NER tag values in order of first appearance.
classes = list(df_train_sel.ner_tags.unique())
# Remove the outside tag by value instead of by position: `unique()` only puts
# it first when the very first row happens to carry it.
# NOTE(review): assumes the outside tag is spelled 'O' -- confirm against the data.
classes_without_o = [tag for tag in classes if tag != 'O']

In [293]:
df_train_sel.isna().sum()

In [294]:
# Plot length of the sentences.
# `index` holds every distinct index label of df_train_sel and `length` the
# number of rows carrying that label; the histogram shows the distribution of
# those row counts.
index, length = np.unique(df_train_sel.index, return_counts=True)
fig, ax = plt.subplots(figsize=[25,6])
N, bins, patches = ax.hist(length, bins=100)
plt.show()

print('The number of sentences in the dataframe are: ', len(length))   ## Number of sentences

In [295]:
## Finding suitable sentences.
## Boolean masks over `length`; the commented-out lines are earlier threshold
## experiments.
# b1 = length>10
# b2 = length<200
# b3 = length>420
# b4 = length<600

b1 = length>10
b2 = length<620

# b5 = np.logical_and(b1, b2)
# b6 = np.logical_and(b3, b4)
# b7 = np.logical_or(np.logical_and(b1, b2), np.logical_and(b3, b4))

## Keep index labels with more than 10 and fewer than 620 rows.
b7 = np.logical_and(b1, b2)

In [296]:
## Choosing necessary sentences.
## `index` and `length` are overwritten with their filtered versions, so this
## cell must run exactly once after the previous one.
index = index[b7]
length = length[b7]
df_train_sel = df_train_sel.loc[index]
print('The number of sentences in the dataframe are: ', len(length))   ## Number of sentences

In [297]:
# ## Splitting Data into Training and Validation.
# index = np.unique(df_train.index)

# train_index, val_index = train_test_split(index, train_size=0.8, random_state=1)

# df_train = df_train.loc[train_index]
# df_val = df_train.loc[val_index]

In [298]:
## Snapshot of the selected rows with the index moved back into a column,
## taken before the labels are integer-encoded below.
df_train_bert = df_train_sel.reset_index()
df_train_bert.head()

In [299]:
## Encoding the NER labels.
## `le` is fitted here on the original tag values of the training frame.
le = LabelEncoder()
df_train_sel['ner_tags'] = le.fit_transform(df_train_sel['ner_tags'])
# df_val_sel['ner_tags'] = le.transform(df_val_sel['ner_tags'])
df_train_sel.head()

In [300]:
## Shift the encoded labels up by one so that 0 stays free for padding
## (LabelEncoder numbers classes from 0).
## NOTE(review): this cell is not idempotent -- running it twice shifts the
## labels again.
df_train_sel.ner_tags += 1

In [301]:
df_train_sel.head()

In [302]:
# ## Remove characters
# df_train.tokens = df_train.tokens.str.replace("[^a-zA-Z0-9 ]",'')
# df_val.tokens = df_val.tokens.str.replace("[^a-zA-Z0-9 ]",'')

In [303]:
# Tokenize the words
# The filter set is written as a raw string: it intentionally contains a
# literal backslash followed by '?', and in a normal string '\?' is an invalid
# escape sequence that newer Python versions warn about.
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=r'-"%#@!&_|.,~`%\?$',
                                                  lower=False,
                                                  split=' ',
                                                  num_words=2000,
                                                  oov_token='<UNK>')       # Initialize

# tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True, split=' ') # Initialize
tokenizer.fit_on_texts(df_train_sel.tokens.values) # Fit on training data

In [304]:
# Transform to numeric
tokens_train  = tokenizer.texts_to_sequences(df_train_sel.tokens.values)
# tokens_val    = tokenizer.texts_to_sequences(df_val_sel.tokens.values)
# tokens_test   = tokenizer.texts_to_sequences(df_test_sel.tokens.values)

In [305]:
# ' '.join(df_train_sel.loc[3,'tokens'])

In [306]:
# Remove empty elements from training data
# `b` marks the rows whose token produced at least one id. dtype=bool keeps the
# mask boolean even when there are no rows at all.
b = np.array([len(token) > 0 for token in tokens_train], dtype=bool)
df_train_sel = df_train_sel.iloc[b,:]
# Filter as a plain list of lists: the id sequences can differ in length, and
# np.array() on ragged input raises on current NumPy (and silently becomes a
# 2-D array when all sequences happen to have the same length).
tokens_train = [token for token in tokens_train if token]

# # Remove empty elements from validation data
# b = np.array([token!=[] for token in tokens_val])
# df_val_sel = df_val_sel.iloc[b,:]
# tokens_val = np.array(tokens_val)[b]

# # Remove empty elements from test data
# b = np.array([token!=[] for token in tokens_test])
# df_test_sel = df_test_sel.iloc[b,:]
# tokens_test = np.array(tokens_test)[b]

In [307]:
df_train_sel.tokens = tokens_train
# df_val_sel.tokens = tokens_val
# df_test_sel.tokens = tokens_test
print(df_train_sel.head())

In [308]:
# df_train_sel.tokens = df_train_sel.tokens[0]
# df_val_sel.tokens = df_val_sel.tokens[0]
# df_test_sel.tokens = df_test_sel.tokens[0]

In [309]:
# df_train_sel.tokens = df_train_sel.tokens.apply(lambda x: x[0])
# df_val_sel.tokens = df_val_sel.tokens.apply(lambda x: x[0])
# df_test_sel.tokens = df_test_sel.tokens.apply(lambda x: x[0])

In [310]:
## Each entry of `tokens` is a list of ids at this point; keep only the first
## id so every row carries exactly one id next to its one tag.
df_train_sel.tokens = df_train_sel.tokens.apply(lambda x: x[0])
# df_val_sel.tokens = df_val_sel.tokens.apply(lambda x: x[0])
# df_test_sel.tokens = df_test_sel.tokens.apply(lambda x: x[0])
df_train_sel.head()

In [311]:
## Cast both columns to strings so they can be joined per sentence below.
df_train_sel = df_train_sel.astype(str)
# df_val_sel = df_val_sel.astype(str)
# df_test_sel = df_test_sel.astype(str)

In [312]:
## Collapse all rows sharing an index label into one row per column, joined
## with single spaces.
df_train_concat = df_train_sel.groupby(df_train_sel.index).agg(lambda x: ' '.join(x))
# df_val_concat = df_val_sel.groupby(df_val_sel.index).agg(lambda x: ' '.join(x))
# df_test_concat = df_test_sel.groupby(df_test_sel.index).agg(lambda x: ' '.join(x))

In [313]:
df_train_concat

In [314]:
## Split the joined strings back into lists; the elements are still strings
## and are converted to int in a later cell.
df_train_concat.tokens = df_train_concat.tokens.apply(lambda x: x.split(' '))
df_train_concat.ner_tags = df_train_concat.ner_tags.apply(lambda x: x.split(' '))
# df_val_concat.tokens = df_val_concat.tokens.apply(lambda x: x.split(' '))
# df_val_concat.ner_tags = df_val_concat.ner_tags.apply(lambda x: x.split(' '))
# df_test_concat.tokens = df_test_concat.tokens.apply(lambda x: x.split(' '))

In [315]:
print('Training Set Shape after Concatenating Sentences: ', df_train_concat.shape)
# print('Validation Set Shape after Concatenating Sentences: ', df_val_concat.shape)
# print('Test Set Shape after Concatenating Sentences: ', df_test_concat.shape)
print(df_train_concat.head())

In [316]:
# df_train_concat['length'] = df_train_concat.tokens.apply(lambda x: len(x))
# df_val_concat['length'] = df_val_concat.tokens.apply(lambda x: len(x))

In [317]:
# df_train_concat.head()

In [318]:
# ## Sorting training and validation dataset
# df_train_concat = df_train_concat.sort_values('length')
# df_val_concat = df_val_concat.sort_values('length')
# df_train_concat.head()

In [319]:
# df_train_concat.drop('length', axis=1, inplace=True)
# df_val_concat.drop('length', axis=1, inplace=True)
# df_train_concat

In [320]:
## One list of token ids and one list of tag ids per sentence (still strings).
train_sentences = df_train_concat.tokens
train_tags = df_train_concat.ner_tags

In [321]:
## Hold out 10% of the sentences with a fixed seed.
## NOTE(review): later cells use this held-out split both as validation data
## during fitting and as the test set for the final report.
train_sentences, test_sentences, train_tags, test_tags = train_test_split(train_sentences, train_tags, test_size=0.1, random_state=1)
print(train_sentences.shape)
print(test_sentences.shape)

In [322]:
train_sentences.iloc[0][:10]

In [323]:
## Convert every id from str back to int; after this cell the four names are
## plain Python lists of lists, no longer pandas Series.
train_sentences = [list(map(int, sentence)) for sentence in train_sentences]
train_tags = [list(map(int, sentence)) for sentence in train_tags]
test_sentences = [list(map(int, sentence)) for sentence in test_sentences]
test_tags = [list(map(int, sentence)) for sentence in test_tags]

In [324]:
train_sentences[0][:10]

In [331]:
# Your code here

# Helper Code
batch_size = 4
train_shuffle_buffer_size = len(train_sentences)
validation_shuffle_buffer_size = len(test_sentences)

# Fill the required cells to complete the function
def transform_pad(input, output):
    """Turn a ragged (input, output) pair into dense tensors padded with 0.

    Both arguments must provide ``to_tensor``; each is converted with
    ``default_value=0`` and an unconstrained two-dimensional shape, and the
    converted pair is returned in the same order.
    """
    pad_value = 0
    dense_shape = [None, None]

    padded_input = input.to_tensor(default_value=pad_value, shape=dense_shape)
    padded_output = output.to_tensor(default_value=pad_value, shape=dense_shape)

    return padded_input, padded_output

# Use tensorflow ragged constants to get the ragged version of data
# train_sentences = tf.keras.preprocessing.sequence.pad_sequences(train_sentences, padding="pre")
# train_tags = tf.keras.preprocessing.sequence.pad_sequences(train_tags, padding="pre")
# test_sentences = tf.keras.preprocessing.sequence.pad_sequences(test_sentences, padding="pre")
# test_tags = tf.keras.preprocessing.sequence.pad_sequences(test_tags, padding="pre")

# Ragged constants keep every sentence at its own length until batching.
train_processed_x = tf.ragged.constant(train_sentences)
train_processed_y = tf.ragged.constant(train_tags)
validate_processed_x = tf.ragged.constant(test_sentences)
validate_processed_y = tf.ragged.constant(test_tags)

# Training pipeline: shuffle whole sentences, batch, pad each batch to its
# longest sentence, prefetch.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_processed_x, train_processed_y))
    .shuffle(buffer_size=train_shuffle_buffer_size)
    .batch(batch_size)
    .map(transform_pad, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: identical except that the order is left untouched.
validation_data = (
    tf.data.Dataset.from_tensor_slices((validate_processed_x, validate_processed_y))
    .batch(batch_size)
    .map(transform_pad, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

print("train_data", train_data)
print("validation_data", validation_data)

In [332]:
# # Your code here

# # Helper Code
# batch_size = 4
# train_shuffle_buffer_size = len(train_sentences)
# validation_shuffle_buffer_size = len(test_sentences)

# # Pad the data.
# def transform_pad(input):
#     input  = input.to_tensor(default_value=0, shape=[None, None])
#     return input

# # Use tensorflow ragged constants to get the ragged version of data
# train_processed_x = tf.ragged.constant(train_sentences)
# validate_processed_x = tf.ragged.constant(test_sentences)
# train_processed_y = tf.ragged.constant(train_tags)
# validate_processed_y = tf.ragged.constant(test_tags)

# # Create TF Dataset
# x_train = tf.data.Dataset.from_tensor_slices(train_processed_x)
# y_train = tf.data.Dataset.from_tensor_slices(train_processed_y)
# x_val = tf.data.Dataset.from_tensor_slices(train_processed_x)
# y_val = tf.data.Dataset.from_tensor_slices(train_processed_y)

# #############
# # Train data
# #############
# # Apply all data processing logic
# train_data = train_data.shuffle(buffer_size=train_shuffle_buffer_size)
# train_data = train_data.batch(batch_size)
# train_data = train_data.map(transform_pad, num_parallel_calls=tf.data.AUTOTUNE)
# train_data = train_data.prefetch(tf.data.AUTOTUNE)

# ##################
# # Validation data
# ##################
# # Apply all data processing logic
# #validation_data = validation_data.shuffle(buffer_size=validation_shuffle_buffer_size)
# validation_data = validation_data.batch(batch_size)
# validation_data = validation_data.map(transform_pad, num_parallel_calls=tf.data.AUTOTUNE)
# validation_data = validation_data.prefetch(tf.data.AUTOTUNE)

# print("train_data", train_data)
# print("validation_data", validation_data)

In [333]:
# View some data from tf dataset
for input_f, output_f in train_data.take(2):
  print(input_f.shape)
  print(input_f[0])
#   print(input_b[0])
  print("************************")
  print(output_f.shape)
  print(output_f[0])
#   print(output_b[0])

In [334]:
# Your code here

hidden_size_LSTM = 150
hidden_size_Dense = 150

def build_model(vocab_size=2000, embedding_dim=300, num_outputs=2000):
  """Build the forward two-layer LSTM sequence model.

  Data flow: token ids -> Embedding (id 0 is masked) -> LSTM ->
  Add(LSTM output, linear projection of the embedding) -> LSTM ->
  per-timestep softmax.

  The LSTM and projection widths are read from the module-level
  ``hidden_size_LSTM`` and ``hidden_size_Dense``; they must be equal because
  the two tensors are added element-wise.

  Args:
    vocab_size: Size of the embedding table (``input_dim``). Defaults to the
      previously hard-coded 2000.
    embedding_dim: Width of the embedding vectors. Defaults to 300.
    num_outputs: Number of units of the softmax head. Defaults to the
      previously hard-coded 2000.
      NOTE(review): the targets fed in this notebook are NER tag ids, so a
      head of (number of tags + 1) units is presumably sufficient -- confirm.

  Returns:
    An uncompiled ``tf.keras.Model`` named ``'Model'`` with one input and one
    output.
  """

  ## ---------------------------------------------------------------------------
  ## FORWARD LAYER -------------------------------------------------------------
  ## ---------------------------------------------------------------------------

  ## Define input layer: variable-length sequences of token ids.
  inputs_f = tf.keras.Input(shape=[None])

  ## Embedding layer; mask_zero=True marks id 0 (padding) as masked.
  embedding_layer_f = tf.keras.layers.Embedding(input_dim=vocab_size,
                                                output_dim=embedding_dim,
                                                trainable=True,
                                                mask_zero=True)

  ## First forward LSTM.
  RNN1_layer_f = tf.keras.layers.LSTM(hidden_size_LSTM, return_sequences=True)

  ## Bias-free linear projection of the embedding, used as a skip connection
  ## (stands in for a highway layer).
  dense_layer_f = tf.keras.layers.Dense(units=hidden_size_Dense, activation='linear', use_bias=False)

  ## Additive layer joining the LSTM output and the projected embedding.
  additive_layer_f = tf.keras.layers.Add()

  ## Second forward LSTM.
  RNN2_layer_f = tf.keras.layers.LSTM(hidden_size_LSTM, return_sequences=True)

  ## Pass Inputs ---------------------------------------------------------------

  embedding_f = embedding_layer_f(inputs_f)   ## Get embeddings.
  r_f = RNN1_layer_f(embedding_f)             ## Get LSTM outputs.
  z_f = dense_layer_f(embedding_f)            ## Get Dense layer outputs.
  ## Use the Add layer rather than the bare `r_f + z_f`: the layer is
  ## mask-aware, while the operator form is not guaranteed to carry the padding
  ## mask forward to the second LSTM and the loss.
  h_int_f = additive_layer_f([r_f, z_f])      ## Get addition of LSTM and Dense Layer outputs.
  h_f = RNN2_layer_f(h_int_f)                 ## Get LSTM2 outputs.

  ## NOTE: a mirrored backward branch (go_backwards=True LSTMs whose outputs are
  ## reversed and concatenated with r_f / h_f for a Toy_ELMo model) was sketched
  ## here previously but was disabled; only the forward branch is built.

  ## ---------------------------------------------------------------------------
  ## For Outputs ---------------------------------------------------------------
  ## ---------------------------------------------------------------------------

  ## Create Softmax Layer, applied independently at every time step.
  softmaxLayer = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=num_outputs, activation='softmax', name='softmaxLayer'))

  output_f = softmaxLayer(h_f)

  ## ---------------------------------------------------------------------------
  ## Setup the Outputs ---------------------------------------------------------
  ## ---------------------------------------------------------------------------

  ## Set up the model with appropriate inputs and the output defined above
  model = tf.keras.Model(inputs=inputs_f, outputs=output_f, name='Model')

  return model

In [335]:
############################
# Training Params
############################

import time

learning_rate = 5e-4
epochs = 10

# Free up memory
tf.keras.backend.clear_session()

# Build the model
model = build_model()

# Print the model architecture (summary() prints the table itself and returns
# None, so it is not wrapped in print()).
model.summary()

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Loss
loss = tf.keras.losses.sparse_categorical_crossentropy

# Callbacks: stop after 2 epochs without val_loss improvement and write a
# checkpoint file after every epoch.
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
    tf.keras.callbacks.ModelCheckpoint(filepath='model_2.{epoch:02d}-{loss:.2f}.h5'),
]

# Compile
# build_model() returns a single-output model, so exactly one loss is passed.
# The former two-element `loss` / `loss_weights` lists were left over from a
# two-output (forward + backward) variant; depending on the Keras version a
# list of the wrong length is either partly ignored or rejected.
model.compile(loss=loss,
              optimizer=optimizer,
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])

dot_img_file = '/tmp/model.png'
tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=False)

In [336]:
# Train model
start_time = time.time()

training_results = model.fit(
        train_data,
        epochs=epochs, 
        verbose=1,
        validation_data=validation_data,
        callbacks=my_callbacks)

execution_time = (time.time() - start_time)/60.0
print("Training execution time (mins)",execution_time)

In [None]:
# Plot the trace plot of the loss of the model
plt.plot(training_results.history['loss'], label='Train')
plt.plot(training_results.history['val_loss'], label='Validation')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(loc=0)
plt.show()

In [197]:
# Plot the trace plot of the accuracy of the model (train vs. validation,
# one point per epoch)
plt.plot(training_results.history['sparse_categorical_accuracy'], label='Train')
plt.plot(training_results.history['val_sparse_categorical_accuracy'], label='Validation')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(loc=0)
plt.show()

In [106]:
# # Plot the trace plot of the accuracy of the predictions made by the model
# average_accuracy = training_results.history['sparse_categorical_accuracy']
# average_val_accuracy = np.add(training_results.history['val_softmaxLayer_sparse_categorical_accuracy'], training_results.history['val_softmaxLayer_1_sparse_categorical_accuracy'])/2
# plt.plot(average_accuracy, label='Train')
# plt.plot(average_val_accuracy, label='Validation')
# plt.ylabel('Accuracy')
# plt.xlabel('Epochs')
# plt.legend(loc=0)
# plt.show()

In [107]:
# Your code here

# Helper Code
# The whole held-out split goes into one batch, so every sentence is padded to
# the same length and the prediction comes back as one array. A dedicated name
# is used so the training `batch_size` is not overwritten.
test_batch_size = len(test_sentences)

# transform_pad is the padding helper defined with the training pipeline; it
# is reused here instead of being defined a second time.

# Use tensorflow ragged constants to get the ragged version of data
test_processed_x = tf.ragged.constant(test_sentences)
test_processed_y = tf.ragged.constant(test_tags)

# Create TF Dataset
test_data = tf.data.Dataset.from_tensor_slices((test_processed_x, test_processed_y))

#############
# Test data
#############
# Apply all data processing logic (no shuffling: the order must stay aligned
# with test_tags)
test_data = test_data.batch(test_batch_size)
test_data = test_data.map(transform_pad, num_parallel_calls=tf.data.AUTOTUNE)
test_data = test_data.prefetch(tf.data.AUTOTUNE)

print("test_data", test_data)

In [108]:
batch_size

In [109]:
# View some data from tf dataset
for input_f, output_f in test_data.take(1):
  print(input_f.shape)
  print(input_f[0])
#   print(input_b[0])
  print("************************")
  print(output_f.shape)
  print(output_f[0])
#   print(output_b[0])

In [110]:
test_data

In [337]:
predicted_tags = model.predict(test_data).argmax(axis=-1)

In [360]:
predicted_tags[0]

In [339]:
# Cut every predicted row back to the length of its reference tag sequence so
# that the padded positions are dropped.
predicted_tags_curtailed = [
    row[:len(test_tags[position])]
    for position, row in enumerate(predicted_tags)
]

In [147]:
# Map every tag name to the integer id actually used in the tag sequences: the
# LabelEncoder index plus the +1 shift applied after encoding (0 is padding).
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_) + 1))
print(le_name_mapping)

In [345]:
len(predicted_tags_curtailed[4])

In [None]:
len(test_tags[4])

In [138]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [348]:
a = [64,62,1,2]
b = [64,64,2,2]

In [349]:
classification_report(a,b)

In [354]:
pip install seqeval

In [355]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

In [359]:
test_tags[1][-10:]

In [357]:
classification_report(test_tags, predicted_tags_curtailed)

In [350]:
print(classification_report(test_tags, predicted_tags_curtailed))

In [352]:
test_tags[0]