In [6]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
import pandas as pd
from collections import namedtuple
from sklearn import preprocessing
from bert import bert_tokenization
# Print the installed TensorFlow and TensorFlow Hub versions.
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)

# Prerequisites: install the following packages before running the imports above.
# pip install bert-for-tf2
# pip install sentencepiece

TensorFlow Version: 2.4.0
Hub version:  0.10.0


In [7]:
# Read the tab-separated training and validation splits into DataFrames
df_train, df_val = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ('tamil_train.tsv', 'tamil_dev.tsv')
)

In [8]:
# Display the column names of the training DataFrame
df_train.columns

Index(['text', 'category'], dtype='object')

In [9]:
# Prepare input text and one hot encoded labels for train and validation sets

# Sorted distinct label strings found in the training split
unique_labels = list(np.unique(df_train["category"]))

train_x = df_train["text"].values
train_y = df_train["category"].values

# The label -> integer mapping is learned from the training labels only
le = preprocessing.LabelEncoder()

train_y = le.fit_transform(train_y)
train_y = tf.keras.utils.to_categorical(train_y, num_classes=len(unique_labels), dtype='float32')

val_x = df_val["text"].values
val_y = df_val["category"].values

# Reuse the mapping fitted on the training labels. Re-fitting on the validation
# labels would assign ids from the validation split's own label set, so the
# one-hot columns could stop lining up with the training targets.
# `transform` raises ValueError if the validation split holds an unseen label.
val_y = le.transform(val_y)
val_y = tf.keras.utils.to_categorical(val_y, num_classes=len(unique_labels), dtype='float32')


print("number of unique labels", len(unique_labels))

number of unique labels 5


In [10]:
# Display the distinct label strings collected from the training split
unique_labels

['Mixed_feelings ', 'Negative ', 'Positive ', 'not-Tamil ', 'unknown_state ']

In [11]:
# Function to create input_ids
def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and right-pad with zeros.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: target length of the returned list.

    Returns:
        The ids followed by as many ``0`` entries as needed to reach
        ``max_seq_length``. If there are already at least that many ids,
        they are returned without padding and without truncation.
    """
    vocab_ids = tokenizer.convert_tokens_to_ids(tokens)
    pad_count = max_seq_length - len(vocab_ids)
    return vocab_ids + [0] * pad_count

# Function to create attention masks
def get_masks(tokens, max_seq_length):
    """Build the attention mask: 1 per token, then 0s up to max_seq_length.

    When ``tokens`` is already at least ``max_seq_length`` long, no zeros are
    appended and the mask has one ``1`` per token.
    """
    token_count = len(tokens)
    attended = [1] * token_count
    padded = [0] * (max_seq_length - token_count)
    return attended + padded

# Function to create segment ids
def get_segments(tokens, max_seq_length):
    """Build segment ids: 0 up to and including the first "[SEP]", 1 afterwards.

    The result is right-padded with 0 up to ``max_seq_length``; no padding is
    added when ``tokens`` is already that long or longer.
    """
    segment_ids = []
    past_separator = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        segment_ids.append(1 if past_separator else 0)
        past_separator = past_separator or tok == "[SEP]"
    padding = [0] * (max_seq_length - len(tokens))
    return segment_ids + padding

# Function to create input_ids, attention_masks, segment_ids for sample
def create_single_input(sentence,MAX_LEN, MAX_SEQ_LEN):
  """Encode one sentence into (input ids, attention mask, segment ids).

  The sentence is tokenized with the module-level ``tokenizer``, cut to at
  most ``MAX_LEN`` tokens, wrapped in "[CLS]" ... "[SEP]", and each of the
  three lists is padded to ``MAX_SEQ_LEN`` by the helper functions.
  """
  pieces = tokenizer.tokenize(sentence)[:MAX_LEN]
  framed = ["[CLS]", *pieces, "[SEP]"]

  return (
      get_ids(framed, tokenizer, MAX_SEQ_LEN),
      get_masks(framed, MAX_SEQ_LEN),
      get_segments(framed, MAX_SEQ_LEN),
  )

def create_input_array(sentences, MAX_SEQ_LEN):
  """Encode every sentence and stack the results into three int32 arrays.

  Returns a list ``[input_ids, input_masks, input_segments]`` with one row
  per sentence. A tqdm progress bar is shown while encoding.
  """
  # Two positions are kept free for the "[CLS]" and "[SEP]" markers.
  token_budget = MAX_SEQ_LEN - 2

  encoded = [
      create_single_input(text, token_budget, MAX_SEQ_LEN)
      for text in tqdm(sentences, position=0, leave=True)
  ]

  id_rows = [row[0] for row in encoded]
  mask_rows = [row[1] for row in encoded]
  segment_rows = [row[2] for row in encoded]

  return [np.asarray(rows, dtype=np.int32)
          for rows in (id_rows, mask_rows, segment_rows)]

In [12]:
# Load MuRIL from TF Hub as a Keras layer whose weights are trainable
muril_layer = hub.KerasLayer("https://tfhub.dev/google/MuRIL/1", trainable=True)

# Build the tokenizer from the vocabulary file and lower-casing flag
# exposed by the loaded hub object
hub_assets = muril_layer.resolved_object
vocab_file = hub_assets.vocab_file.asset_path.numpy()
do_lower_case = hub_assets.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [13]:
# Create input_ids, attention_masks, segment_ids for training and validation sets with max_seq_len as 128
max_seq_len = 128
# Tokenize straight from the DataFrames instead of from train_x / val_x:
# those names are overwritten here, so feeding them back in would hand the
# already-encoded arrays to the tokenizer if this cell were run a second time.
train_x = create_input_array(df_train["text"].values, max_seq_len)
val_x = create_input_array(df_val["text"].values, max_seq_len)

100%|██████████| 11335/11335 [00:01<00:00, 6271.16it/s]
100%|██████████| 1260/1260 [00:00<00:00, 5982.93it/s]


In [14]:
# Define model function - compile and fit
def model_fit(train_x, train_y, val_x, val_y, max_seq_length, num_epochs, muril_layer):
  """Build, compile and train a classifier on top of ``muril_layer``.

  Args:
    train_x: list ``[input_word_ids, input_mask, segment_ids]`` of arrays,
      each with ``max_seq_length`` columns.
    train_y: one-hot encoded training labels; its last dimension sets the
      number of output units.
    val_x: validation inputs, same layout as ``train_x``.
    val_y: one-hot encoded validation labels.
    max_seq_length: sequence length of the three model inputs.
    num_epochs: number of training epochs.
    muril_layer: hub layer called with a dict of the three inputs; its
      ``"pooled_output"`` entry feeds the classification head.

  Returns:
    The fitted ``tf.keras`` model.
  """
  # Size the output layer from the labels instead of hard-coding 5 classes.
  num_classes = int(train_y.shape[-1])

  input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
  input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
  segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")

  outputs = muril_layer(dict(input_word_ids = input_word_ids, input_mask = input_mask, input_type_ids = segment_ids))

  x = tf.keras.layers.Dropout(0.2)(outputs["pooled_output"]) # take pooled output layer
  final_output = tf.keras.layers.Dense(num_classes, activation="softmax", name="dense_output")(x)

  model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids], outputs=final_output)

  # NOTE(review): 'adam' uses the Keras default learning rate. With a trainable
  # encoder a much smaller rate is commonly used for fine-tuning -- consider
  # tf.keras.optimizers.Adam(learning_rate=2e-5) and confirm on validation data.
  model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
  model.fit(train_x, train_y, epochs = num_epochs, batch_size = 32, validation_data = (val_x, val_y), shuffle = True)

  return model

In [15]:
# Number of passes over the training data
num_epochs = 1

# Build, compile and train the classifier; keep the fitted model
model = model_fit(
    train_x=train_x,
    train_y=train_y,
    val_x=val_x,
    val_y=val_y,
    max_seq_length=max_seq_len,
    num_epochs=num_epochs,
    muril_layer=muril_layer,
)



In [16]:
# Run the trained model on the encoded validation inputs; the model's output
# comes from its softmax `dense_output` layer.
preds = model.predict(val_x)