In [1]:
import numpy as np
import pandas as pd 
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, TimeDistributed
from tensorflow.keras.layers import BatchNormalization, Bidirectional, LSTM, concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from datasets import load_dataset
import json
import pickle
import fasttext
import fasttext.util
import tempfile

2022-11-27 08:39:49.494081: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
filepath='./Data/xnli_hi_train.json'

In [3]:
def generate_dataset(filepath):
    premise = []
    hypothesis = []
    label = []
    with open(filepath, "r") as f:
        data = json.load(f)
        data = data[list(data.keys())[0]]
        #print(data)
    for idx, row in enumerate(data):
        premise.append(row["premise"])
        hypothesis.append(row["hypothesis"])
        label.append(row["label"])
    df = pd.DataFrame(list(zip(premise, hypothesis,label)),
               columns =['premise', 'hypothesis','label'])
    return df

In [4]:
data = generate_dataset(filepath)

In [5]:
data.head()

Unnamed: 0,premise,hypothesis,label
0,अवधारणात्मक रूप से क्रीम स्किमिंग के दो बुनिया...,उत्पाद और भूगोल क्रीम स्किमिंग का काम करते हैं।,1
1,आप मौसम के दौरान पता है और मुझे लगता है कि अपन...,अगर लोग याद करते हैं तो आप निम्नलिखित स्तर पर ...,0
2,हमारी संख्या में से एक आपके निर्देशों का बारीक...,मेरी टीम का एक सदस्य आपके आदेशों को बहुत सटीकत...,0
3,तुम्हें कैसे पता? यह सब फिर से उनकी जानकारी है.,यह जानकारी उनके पास है।,0
4,हाँ मैं आपको बताती हूँ कि अगर आप उन टेनिस जूतो...,टेनिस जूतों की कीमतों की एक श्रृंखला है।,1


In [6]:
def preprocess(data):
    # Get the sentences and labels from composite data
    list_premise = data['premise']
    list_hypothesis = data['hypothesis']
    list_label = data['label']
    # Merge each sublist (tokens list of each sentence) to a string
    corpus_premise = [''.join(item) for item in list_premise]
    corpus_hypothesis = [''.join(item) for item in list_hypothesis]
    num_samples = len(list_label)
    labels = np.array(list_label)
    corpus = [corpus_premise[ind] + " " + corpus_hypothesis[ind] for ind in range(len(labels))]
    
    return corpus_premise, corpus_hypothesis, labels, corpus

In [7]:
premise_list, hypothesis_list, labels_list, corpus = preprocess(data)

In [8]:
tokenizer = Tokenizer()
# tokenizer.fit_on_texts(corpus)

# # Save the tokenizer as a pickle file so that the same tokenizer (word-integer)
# # mapping can be used during testing time
# with open('./tokenizer.pickle', "wb") as file:
#     pickle.dump(tokenizer, file)

In [9]:
EMBEDDING_DIM = 300 #FastText output dimensions for each wordvector

In [10]:
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
ft = fasttext.load_model('Data/cc.hi.300.bin')



In [11]:
for word, ind in word_index.items():
    # Get the embedding vector from FastText ouput, if available
    embedding_vector = ft.get_word_vector(word)
#     print(embedding_vector)

    if embedding_vector is not None:
        embedding_matrix[ind] = embedding_vector

In [12]:
MAX_SEQ_LEN = 281
sequence = lambda sentence: pad_sequences(tokenizer.texts_to_sequences(sentence), maxlen=MAX_SEQ_LEN)
process = lambda item: (sequence(item[0]), sequence(item[1]), to_categorical(item[2]))

training_data = process([premise_list, hypothesis_list, labels_list])
print(len(word_index) + 1)

1


In [13]:

LSTM_UNITS = 64

VOCAB_SIZE = 81649
EMBEDDING_HIDDEN_SIZE = 300
SENT_HIDDEN_SIZE = 300
TRAIN_EMBED = False

L2 = 4e-6
ACTIVATION = 'relu'
DROPOUT = 0.2
LEARNING_RATE = 0.01
RHO = 0.9
EPSILON = 1e-08
DECAY = 0.0

CATEGORIES = 3
BATCH_SIZE = 512
TRAINING_EPOCHS = 10
VALIDATION_SPLIT = 0.02

PATIENCE = 4


def BiLSTM(premise_list, hypothesis_list, labels_list, embedding_matrix):
    #Embedding layer with obtained embedding matrix
    # Define the embedding layer with the obtained weight matrix
    embedding = Embedding(input_dim = embedding_matrix.shape[0], output_dim = EMBEDDING_HIDDEN_SIZE, weights = [embedding_matrix], input_length = MAX_SEQ_LEN, trainable = TRAIN_EMBED)
    BiLSTM = Bidirectional(LSTM(LSTM_UNITS)) #BiLSTM Layer
    translation = TimeDistributed(Dense(SENT_HIDDEN_SIZE, activation=ACTIVATION)) #Time Distributed Layer to increase performance
    
    # Defining the input layers and its shapes for premise and hypothesis
    premise = Input(shape=(MAX_SEQ_LEN,), dtype='int32')
    hypothesis = Input(shape=(MAX_SEQ_LEN,), dtype='int32')

    # Embed the premise and hypothesis
    premise_embedded = embedding(premise)
    hypothesis_embedded = embedding(hypothesis)

    # Apply the translation layer
    premise_translated = translation(premise_embedded)
    hypothesis_translated = translation(hypothesis_embedded)

    # Apply the bidirectional LSTM layer
    premise_BiLSTM = BiLSTM(premise_translated)
    hypothesis_BiLSTM = BiLSTM(hypothesis_translated)
    
    # Apply Batch normalization
    premise_normalized = BatchNormalization()(premise_BiLSTM)
    hypothesis_normalized = BatchNormalization()(hypothesis_BiLSTM)

    # Concatenate the normalized premise and hypothesis and apply a dropout layer
    train_input = concatenate([premise_normalized, hypothesis_normalized])
    train_input = Dropout(DROPOUT)(train_input)
    
    # Apply the (Dense layer, Dropout layer. Batch normalization layer) unit : 1
    train_input = Dense(2 * SENT_HIDDEN_SIZE, activation=ACTIVATION, kernel_regularizer=l2(L2))(train_input)
    train_input = Dropout(DROPOUT)(train_input)
    train_input = BatchNormalization()(train_input)

    # Apply the (Dense layer, Dropout layer. Batch normalization layer) unit : 2
    train_input = Dense(2 * SENT_HIDDEN_SIZE, activation=ACTIVATION, kernel_regularizer=l2(L2))(train_input)
    train_input = Dropout(DROPOUT)(train_input)
    train_input = BatchNormalization()(train_input)

    # Apply the (Dense layer, Dropout layer. Batch normalization layer) unit : 3
    train_input = Dense(2 * SENT_HIDDEN_SIZE, activation=ACTIVATION, kernel_regularizer=l2(L2))(train_input)
    train_input = Dropout(DROPOUT)(train_input)
    train_input = BatchNormalization()(train_input)

    # Define the output Dense layer
    prediction = Dense(CATEGORIES, activation='softmax')(train_input)

    # Define the complete model
    model = Model(inputs=[premise, hypothesis], outputs=prediction)

    # Choosing an optimizer
    optimizer = RMSprop(lr=LEARNING_RATE, rho=RHO, epsilon=EPSILON, decay=DECAY)
    
    # Compile the model and print out the model summary
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    print("Training model")

    # ReduceLROnPlateau callback to reduce learning rate when the validation accuracy plateaus
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                                patience=PATIENCE,
                                                verbose=1,
                                                factor=0.5,
                                                min_lr=0.00001)

    # Early stopping callback to stop training if we are not making any positive progress
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=PATIENCE)

    # ModelCheckpoint callback to save the model with best performance
    # A temporary file is created to which the intermediate model weights are stored
    _, tmpfn = tempfile.mkstemp()
    model_checkpoint = ModelCheckpoint(tmpfn, save_best_only=True, save_weights_only=True)

    callbacks = [early_stopping, model_checkpoint, learning_rate_reduction]
    
    # Train the model
    history = model.fit(x=[np.array(premise_list), np.array(hypothesis_list)], y=labels_list, batch_size=BATCH_SIZE, epochs=TRAINING_EPOCHS, validation_split=VALIDATION_SPLIT, callbacks=callbacks)

    # Restore the best found model during validation
    model.load_weights(tmpfn)

   
    # Uncomment for generating plots.
#     plot(history, "BiLSTM")

    # Save the model as h5 file
    model.save("./model/BiLSTM.h5")




In [14]:
# Uncomment to train the model
# BiLSTM(training_data[0], training_data[1], training_data[2], embedding_matrix)

In [15]:
testfilepath='./Data/xnli_hi_test.json'

In [16]:
test_data = generate_dataset(testfilepath)

In [17]:
test_data.head()

Unnamed: 0,premise,hypothesis,label
0,"खैर, मैं उस बारे में सोच भी नहीं रहा था, लेकिन...",मैंने फिर उससे बात नहीं की।,2
1,"खैर, मैं उस बारे में सोच भी नहीं रहा था, लेकिन...",मैं इतना परेशान था कि मैंने उससे फिर बात करना ...,0
2,"खैर, मैं उस बारे में सोच भी नहीं रहा था, लेकिन...",हमारी बहुत अच्छी बातचीत हुई।,1
3,"और मैंने सोचा कि यह एक विशेषाधिकार था, और यह अ...",मुझे नहीं पता था कि मैं अकेला ऐसा व्यक्ति नहीं...,1
4,"और मैंने सोचा कि यह एक विशेषाधिकार था, और यह अ...","""उन्होंने कहा,"" ""मुझे लगा था कि मैं एकमात्र ऐस...",0


In [18]:
test_premise_list, test_hypothesis_list, test_labels_list, test_corpus = preprocess(test_data)

In [19]:
with open('./tokenizer.pickle', "rb") as file:
    tokenizer = pickle.load(file)

# Process the data to integer sequences and labels to one-hot labels
sequence = lambda sentence: pad_sequences(tokenizer.texts_to_sequences(sentence), maxlen=MAX_SEQ_LEN)
process = lambda item: (sequence(item[0]), sequence(item[1]), to_categorical(item[2]))

test_data = process([test_premise_list, test_hypothesis_list, test_labels_list])

In [20]:
output_file = open('./results/BiLSTM.txt', 'w')

In [21]:
from tensorflow.keras.models import load_model
model = load_model('./model/BiLSTM.h5')

2022-11-27 08:40:15.825299: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
loss, accuracy = model.evaluate(x=[test_data[0], test_data[1]], y=test_data[2], batch_size=BATCH_SIZE)
print("Test Loss: {:.2f}, Test Accuracy: {:.2f}%\n".format(loss, (accuracy*100)))

# Obtain the predicted classes
Y_pred = model.predict([test_data[0], test_data[1]])
Y_pred = np.argmax(Y_pred, axis=1)
Y_test = np.argmax(test_data[2], axis=1)

Test Loss: 0.82, Test Accuracy: 64.09%



In [23]:
for index in range(Y_pred.shape[0]):
    if Y_pred[index] == 0:
        output_file.write("Entailment\n")
    elif Y_pred[index] == 1:
        output_file.write("Neutral\n")
    elif Y_pred[index] == 2:
        output_file.write("Contradiction\n")
    else:
        pass

output_file.close()

In [24]:
# import seaborn as sns
# import matplotlib.pyplot as plt


# from utils.plot_confusion_matrix import plot_confusion_matrix
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report
# from tensorflow.keras.utils import plot_model

In [25]:
# confusion_mtx = confusion_matrix(Y_test, Y_pred)
# plot_confusion_matrix(confusion_mtx, "", classes=range(3))

# target_names = ["Class {}".format(i) for i in range(CATEGORIES)]
# classification_rep = classification_report(Y_test, Y_pred, target_names=target_names, output_dict=True)

# plt.figure()
# sns.heatmap(pd.DataFrame(classification_rep).iloc[:-1, :].T, annot=True)
# plt.savefig('./results/classification_report.png')
# # plt.show()
# plot_model(model, to_file='./results/model_plot.png', show_shapes=True, show_layer_names=True)

In [26]:
from tensorflow.keras.models import load_model

def predict(premise, hypothesis):
    test_premise_list = [premise]
    test_hypothesis_list = [hypothesis]
    with open('./tokenizer.pickle', "rb") as file:
        tokenizer = pickle.load(file)

    # Process the data to integer sequences and labels to one-hot labels
    sequence = lambda sentence: pad_sequences(tokenizer.texts_to_sequences(sentence), maxlen=MAX_SEQ_LEN)
    process = lambda item: (sequence(item[0]), sequence(item[1]))

    test_data = process([test_premise_list, test_hypothesis_list])
    
    model = load_model('./model/BiLSTM.h5')
    Y_pred = model.predict([test_data[0], test_data[1]])
    Y_pred = np.argmax(Y_pred, axis=1)
    
    for index in range(Y_pred.shape[0]):
        if Y_pred[index] == 0:
            print("Entailment")
        elif Y_pred[index] == 1:
            print("Neutral")
        elif Y_pred[index] == 2:
            print("Contradiction")
        else:
            pass

In [45]:
predict("हम घर जाना चाहते हैं", "हम घर नहीं जाना चाहते")

Contradiction


In [28]:
predict("मुझे क्रिकेट खेलने में मजा आता है", "मुझे क्रिकेट खेलना पसंद है")

Entailment


In [29]:
"""The brown fox sat in front of the fence, The brown fox is near the fence"""
predict("भूरी लोमड़ी बाड़े के सामने बैठ गई", "भूरी लोमड़ी बाड़ के पास है")

Entailment


In [30]:
"""The brown fox sat in front of the fence, The brown fox is happy"""
predict("भूरी लोमड़ी बाड़े के सामने बैठ गई", "भूरी लोमड़ी खुश है")

Contradiction


In [31]:
"""The brown fox sat in front of the fence, The brown fox is far from the fence"""
predict("भूरी लोमड़ी बाड़े के सामने बैठ गई", "भूरी लोमड़ी बाड़ से बहुत दूर है")

Contradiction


In [32]:
# import seaborn as sns
# import matplotlib.pyplot as plt


# from utils.plot_confusion_matrix import plot_confusion_matrix
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report
# from tensorflow.keras.utils import plot_model

In [33]:
# confusion_mtx = confusion_matrix(Y_test, Y_pred)
# plot_confusion_matrix(confusion_mtx, "", classes=range(3))

# target_names = ["Class {}".format(i) for i in range(CATEGORIES)]
# classification_rep = classification_report(Y_test, Y_pred, target_names=target_names, output_dict=True)

# plt.figure()
# sns.heatmap(pd.DataFrame(classification_rep).iloc[:-1, :].T, annot=True)
# plt.savefig('./results/classification_report.png')
# # plt.show()
# plot_model(model, to_file='./results/model_plot.png', show_shapes=True, show_layer_names=True)

In [34]:
# print(Y_test[0:5], Y_pred[0:5])

In [35]:
# from sklearn.metrics import classification_report,confusion_matrix


In [36]:
# print(classification_report(Y_pred,Y_test))

In [37]:
# confusion_matrix(Y_pred,Y_test)

In [38]:
"""The brown fox sat in front of the fence, The brown fox is far from the fence"""
predict("भारत ने 2011 में वर्ल्ड कप जीता", "2011 में भारत को वर्ल्ड कप मिला")

Entailment


In [39]:
predict("मुझे खेलों में दिलचस्पी है", "मेरी दिलचस्पी क्रिकेट में है") #Hypernymy - Hyponymy

Entailment


In [40]:
predict("बराक ओबामा ने भारत का दौरा किया","बराक ओबामा ने मुंबई का दौरा किया") #Meronymy for places not being captured --- should have been neutral

Entailment


In [41]:
predict("माइकल डेल ने कंपनी के लिए एक नई रणनीति की घोषणा की वह है डेल के संस्थापक","माइकल डेल डेल के संस्थापक हैं") #coreference capture

Entailment


In [42]:
predict("हर कर्मचारी को इनकम टैक्स रिटर्न जरूर फाइल करना चाहिए।","कर्मचारी को इनकम टैक्स रिटर्न फाइल करना चाहिए।")  

Entailment


In [43]:
predict("कुछ तोते बाड़ के ऊपर से उड़ गए", "सभी तोते बाड़ के ऊपर से उड़ गए") #Quantifiers handled (Few, all)

Contradiction


In [44]:
predict("राम ने इनकार किया कि उसने रोटी खाई", "राम ने रोटी खाई") #Should have been neutral

Contradiction


In [48]:
predict("तीन महिलाएं घर में खाना बना रही हैं", "पार्क में तीन महिलाएं हैं")

Entailment
