In [None]:
# TensorFlow is an end-to-end open source platform for machine learning. It has a comprehensive, flexible ecosystem of tools, libraries and community
import tensorflow as tf

In [None]:
# Demo embedding layer: maps integer token ids from a vocabulary of 1000
# entries to dense vectors with 5 components each.
from tensorflow.keras import layers

embedding_layer = layers.Embedding(input_dim=1000, output_dim=5)

In [None]:
# NumPy is used to work with arrays. The array object in NumPy is called ndarray.
from numpy import array
import tensorflow as tf
# Tokenizer builds a word index from a text corpus and turns each text into a sequence of integer word ids.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# A Sequential model is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor.
from tensorflow.keras.models import  Sequential
# LSTM - Long Short-Term Memory layer
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional

In [None]:
# Upload the training dataset (train.txt) into the Colab working directory.
# This cell only works inside Google Colab; the captured output below shows
# the uploaded file being saved as train.txt.
from google.colab import files

uploaded = files.upload()

Saving train.txt to train.txt


In [None]:
# Upload the dev dataset (dev.txt) into the Colab working directory.
# NOTE: this rebinds `uploaded`, replacing the value from the train upload
# cell; the captured output below shows the file being saved as dev.txt.
from google.colab import files

uploaded = files.upload()

Saving dev.txt to dev.txt


In [None]:
# Read the raw train and dev files into lists of lines (one example per line,
# trailing newline kept).
# The files contain Bengali text, so the encoding is stated explicitly instead
# of relying on the platform default, which is not UTF-8 on every system.
with open('train.txt', 'r', encoding='utf-8') as train_file:
    train_data = train_file.readlines()

# NOTE: `test_data` holds the dev split (dev.txt); the name is kept because
# later cells refer to it.
with open('dev.txt', 'r', encoding='utf-8') as test_file:
    test_data = test_file.readlines()

In [None]:
# Separate labels and examples (documents) for the train data and the
# test (dev) data.
def split_label_and_document(lines):
    """Split each raw line into its label and its document text.

    Every line is cut at the first space: the part before it is the label,
    the remainder (including any trailing newline) is the document.

    Args:
        lines: iterable of raw text lines.

    Returns:
        A ``(labels, documents)`` tuple of two lists of equal length.

    Raises:
        ValueError: if a non-blank line contains no space, i.e. it has no
            document part after the label.
    """
    labels = []
    documents = []
    for line_number, line in enumerate(lines, start=1):
        # Blank lines (e.g. a trailing empty line at the end of the file)
        # carry no example; skipping them keeps labels and documents aligned.
        if not line.strip():
            continue
        label, separator, document = line.partition(' ')
        if not separator:
            raise ValueError(
                "Line {} has no space separating label and document: {!r}".format(
                    line_number, line))
        labels.append(label)
        documents.append(document)
    return labels, documents


train_labels, train_documents = split_label_and_document(train_data)
test_labels, test_documents = split_label_and_document(test_data)

In [None]:
# Map each label string (double quotes included, as they appear in the keys)
# to an integer class id, numbered from 0 to 5 in the order listed here.
label_dictionary = {
    label: class_id
    for class_id, label in enumerate([
        '"বিস্ময়"',
        '"ভয়"',
        '"রাগ"',
        '"বিতৃষ্ণা"',
        '"বিষণ্ণতা"',
        '"সুখী"',
    ])
}

In [None]:
# Tokenization: build the word index from the training documents only, then
# encode both splits as sequences of integer word ids with that same index.
# No oov_token is configured on the Tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_documents)
dense_train_doc, dense_test_doc = (
    tokenizer.texts_to_sequences(documents)
    for documents in (train_documents, test_documents)
)

In [None]:
# Pad every encoded document to a common length so the splits can be fed to
# the model as rectangular arrays. The target length is the longest encoded
# training document.
MAX_LENGTH = max(map(len, dense_train_doc))

# Training split, padded at the end of each sequence ('post').
padded_train_doc = pad_sequences(dense_train_doc, maxlen=MAX_LENGTH, padding='post')
print(padded_train_doc)

# Dev split, padded with the same settings and the same target length.
padded_test_doc = pad_sequences(dense_test_doc, maxlen=MAX_LENGTH, padding='post')
print(padded_test_doc)

[[4364 1620  399 ...    0    0    0]
 [ 888    3  414 ...    0    0    0]
 [6784 4391 6785 ...    0    0    0]
 ...
 [  16  977 1527 ...    0    0    0]
 [  31    9   72 ...    0    0    0]
 [ 925 1196  925 ...    0    0    0]]
[[  26   29 1068 ...    0    0    0]
 [ 265  253  684 ...    0    0    0]
 [ 369  989 6265 ...    0    0    0]
 ...
 [  10  637  531 ...    0    0    0]
 [  43 3568   11 ...    0    0    0]
 [1516   31 1621 ...    0    0    0]]


In [None]:
# Model declaration: Embedding -> Bidirectional LSTM -> softmax classifier.

# Vocabulary size is the number of words known to the fitted tokenizer plus
# one, because word ids start at 1 and id 0 is the padding value.
# Taking it from the tokenizer (instead of scanning the encoded documents for
# their largest id) also works when some encoded document is empty.
VOCUB_SIZE = len(tokenizer.word_index) + 1

# One output unit per class in label_dictionary, so the output layer cannot
# drift away from the label mapping.
NUM_CLASSES = len(label_dictionary)

model = Sequential()

# Embedding layer: input dimension is the vocabulary size, and the input
# length is the padded document length (MAX_LENGTH).
embedding_layer = Embedding(input_dim=VOCUB_SIZE, output_dim=20, input_length=MAX_LENGTH)
model.add(embedding_layer)

# Bidirectional LSTM. return_sequences=False makes each direction emit only
# its final output instead of one output per time step, which is what the
# classifier head below needs. A custom backward layer has to be created with
# go_backwards=True.
forward_layers = LSTM(units=128, return_sequences=False)
backward_layers = LSTM(units=128, return_sequences=False, go_backwards=True)
model.add(Bidirectional(layer=forward_layers, backward_layer=backward_layers))

# Output layer: softmax over the classes.
model.add(Dense(units=NUM_CLASSES, activation='softmax'))

# Nadam optimizer with categorical cross-entropy loss; the loss expects
# one-hot encoded labels.
model.compile(optimizer='Nadam', loss='categorical_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 584, 20)           345240    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               152576    
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 1542      
Total params: 499,358
Trainable params: 499,358
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
import numpy as np
# Convert the string labels into one-hot (binary class matrix) form.
#
# NOTE: this cell rebinds `train_labels` and `test_labels` from lists of label
# strings to one-hot arrays, so run it exactly once after the cell that splits
# labels from documents. Re-running it on its own would try to look up rows of
# the one-hot arrays in label_dictionary and fail.

# Map label strings to integer class ids.
train_lbls = [label_dictionary[i] for i in train_labels]
# Show only a short preview instead of dumping the whole list.
print(train_lbls[:20])
# num_classes is passed explicitly so both splits get the same number of
# columns even if one of them does not contain the highest class id.
train_labels = tf.keras.utils.to_categorical(train_lbls, num_classes=len(label_dictionary))
print(train_labels)

test_lbls = [label_dictionary[i] for i in test_labels]
test_labels = tf.keras.utils.to_categorical(test_lbls, num_classes=len(label_dictionary))
print(test_labels)

[4, 4, 2, 5, 2, 4, 3, 2, 0, 4, 1, 2, 3, 3, 2, 0, 2, 1, 5, 0, 3, 2, 5, 2, 1, 3, 1, 4, 4, 5, 4, 5, 5, 2, 5, 4, 1, 0, 0, 0, 5, 4, 3, 0, 5, 0, 5, 5, 4, 5, 3, 5, 5, 2, 2, 3, 3, 2, 1, 0, 5, 5, 1, 2, 5, 0, 3, 3, 3, 4, 3, 2, 2, 4, 2, 2, 0, 1, 2, 3, 3, 0, 2, 2, 3, 1, 3, 0, 3, 2, 3, 2, 4, 5, 5, 4, 3, 4, 5, 5, 0, 2, 1, 4, 0, 0, 4, 0, 1, 4, 3, 2, 2, 5, 3, 4, 0, 4, 1, 5, 4, 5, 3, 4, 2, 1, 4, 2, 1, 4, 4, 0, 3, 2, 4, 2, 5, 0, 2, 2, 5, 3, 4, 5, 4, 2, 5, 4, 5, 2, 2, 4, 4, 2, 4, 4, 0, 1, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 5, 2, 2, 0, 0, 2, 2, 5, 2, 4, 0, 1, 4, 4, 4, 4, 2, 2, 0, 4, 0, 2, 0, 0, 2, 5, 4, 4, 4, 2, 4, 2, 2, 3, 4, 0, 2, 0, 0, 0, 2, 4, 2, 4, 0, 4, 0, 5, 5, 5, 5, 2, 5, 2, 3, 1, 0, 0, 0, 1, 4, 4, 0, 2, 3, 1, 4, 4, 2, 1, 3, 0, 3, 1, 4, 2, 2, 3, 1, 0, 2, 2, 2, 0, 4, 4, 0, 2, 1, 5, 2, 4, 1, 2, 0, 2, 0, 0, 4, 4, 2, 2, 1, 4, 1, 4, 0, 4, 4, 2, 3, 0, 4, 2, 2, 4, 4, 1, 1, 4, 0, 2, 4, 3, 3, 0, 4, 4, 4, 2, 4, 0, 1, 0, 2, 4, 0, 2, 4, 2, 4, 2, 4, 4, 1, 4, 2, 1, 0, 

In [None]:
# Train the model. The dev split is used as validation data, and training
# stops once the validation loss has not improved for 10 consecutive epochs,
# restoring the best weights seen. epochs=1000 therefore acts only as an upper
# bound rather than a fixed number of passes.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
history = model.fit(
    padded_train_doc,
    train_labels,
    validation_data=(padded_test_doc, test_labels),
    epochs=1000,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x7ff056c9bed0>

In [None]:
# Pad the encoded dev documents to MAX_LENGTH (padding at the end) and print
# the resulting array.
# NOTE(review): `padded_test_doc` is already built with the identical call in
# the padding cell earlier in the notebook, so this cell recomputes the same
# array; it looks redundant but harmless.
padded_test_doc = pad_sequences(dense_test_doc, maxlen=MAX_LENGTH, padding='post')
print(padded_test_doc)

[[  26   29 1068 ...    0    0    0]
 [ 265  253  684 ...    0    0    0]
 [ 369  989 6265 ...    0    0    0]
 ...
 [  10  637  531 ...    0    0    0]
 [  43 3568   11 ...    0    0    0]
 [1516   31 1621 ...    0    0    0]]


In [None]:
# Map each class id (0-5) back to its label string (double quotes included).
class_type = dict(enumerate([
    '"বিস্ময়"',
    '"ভয়"',
    '"রাগ"',
    '"বিতৃষ্ণা"',
    '"বিষণ্ণতা"',
    '"সুখী"',
]))

# Predict on the padded dev documents and take, per document, the index of
# the highest score along the last axis.
class_scores = model.predict(padded_test_doc)
ynew = np.argmax(class_scores, axis=-1)
print(ynew)

# Print one line per dev document: index, predicted class id, label, text.
report_template = "Test Doc - {}: Prediction = {} ({}) -- {}"
for indx, (predicted_class, doc) in enumerate(zip(ynew, test_documents)):
    print(report_template.format(indx, predicted_class, class_type[predicted_class], doc))

[3 4 2 1 5 4 2 5 3 4 4 4 0 5 0 5 0 1 5 5 0 4 0 4 4 1 2 0 4 5 4 4 3 4 1 1 0
 4 5 3 4 4 4 1 1 4 0 2 2 3 4 2 5 4 5 4 2 4 4 2 4 5 0 2 3 4 2 2 2 2 2 5 0 5
 4 5 4 2 3 2 2 5 2 5 2 0 4 3 2 4 2 2 2 4 1 5 5 4 5 4 2 5 2 4 4 5 5 2 5 3 2
 5 4 1 5 2 4 4 4 2 2 4 0 4 4 2 5 2 2 2 4 2 0 2 2 2 3 3 2 0 0 4 2 3 2 4 2 4
 2 4 5 5 5 2 0 2 2 2 2 5 1 2 4 2 4 0 2 4 2 4 2 2 5 2 1 5 2 4 0 3 1 4 5 0 0
 4 2 5 0 2 5 0 1 3 1 1 5 5 4 0 5 5 4 3 2 5 1 0 3 0 0 1 0 4 5 5 0 4 5 2 2 2
 5 2 0 0 5 1 0 5 4 4 2 5 2 4 2 2 0 2 0 0 2 2 4 4 5 0 2 3 4 3 5 4 1 5 2 4 2
 2 5 5 3 4 0 5 3 4 4 4 2 5 0 3 4 3 4 2 4 4 2 5 2 5 2 4 3 2 4 4 5 5 0 0 5 4
 2 0 0 4 4 3 5 0 5 1 2 4 5 2 5 4 0 3 4 5 4 2 2 2 4 5 4 0 4 5 2 4 0 2 4 5 3
 5 3 0 0 4 2 2 5 0 2 4 5 2 0 2 5 1 3 4 2 0 4 2 2 5 2 2 0 2 5 2 3 4 1 3 4 4
 3 2 4 4 2 2 3 2 2 5 3 2 2 3 0 4 4 3 5 2 0 4 2 5 2 5 4 4 5 4 2 5 1 2 5 0 5
 2 2 2 0 0 4 2 2 4 5 5 2 4 2 2 2 1 4 4 0 3 1 5 5 5 2 4 2 0 2 5 5 2 2 2 4 0
 4 2 4 2 5 5 0 2 5 3 0 2 2 5 2 2 5 5 0 4 5 4 4 5 0 5 1 2 2 5 3 4 2 0 5 5 5
 4 3 2 2 2 2 5 2 1 1 1 3 