Imports

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np
import os
import fasttext

Loading Data

In [2]:
# Load the parallel English/Hindi word list.
df = pd.read_csv("transliteration_data.csv", encoding="utf-8")

# Pair each English spelling with its Hindi spelling as "<english>~<hindi>".
lines = ['~'.join((str(english), str(hindi))) for english, hindi in zip(df["english"], df["hindi"])]
df["transliteration_data"] = lines

df.head()

Unnamed: 0,english,hindi,transliteration_data
0,hajagiree,हजगिरी,hajagiree~हजगिरी
1,chekaanv,चेकॉव,chekaanv~चेकॉव
2,spinagaarn,स्पिनगार्न,spinagaarn~स्पिनगार्न
3,medal,मेडल,medal~मेडल
4,chetthinaad,चेत्तिनाद,chetthinaad~चेत्तिनाद


Data Preprocessing

In [3]:
# Split every "<english>~<hindi>" line into a source (English) word and a
# target (Hindi) word.
english_words = []
hindi_words = []
total_samples = len(lines)

for line in lines:
    # Split once and reuse both halves.
    fields = str(line).split('~')
    english_words.append(fields[0])
    # '\t' marks the start of a target word and '\n' marks its end.
    hindi_words.append('\t' + fields[1] + '\n')

# Sorted character vocabularies. A character's position in the list is its
# one-hot index, so the ordering must stay stable for the saved weights.
hindi_chars = sorted(set(''.join(hindi_words)))
english_chars = sorted(set(''.join(english_words)))

# English lookups: index -> character and character -> index.
english_words_index_to_char_dict = dict(enumerate(english_chars))
english_words_char_to_index_dict = {ch: idx for idx, ch in enumerate(english_chars)}

# Hindi lookups: index -> character and character -> index.
# The Hindi vocabulary also contains the '\t' and '\n' boundary markers.
hindi_words_index_to_char_dict = dict(enumerate(hindi_chars))
hindi_words_char_to_index_dict = {ch: idx for idx, ch in enumerate(hindi_chars)}

# Longest word on each side; the Hindi length includes the two boundary markers.
max_len_english_words = max(len(word) for word in english_words)
max_len_hindi_words = max(len(word) for word in hindi_words)

# One-hot tensors of shape (sample, timestep, vocabulary index). Positions past
# the end of a word stay all-zero.
tokenized_english_words = np.zeros(shape=(total_samples, max_len_english_words, len(english_chars)), dtype='float32')
tokenized_hindi_words = np.zeros(shape=(total_samples, max_len_hindi_words, len(hindi_chars)), dtype='float32')
target_data = np.zeros((total_samples, max_len_hindi_words, len(hindi_chars)), dtype='float32')

# One-hot encode the English and Hindi words.
for i, (english_word, hindi_word) in enumerate(zip(english_words, hindi_words)):
    for k, ch in enumerate(english_word):
        tokenized_english_words[i, k, english_words_char_to_index_dict[ch]] = 1

    for k, ch in enumerate(hindi_word):
        ch_index = hindi_words_char_to_index_dict[ch]
        tokenized_hindi_words[i, k, ch_index] = 1

        # target_data is the decoder input shifted one timestep ahead, so it
        # never contains the leading '\t' start marker.
        if k > 0:
            target_data[i, k - 1, ch_index] = 1

Encoder Architecture

In [4]:
# Encoder: takes a one-hot encoded English word of any length. Only the final
# LSTM states (hidden and cell) are kept as the encoder's summary of the word.
encoder_input = Input(shape=(None, len(english_chars)))
encoder_LSTM = LSTM(units=256, return_state=True)
encoder_outputs, *encoder_states = encoder_LSTM(encoder_input)
encoder_h, encoder_c = encoder_states

Inference modelling

In [5]:
# Encoder inference model
# Maps a one-hot encoded input word to the encoder LSTM's final [h, c] states.
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model
# NOTE(review): weights are loaded into these models from .h5 files in a later
# cell; presumably those files expect exactly this architecture (layer sizes and
# wiring) -- confirm before reordering or resizing anything here.
decoder_input = Input(shape=(None, len(hindi_chars)))
# return_sequences=True yields an output per timestep; return_state=True also
# returns the final hidden and cell states so decoding can continue step by step.
decoder_LSTM = LSTM(256, return_sequences=True, return_state=True)
# The decoder's initial states are explicit model inputs. Their width (256)
# matches the LSTM width used for the encoder and decoder.
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, initial_state=decoder_input_states)

decoder_states = [decoder_h, decoder_c]
# Softmax over the Hindi character vocabulary for every timestep.
decoder_dense = Dense(len(hindi_chars), activation='softmax')

decoder_out = decoder_dense(decoder_out)

# Inputs: [one-hot decoder input, state h, state c].
# Outputs: [per-character probabilities, new state h, new state c].
decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states, outputs=[decoder_out] + decoder_states)

In [6]:
def decoder(input_seq):
    """Greedily decode one encoded English word into Hindi characters.

    The encoder's final states seed the decoder. Starting from the '\t' start
    marker, the most probable character is picked at every step and fed back
    in as the next decoder input.

    Args:
        input_seq: one-hot encoded input word, passed straight to
            ``encoder_model_inf.predict``.

    Returns:
        The decoded string. It ends with '\n' when the model emitted the end
        marker; otherwise decoding stops once the output is longer than
        ``max_len_hindi_words``.
    """
    vocab_size = len(hindi_chars)

    def one_hot(index):
        # Single-timestep decoder input with only `index` switched on.
        step = np.zeros((1, 1, vocab_size))
        step[0, 0, index] = 1
        return step

    # The initial decoder states come from the encoder.
    states_val = encoder_model_inf.predict(input_seq)
    target_seq = one_hot(hindi_words_char_to_index_dict['\t'])

    decoded_chars = []
    while True:
        char_probs, state_h, state_c = decoder_model_inf.predict(x=[target_seq] + states_val)

        best_index = np.argmax(char_probs[0, -1, :])
        next_char = hindi_words_index_to_char_dict[best_index]
        decoded_chars.append(next_char)

        # Stop on the end marker, or when the output outgrows the longest known word.
        if next_char == '\n' or len(decoded_chars) > max_len_hindi_words:
            break

        # Feed the chosen character and the updated states into the next step.
        target_seq = one_hot(best_index)
        states_val = [state_h, state_c]

    return ''.join(decoded_chars)


In [7]:
# Restore the saved weights into the encoder and decoder inference models.
for _model, _weights_path in (
    (encoder_model_inf, "hinglish_hindi/encoder_inference_weight.h5"),
    (decoder_model_inf, "hinglish_hindi/decoder_inference_weight.h5"),
):
    _model.load_weights(_weights_path)

In [8]:
def transliterate(inpt):
    """Transliterate Latin-script text into Hindi script, one word at a time.

    Each whitespace-separated word is one-hot encoded with the English
    character vocabulary, decoded with ``decoder``, and stripped of its
    boundary whitespace. The results are joined with single spaces.

    Args:
        inpt: text to transliterate.

    Returns:
        The transliterated words joined by single spaces; an empty string when
        ``inpt`` contains no words.

    Raises:
        KeyError: if a word contains a character that is not in
            ``english_words_char_to_index_dict``.
    """
    vocab_size = len(english_chars)
    trans = []

    # split() without an argument drops the empty tokens that repeated, leading
    # or trailing whitespace would otherwise produce. Those empty tokens used to
    # be decoded from an all-zero input into spurious output.
    for word in inpt.split():
        # Pad up to the longest word in the loaded data, but grow the buffer for
        # longer inputs so indexing cannot run past the end. The encoder input
        # accepts any number of timesteps.
        n_steps = max(max_len_english_words, len(word))
        one_hot = np.zeros(shape=(1, n_steps, vocab_size), dtype='float32')
        for k, ch in enumerate(word):
            one_hot[0, k, english_words_char_to_index_dict[ch]] = 1

        trans.append(decoder(one_hot).strip())

    return ' '.join(trans)

In [9]:
# Quick check: transliterate a two-word Latin-script phrase.
transliterate('chalo chalein')

'चलो चलें'

In [10]:
# fastText model used to embed the transliterated sentences. The location can
# be overridden with the FASTTEXT_HI_MODEL_PATH environment variable so the
# notebook is not tied to one machine; the original path remains the default.
ft_model = fasttext.load_model(
    os.environ.get('FASTTEXT_HI_MODEL_PATH', '/home/pretrained_models/cc.hi.300.bin')
)




In [11]:
# Maps the class index predicted by the sentiment model to a readable label.
res_dict = {
    0: 'negative',
    2: 'positive',
    1: 'neutral',
}

In [13]:
# Saved Keras sentiment classifier. The location can be overridden with the
# SENTIMENT_MODEL_PATH environment variable so the notebook is not tied to one
# machine; the original path remains the default.
sent_model = tf.keras.models.load_model(
    os.environ.get('SENTIMENT_MODEL_PATH', '/home/apna_time/2020-02-13_19_58_01.496519')
)

In [29]:
def get_sentiment(s):
    """Return the sentiment label for a piece of Latin-script text.

    The text is transliterated with ``transliterate``, embedded as one
    fastText sentence vector, and scored by ``sent_model``. The label of the
    highest-scoring class is looked up in ``res_dict``.

    Args:
        s: text to classify.

    Returns:
        One of the label strings stored in ``res_dict``.
    """
    hindi_text = transliterate(s)

    # Add a leading batch axis: the classifier is given a single-row batch.
    sentence_vector = ft_model.get_sentence_vector(hindi_text)
    batch = np.array(sentence_vector[np.newaxis, ...])

    class_scores = sent_model.predict(batch)
    return res_dict[np.argmax(class_scores)]

In [30]:
# Quick check: classify the sentiment of a short Latin-script phrase.
get_sentiment('bohot accha hai')

'positive'