# Installing necessary libraries

In [None]:
# fasttext is imported below; CNN_subword_embeddings uses it to load wiki.ur.bin.
!pip install fasttext

In [None]:
# urduhack provides normalize_characters, used by CNN_subword_embeddings.preprocessing.
!pip install urduhack[tf]

# Importing modules and mounting drive

In [None]:
import ast
import keras
import pickle
import fasttext
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import models
from keras import layers
from keras.utils import plot_model
from google.colab import drive
from urduhack.normalization import normalize_characters

In [None]:
import re
import io
import ast
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, Embedding, GRU, Dropout, Bidirectional, SpatialDropout1D,TimeDistributed,LSTM

In [None]:
# Mount Google Drive; every model/dictionary file loaded later in this notebook
# is read from /content/drive/MyDrive/.
drive.mount("/content/drive/", force_remount=True)

Mounted at /content/drive/


# CNN Model Class

In [None]:
class CNN_subword_embeddings:
  """Turn a word into one 300-d vector by running a multi-width CNN over its fastText subword vectors.

  Pipeline (see get_embedding): normalise the word, look up one fastText vector
  per subword, pad/truncate to ``max_len`` rows, and push the resulting
  (1, max_len, 300) array through the network returned by ``get_model``.

  NOTE(review): nothing in this class trains the network or loads weights for
  it, so the projection uses whatever the layers' default initialisers produce.
  The network is therefore built once per instance and reused, so that repeated
  calls on the same instance map the same word to the same vector.
  """

  EMB_DIM = 300  # width of one fastText vector, and of each zero padding row

  def __init__(self):
    self.max_len = 34  # number of subword rows fed to the CNN
    self.ft = fasttext.load_model('/content/drive/MyDrive/wiki.ur.bin')
    self._model = None  # built lazily on the first get_embedding call

  def preprocessing(self, word):
    """Normalise characters, then strip zero-width non-joiners (U+200C) and spaces."""
    normalized = normalize_characters(u'{}'.format(word))
    return normalized.replace('\u200c', '').replace(' ', '')

  def get_fasttext_embeddings(self, word):
    """Return a list with one fastText vector per subword of ``word``."""
    subwords = self.ft.get_subwords(word)[0]  # [0] = subword strings, [1] = their ids
    return [self.ft.get_word_vector(subword) for subword in subwords]

  def pad_seq(self, fasttext):
    """Return a new list of exactly ``max_len`` rows.

    Shorter inputs are right-padded with zero rows. Longer inputs are cut to
    their first ``max_len`` rows; without this the array handed to the model
    would not match the (max_len, EMB_DIM) input shape declared in get_model.
    The argument itself is left unmodified.
    """
    rows = list(fasttext[:self.max_len])
    missing = self.max_len - len(rows)
    if missing > 0:
      rows += [[0] * self.EMB_DIM for _ in range(missing)]
    return rows

  def get_model(self):
    """Build a fresh (uncompiled) Keras model: parallel Conv1D branches -> max-pool -> Dense(300).

    One branch per kernel width 1..7, each with 150 ReLU filters. The pool size
    equals the length of the convolution output (max_len - kw + 1), so every
    branch is reduced to a single time step before the branches are joined.
    """
    n = 150                    # number of filters per branch
    k = (1, 2, 3, 4, 5, 6, 7)  # kernel widths, one parallel branch each
    input_shape = layers.Input(shape=(self.max_len, self.EMB_DIM))
    conv_layers = []
    for kw in k:
      c = layers.Conv1D(n, kw, activation="relu")(input_shape)
      conv_layers.append(layers.MaxPool1D(pool_size=self.max_len - kw + 1)(c))
    merged = layers.concatenate(conv_layers, axis=1)
    merged = layers.Flatten()(merged)
    out = layers.Dense(300)(merged)
    return models.Model(input_shape, out)

  def get_embedding(self, word):
    """Return the network output for ``word`` as a tensor (the final layer is Dense(300))."""
    word = self.preprocessing(word)
    vectors = self.pad_seq(self.get_fasttext_embeddings(word))

    x = np.stack(vectors)
    x = x.reshape(1, x.shape[0], x.shape[1])  # add the batch axis

    # Build and compile once per instance instead of on every call.
    model = getattr(self, '_model', None)
    if model is None:
      model = self.get_model()
      model.compile(optimizer=keras.optimizers.Adam(), loss='mean_squared_error', metrics=[keras.metrics.Accuracy()])
      self._model = model

    return tf.convert_to_tensor(model.predict(x)[0])


In [None]:
# Smoke test: instantiate the CNN embedder (loads the fastText model from Drive)
# and embed a single word.
cnn = CNN_subword_embeddings()
word_ = cnn.get_embedding('شاداب')





In [None]:
# The network ends in Dense(300), so the embedding is expected to have 300 entries.
len(word_)

300

# Phonological Embeddings class

In [None]:
class Phonological_embeddings:
  """Split a romanised word into sub-word chunks, encode them as ids and embed them.

  The chunking rules live in ``get_subwords``; ``encode_word`` maps chunks to
  ids via the pickled ``wordDict`` and pads to five ids; ``get_embedding`` feeds
  those ids through the first layer of a pickled Keras model.
  """

  def __init__(self):
    self.splits = ['e', 'o', 'a', 'i', 'u']  # single vowels
    self.splits_b = ['aa', 'ii', 'uu']       # doubled vowels
    self.splits_c = ['e', 'o', 'i', 'u']     # not read by any method in this class
    self.consonants = ['ch','kh','sh','gh','bh','ph','th','jh','Th','chh','dh','ddh','rhh','b','p','t','j','s','d','z','r','f','k','g','l','m','n','h','w']
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open('/content/drive/MyDrive/word_dict.csv', 'rb') as dict_file:
      self.wordDict = pickle.load(dict_file)
    self._model = None  # filled by return_model on first use

  def get_subwords(self, token):
    """Return the list of chunks ``token`` is cut into.

    * up to 3 characters: the token is returned as a single chunk;
    * exactly 4 characters: scan adjacent character pairs;
    * 5+ characters: scan every position, looking at 3-char consonant clusters,
      doubled vowels and single vowels in that order.

    ``start`` is the index where the next chunk begins; a chunk is only emitted
    when the candidate slice is longer than one character.

    NOTE(review): characters after the last cut are never appended, so the
    returned chunks do not always cover the whole token.
    """
    pieces = []
    start = 0

    if len(token) <= 3:
      pieces.append(token)

    elif len(token) == 4:
      for i in range(0, len(token) - 1):
        if token[i] in self.consonants and token[i+1] in self.consonants:
          if len(token[start:i+1]) > 1:
            pieces.append(token[start:i+1])
            start = i + 1

        elif token[i] in self.splits and token[i+1] in self.consonants:
          if len(token[start:i+1]) > 1:
            pieces.append(token[start:i+1])
            start = i + 1

        else:
          # Any other pair: emit everything from `start` to the end of the token.
          pieces.append(token[start:len(token)])
          start = i + 2

    else:
      for i in range(0, len(token)):
        # Three-character consonant cluster that is not at the very end.
        if token[i:i+3] in self.consonants and i + 3 < len(token):
          if i + 3 == len(token) - 1:
            if len(token[start:i+4]) > 1:
              pieces.append(token[start:i+4])
              start = i + 4

          else:
            if len(token[start:i+5]) > 1:
              pieces.append(token[start:i+5])
              start = i + 5

        # Doubled vowel that is not at the very end.
        elif token[i:i+2] in self.splits_b and i + 2 < len(token):
          if token[i+2] in self.consonants and i + 2 == len(token) - 1:
            if len(token[start:i+3]) > 1:
              pieces.append(token[start:i+3])
              start = i + 3

          else:
            if len(token[start:i+2]) > 1:
              pieces.append(token[start:i+2])
              start = i + 2

        # Single vowel: the cut point depends on what follows it.
        elif token[i] in self.splits:
          if token[i+1:i+4] in self.consonants and i + 5 == len(token):
            if len(token[start:i+1]) > 1:
              pieces.append(token[start:i+1])
              start = i + 1

          elif token[i+1:i+3] in self.consonants:
            if len(token[start:i+3]) > 1:
              pieces.append(token[start:i+3])
              start = i + 3

          elif token[i+1:i+2] in self.consonants and token[i+2:i+3] in self.consonants:
            # NOTE(review): the guard measures token[start:i+1] but the chunk
            # emitted is token[start:i+2]; kept as in the original - confirm intent.
            if len(token[start:i+1]) > 1:
              pieces.append(token[start:i+2])
              start = i + 2

          elif i + 2 == len(token):
            if len(token[start:i+2]) > 1:
              pieces.append(token[start:i+2])
              start = i + 2

          else:
            if len(token[start:i+1]) > 1:
              pieces.append(token[start:i+1])
              start = i + 1

    return pieces

  def calculate_prosody(self, token):
    """Return ``[n, b]``: two parallel code lists with one entry per chunk in ``token``.

    ``token`` is a list of chunks (as produced by get_subwords). Each chunk gets
    either (3, 110) or (2, 10).
    NOTE(review): presumably these encode long vs. short syllable weight - confirm.
    """
    n = []
    b = []

    for i in range(0, len(token)):
      chunk = token[i]
      if len(chunk) >= 3:
        n.append(3)
        b.append(110)

      elif len(chunk) == 1:
        n.append(2)
        b.append(10)

      else:
        # Two-character chunk.
        vowel_then_consonant = chunk[0:1] in self.splits and chunk[1:2] in self.consonants
        consonant_then_vowel = chunk[1:2] in self.splits and chunk[0:1] in self.consonants
        if chunk[0:2] in self.splits_b:
          n.append(3)
          b.append(110)

        # A vowel/consonant pair gets the (3, 110) code only when it is not the last chunk.
        elif (vowel_then_consonant or consonant_then_vowel) and i < len(token) - 1:
          n.append(3)
          b.append(110)

        else:
          n.append(2)
          b.append(10)

    return [n, b]

  def get_datavlues(self, token):
    """Return ``(chunks, num_pros, bin_pros)`` for one word."""
    chunks = self.get_subwords(token)
    # Prosody is computed once and unpacked; both code lists come from the same pass.
    num_pros, bin_pros = self.calculate_prosody(chunks)
    return chunks, num_pros, bin_pros

  def encode_word(self, word):
    """Return ``(ids, num_pros)``, each padded on the right to at least five entries.

    A chunk contributes ``wordDict[j][0]`` for every ``j`` in ``range(len(wordDict))``
    whose ``wordDict[j][1]`` equals the chunk; chunks with no match contribute
    nothing. Ids are padded with the last key of ``wordDict``, prosody codes with 0.
    Longer results are not truncated.
    """
    token, num_pros, bin_pros = self.get_datavlues(word)
    a = []

    for t in token:
      for j in range(0, len(self.wordDict)):
        if t == self.wordDict[j][1]:
          a.append(self.wordDict[j][0])

    # padding
    maxlen = 5

    if len(a) < maxlen:
      a += [list(self.wordDict)[-1]] * (maxlen - len(a))

    if len(num_pros) < maxlen:
      num_pros += [0] * (maxlen - len(num_pros))
    return a, num_pros

  def return_model(self):
    """Return the pickled model from Drive, unpickling it only on the first call.

    NOTE(review): the commented-out definition that used to sit here built
    Embedding(vocab, 300, input_length=5) -> Bidirectional(GRU(256)) ->
    Dropout(0.2) -> TimeDistributed(Dense(4, softmax)); presumably the pickled
    model has that architecture - verify against the training notebook.
    NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    """
    model = getattr(self, '_model', None)
    if model is None:
      with open('/content/drive/MyDrive/model.h5', 'rb') as model_file:
        model = pickle.load(model_file)
      self._model = model
    return model

  def get_embedding(self, word):
    """Return the output of the model's first layer for the encoded ``word``."""
    subwords, num_pros = self.encode_word(word)
    model = self.return_model()
    x_train = np.array(subwords).astype(np.float32)
    # Sub-model that stops after layer 0, so the call yields that layer's activations.
    intermediate_layer_model = keras.Model(inputs=model.input,
                                           outputs=model.layers[0].output)
    intermediate_output = intermediate_layer_model(x_train)
    return intermediate_output


# B/F LM Class

In [None]:
# Load the id -> label mapping; the context manager closes the file once it is read.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/MyDrive/labels_dict.sav','rb') as labels_file:
  label_dict = pickle.load(labels_file)
# Find which id(s) map to the label 'و'.
key = [label_id for label_id, val in label_dict.items() if val=='و']
key

[3170]

In [None]:
# bflm = BFLM()

In [None]:
class BFLM:
  """Backward/forward language-model pair that predicts a neighbouring word label."""

  VEC_DIM = 600  # width of one word vector and of each zero padding row

  def __init__(self):
    self.max_len = 16  # number of rows handed to either model
    self.backward_model = tf.keras.models.load_model('/content/drive/MyDrive/backward_model130.h5')
    self.forward_model = tf.keras.models.load_model('/content/drive/MyDrive/forward_model2.h5')
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open('/content/drive/MyDrive/labels_dict.sav','rb') as labels_file:
      self.label_dict = pickle.load(labels_file)

  def pad_sequence(self, word_seq):
    """Return a new list of exactly ``max_len`` numpy rows built from ``word_seq``.

    Short sequences are right-padded with zero rows. Sequences that already
    have ``max_len`` rows or more used to fall through and return ``None``
    (which then crashed ``predict``); they are now cut to their last
    ``max_len`` rows.
    NOTE(review): keeping the most recent rows is an assumption - confirm it
    matches how the models were trained.
    The caller's list is not modified.
    """
    rows = list(word_seq)[-self.max_len:]
    rows += [[0] * self.VEC_DIM] * (self.max_len - len(rows))
    return [np.array(w) for w in rows]

  def predict(self, word_seq, backward=False):
    """Return the label predicted for ``word_seq``.

    Expects ``word_seq`` as a sequence of 600-d word vectors, shape (len, 600).
    ``backward=True`` selects the backward model, otherwise the forward one.
    The arg-max of the model output is looked up in ``label_dict``.
    """
    model = self.backward_model if backward else self.forward_model
    padded = np.array(self.pad_sequence(word_seq))
    x = padded.reshape(1, padded.shape[0], padded.shape[1])  # add the batch axis
    res = np.argmax(model.predict(x))
    return self.label_dict[res]

# Driver Code

In [None]:
def add_subwords_together(phonological_embedding):
  """Return the element-wise sum of the first five rows of ``phonological_embedding``.

  The original body read the global ``phonological_embeddings`` instead of its
  own parameter, so it only worked when a global of that name happened to hold
  the same value. Rows beyond index 4 are ignored; fewer than five rows raises
  the indexing error of the underlying container.
  """
  total = phonological_embedding[0]
  for row in range(1, 5):
    total = total + phonological_embedding[row]
  return total

In [None]:
# Urdu word -> romanised spelling lookup used by the generation cells.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/MyDrive/Urdu_to_roman.sav','rb') as mapping_file:
  urdu_to_roman = pickle.load(mapping_file)


In [None]:
# Load the backward and forward models once; the generation cells below reuse this instance.
bflm = BFLM()

In [None]:
# word = ['ہم', 'نشیں', 'مت', 'کہہ', 'کہ', 'برہم', 'کر', 'نہ', 'بزم', 'عیش', 'دوست', 'تو', 'میرے','غیر', 'ہے', 'قطع', 'لباس', 'خانہ','راز', 'نالہ', 'ہوں', 'کہ', 'بہ', 'شرح', 'نگاۂ','فہم', 'زنجیری', 'بے', 'ربطی', 'دل', 'نالے', 'کو', 'بھی', 'اعتبار', 'نغمہ', 'ہے']
word = ['دوست']
# Each direction makes at most MAX_STEPS predictions and keeps at most MAX_STEPS - 1 words.
MAX_STEPS = 4
verses=[]

# Build the embedders once: each constructor loads a large file from Drive, so
# doing this per seed word only repeats the same load.
cnn = CNN_subword_embeddings()
phn = Phonological_embeddings()

for seed_word in word:
  roman_word = urdu_to_roman[seed_word]
  cnn_embeddings = cnn.get_embedding(seed_word)

  # phn.get_embedding is expected to return one 300-d row per sub-word slot, shape (5, 300).
  phonological_embeddings = phn.get_embedding(roman_word)
  phonological_embeddings = add_subwords_together(phonological_embeddings)
  # One word vector = CNN embedding followed by the summed phonological embedding.
  concat_vector = [tf.concat([cnn_embeddings,phonological_embeddings], axis=0).numpy().tolist()]
  first_verse = [seed_word]

  # Getting previous words
  for i in range(1, MAX_STEPS + 1):
    new_word = bflm.predict(list(concat_vector), backward=True)
    if new_word == '<SOS>' or i == MAX_STEPS:
      break
    first_verse.append(new_word)
    cnn_embeddings = cnn.get_embedding(new_word)
    phonological_embeddings = phn.get_embedding(urdu_to_roman[new_word])
    phonological_embeddings = add_subwords_together(phonological_embeddings)
    concat_vector.append(tf.concat([cnn_embeddings,phonological_embeddings], axis=0).numpy().tolist())

  # At this point concat_vector holds the sentence from the seed word backwards;
  # flip both lists so they read from the start of the verse to the seed word.
  concat_vector.reverse()
  first_verse.reverse()

  # Getting next words
  for i in range(1, MAX_STEPS + 1):
    new_word = bflm.predict(list(concat_vector))
    if new_word == '<EOS>' or i == MAX_STEPS:
      break
    first_verse.append(new_word)
    cnn_embeddings = cnn.get_embedding(new_word)
    phonological_embeddings = phn.get_embedding(urdu_to_roman[new_word])
    phonological_embeddings = add_subwords_together(phonological_embeddings)
    # Store a plain list, as the backward loop does, so concat_vector stays homogeneous.
    concat_vector.append(tf.concat([cnn_embeddings,phonological_embeddings], axis=0).numpy().tolist())

  verses.append(first_verse)

for seed, verse in zip(word, verses):
  print(seed)
  print(verse)




دوست
['کلاہیں', 'ہشیاری', 'دل', 'دوست', 'کرتے', 'بناؤ', 'تنگ']


In [None]:
# Rebuild the one-word starting state for the step-by-step cells below.
# Relies on `cnn`, `seed_word` and `roman_word` left behind by the driver cell.
phn = Phonological_embeddings()
# Recompute the CNN embedding for the seed word: after the driver loop the
# `cnn_embeddings` variable holds the embedding of the last generated word.
cnn_embeddings = cnn.get_embedding(seed_word)
phonological_embeddings = phn.get_embedding(roman_word) # expected to return embeddings in (5,300) form
phonological_embeddings = add_subwords_together(phonological_embeddings)
concat_vector = [tf.concat([cnn_embeddings,phonological_embeddings], axis=0).numpy().tolist()]
first_verse = [seed_word]

In [None]:
# Debug check: entries of concat_vector are plain Python lists (after .numpy().tolist()).
print(type(concat_vector[0]))

<class 'list'>


In [None]:
# Each direction makes at most STEP_LIMIT predictions and keeps at most STEP_LIMIT - 1 words.
STEP_LIMIT = 5

# Getting previous words
for i in range(1, STEP_LIMIT + 1):
  new_word = bflm.predict(list(concat_vector), backward=True)
  if new_word == '<SOS>' or i == STEP_LIMIT:
    break
  first_verse.append(new_word)
  cnn_embeddings = cnn.get_embedding(new_word)
  phonological_embeddings = phn.get_embedding(urdu_to_roman[new_word]) # expected to return embeddings in (5,300) form
  phonological_embeddings = add_subwords_together(phonological_embeddings)
  concat_vector.append(tf.concat([cnn_embeddings,phonological_embeddings], axis=0).numpy().tolist())

# At this point concat_vector holds the sentence from the seed word backwards;
# flip both lists so they read from the start of the verse to the seed word.
concat_vector.reverse()
first_verse.reverse()

# Getting next words
for i in range(1, STEP_LIMIT + 1):
  new_word = bflm.predict(list(concat_vector))
  if new_word == '<EOS>' or i == STEP_LIMIT:
    break
  first_verse.append(new_word)
  cnn_embeddings = cnn.get_embedding(new_word)
  phonological_embeddings = phn.get_embedding(urdu_to_roman[new_word]) # expected to return embeddings in (5,300) form
  phonological_embeddings = add_subwords_together(phonological_embeddings)
  # Store a plain list, as the backward loop does, so concat_vector stays homogeneous.
  concat_vector.append(tf.concat([cnn_embeddings,phonological_embeddings], axis=0).numpy().tolist())





نشہ

In [None]:
# Show the verse assembled by the generation cell above.
first_verse

['کے', 'مرے', 'میں', 'تر', 'دوست', 'شرر', 'دیکھنے', 'جانا', 'دیکھ']

In [None]:
# NOTE(review): repeats the `first_verse` display above; candidate for removal.
first_verse

['کے', 'مرے', 'میں', 'تر', 'دوست', 'شرر', 'دیکھنے', 'جانا', 'دیکھ']

کمال

In [None]:
# NOTE(review): repeats the `first_verse` display above; candidate for removal.
first_verse

['کے', 'مرے', 'میں', 'تر', 'دوست', 'شرر', 'دیکھنے', 'جانا', 'دیکھ']

دِل

In [None]:
# NOTE(review): repeats the `first_verse` display above; candidate for removal.
first_verse

['کے', 'مرے', 'میں', 'تر', 'دوست', 'شرر', 'دیکھنے', 'جانا', 'دیکھ']

زخم

In [None]:
# NOTE(review): repeats the `first_verse` display above; candidate for removal.
first_verse

['کے', 'مرے', 'میں', 'تر', 'دوست', 'شرر', 'دیکھنے', 'جانا', 'دیکھ']

In [None]:
# NOTE(review): repeats the `first_verse` display above; candidate for removal.
first_verse

['کے', 'مرے', 'میں', 'تر', 'دوست', 'شرر', 'دیکھنے', 'جانا', 'دیکھ']

In [None]:
# NOTE(review): repeats the `first_verse` display above; candidate for removal.
first_verse

['کے', 'مرے', 'میں', 'تر', 'دوست', 'شرر', 'دیکھنے', 'جانا', 'دیکھ']

In [None]:
# NOTE(review): repeats the `first_verse` display above; candidate for removal.
first_verse

['کے', 'مرے', 'میں', 'تر', 'دوست', 'شرر', 'دیکھنے', 'جانا', 'دیکھ']

In [None]:
# Render the generated verse as one space-separated string.
' '.join(first_verse)

'کے مرے میں تر دوست شرر دیکھنے جانا دیکھ'