In [0]:
import io
import string
from pickle import load
from numpy import array

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [6]:
data_path = "hin.txt"
lines = io.open(data_path, encoding = "utf-8").read().split("\n")
lines  = lines[:-1]
lines = [line.split("\t") for line in lines]
print(lines[100])

['I have a dog.', 'मेरे पास एक कुत्ता है।']


In [0]:
import numpy as np
def clean_pairs(lines):
    cleaned = list()
    for pair in lines:
        clean_pair = list()
        for line in pair:
            line.split()
            line = [word.lower() for word in line]
            clean_pair.append(''.join(line))
        cleaned.append(clean_pair)
    return np.array(cleaned)

In [0]:
l = clean_pairs(lines)
table = str.maketrans('', '', string.punctuation)
l = [[w[0].translate(table), w[1].translate(table)] for w in l]
l = np.array(l)

In [0]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
	return max(len(line.split()) for line in lines)

def encode_sequences(tokenizer, length, lines):
	X = tokenizer.texts_to_sequences(lines)
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

In [10]:
x = []
for i in l[:,0]:
    for j in i.split():
        if j not in x:
            x.append(j)
print(len(x))

2397


In [11]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(l[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(l[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
hindi_tokenizer = create_tokenizer(l[:, 1])
hindi_vocab_size = len(hindi_tokenizer.word_index) + 1
hindi_length = max_length(l[:, 1])
print('Hindi Vocabulary Size: %d' % hindi_vocab_size)
print('Hindi Max Length: %d' % (hindi_length))

English Vocabulary Size: 2398
English Max Length: 22
Hindi Vocabulary Size: 3030
Hindi Max Length: 25


In [0]:
train_length = 2500
trainX = encode_sequences(eng_tokenizer, eng_length, l[:train_length][:, 0])
trainY = encode_sequences(hindi_tokenizer, hindi_length, l[:train_length][:, 1])
trainY = encode_output(trainY, hindi_vocab_size)
# prepare validation data
testX = encode_sequences(eng_tokenizer, eng_length, l[train_length:][:, 0])
testY = encode_sequences(hindi_tokenizer, hindi_length, l[train_length:][:, 1])
testY = encode_output(testY, hindi_vocab_size)

In [13]:
print(trainX.shape)
print(trainY.shape)
print(testX.shape)
print(testY.shape)

(2500, 22)
(2500, 25, 3030)
(369, 22)
(369, 25, 3030)


In [14]:
print(trainX[110])
print(l[110])

[25  5 13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
['what is this' 'यह क्या है']


In [15]:
print(trainY.shape)
index=5
print(trainY[100][index])
for i,x in enumerate(trainY[100][index]):
    if(x==1):
        print(i,end=" ")
print(l[100])

(2500, 25, 3030)
[1. 0. 0. ... 0. 0. 0.]
0 ['i have a dog' 'मेरे पास एक कुत्ता है।']


In [16]:
# define model
model = define_model(eng_vocab_size, hindi_vocab_size, eng_length, hindi_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)
model.fit(trainX, trainY, epochs=200, validation_data=(testX, testY))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 22, 256)           613888    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 25, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 25, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 25, 3030)          778710    
Total params: 2,443,222
Trainable params: 2,443,222
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2500 samples, validate on 369 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200


<keras.callbacks.callbacks.History at 0x7f36f97653c8>

In [0]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, hindi_tokenizer, source)
		raw_src, raw_target = raw_dataset[i]
		if i < 30:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	# print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	# print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	# print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [18]:
print('train')
evaluate_model(model, hindi_tokenizer, trainX, l[:train_length])
# test on some test sequences
print('test')
evaluate_model(model, hindi_tokenizer, testX, l[train_length:])

train
src=[help], target=[बचाओ], predicted=[बचाओ]
src=[jump], target=[उछलो], predicted=[छलांग]
src=[jump], target=[कूदो], predicted=[छलांग]
src=[jump], target=[छलांग], predicted=[छलांग]
src=[hello], target=[नमस्ते।], predicted=[नमस्ते।]
src=[hello], target=[नमस्कार।], predicted=[नमस्ते।]
src=[cheers], target=[वाहवाह], predicted=[वाहवाह]
src=[cheers], target=[चियर्स], predicted=[वाहवाह]
src=[got it], target=[समझे कि नहीं], predicted=[समझे कि नहीं]
src=[im ok], target=[मैं ठीक हूँ।], predicted=[मैं ठीक हूँ।]
src=[awesome], target=[बहुत बढ़िया], predicted=[बहुत बढ़िया]
src=[come in], target=[अंदर आ जाओ।], predicted=[अंदर आ जाओ।]
src=[get out], target=[बाहर निकल जाओ], predicted=[बाहर निकल जाओ]
src=[go away], target=[चले जाओ], predicted=[चले जाओ]
src=[goodbye], target=[ख़ुदा हाफ़िज़।], predicted=[ख़ुदा हाफ़िज़।]
src=[perfect], target=[उत्तम], predicted=[उत्तम]
src=[perfect], target=[सही], predicted=[उत्तम]
src=[welcome], target=[आपका स्वागत है।], predicted=[आपका स्वागत है।]
src=[welcome], t

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [19]:
!pip install youtube_transcript_api

Collecting youtube_transcript_api
  Downloading https://files.pythonhosted.org/packages/21/81/c4ae5534b113f4938b482f360babbbe6fda550441a4af8e1007dba518586/youtube_transcript_api-0.3.1-py3-none-any.whl
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.3.1


In [0]:
from youtube_transcript_api import YouTubeTranscriptApi
id1='liJVSwOiiwg'
id2='ghYIKh9F5VE'
a=YouTubeTranscriptApi.get_transcript(id2,languages=['en'])

In [50]:
transcript_list = YouTubeTranscriptApi.list_transcripts(idx)

# iterate over all available transcripts
for transcript in transcript_list:

    # the Transcript object provides metadata properties
    print(
        transcript.video_id,
        transcript.language,
        transcript.language_code,
        # whether it has been manually created or generated by YouTube
        transcript.is_generated,
        # whether this transcript can be translated or not
        transcript.is_translatable,
        # a list of languages the transcript can be translated to
        transcript.translation_languages,
    )

    # fetch the actual transcript data
    print(transcript.fetch())

    # translating the transcript will return another transcript object
    print(transcript.translate('en').fetch())

liJVSwOiiwg English (auto-generated) en True True [{'language': 'Afrikaans', 'language_code': 'af'}, {'language': 'Albanian', 'language_code': 'sq'}, {'language': 'Amharic', 'language_code': 'am'}, {'language': 'Arabic', 'language_code': 'ar'}, {'language': 'Armenian', 'language_code': 'hy'}, {'language': 'Azerbaijani', 'language_code': 'az'}, {'language': 'Bangla', 'language_code': 'bn'}, {'language': 'Basque', 'language_code': 'eu'}, {'language': 'Belarusian', 'language_code': 'be'}, {'language': 'Bosnian', 'language_code': 'bs'}, {'language': 'Bulgarian', 'language_code': 'bg'}, {'language': 'Burmese', 'language_code': 'my'}, {'language': 'Catalan', 'language_code': 'ca'}, {'language': 'Cebuano', 'language_code': 'ceb'}, {'language': 'Chinese (Simplified)', 'language_code': 'zh-Hans'}, {'language': 'Chinese (Traditional)', 'language_code': 'zh-Hant'}, {'language': 'Corsican', 'language_code': 'co'}, {'language': 'Croatian', 'language_code': 'hr'}, {'language': 'Czech', 'language_cod

In [51]:
testy = []
for i in a:
    testy.append([i['text'],'Hindi target unkown'])
testy[0]

['Happy Thanksgiving', 'Hindi target unkown']

In [0]:
testX = encode_sequences(eng_tokenizer, eng_length, np.array(testy)[:,0])

In [53]:
evaluate_model(model, hindi_tokenizer, testX, testy)

src=[Happy Thanksgiving], target=[Hindi target unkown], predicted=[समझे हो]
src=[thanks God will I'm so glad that you], target=[Hindi target unkown], predicted=[बाएं आपकी तुम्हारे तुम्हारे खुश कैफ़े]
src=[came you just look great], target=[Hindi target unkown], predicted=[तुम किताब गया दिन आता]
src=[mr. loss like 150 pounds yeah I'm gonna], target=[Hindi target unkown], predicted=[मुझे बस आ करते है।]
src=[be one the Subway sandwich commercials], target=[Hindi target unkown], predicted=[मैं औरडर चला चला हूँ।]
src=[okay alright alright snow fat no sugar], target=[Hindi target unkown], predicted=[मुझे जल्दी नहीं है।]
src=[it's no dairy it's no good throw it out], target=[Hindi target unkown], predicted=[नहीं नहीं में नहीं नहीं करता।]
src=[you're gonna meet some people says that], target=[Hindi target unkown], predicted=[इस उसकी हमारे सारे जो नहीं नहीं]
src=[this is my husband Chandler Chandler], target=[Hindi target unkown], predicted=[यह मेरा पति है।]
src=[this is will oh hey I'd shake y

In [43]:

predict_sequence(model, hindi_tokenizer, testX[0].reshape((1, testX[2].shape[0])))

''