In [None]:
import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense

In [None]:
# Toy network (4-unit SimpleRNN -> 3-way softmax, no biases) built only so that
# summary() can be inspected; it is not trained in this notebook.
model = Sequential([
    SimpleRNN(4, use_bias=False, input_shape=(2, 2)),
    Dense(3, use_bias=False, activation='softmax'),
])
model.summary()

In [None]:
import pandas as pd
import gdown

In [None]:
# Download the trigram data set from Google Drive and save it as 3Gram_love_data.txt.
# NOTE(review): fuzzy=True is presumably what lets gdown resolve the file id from
# this "/view" share link — confirm against the gdown documentation.
gdown.download(url="https://drive.google.com/file/d/1ddCfXSHR5LW2zHO52zIuQd9HP3ArNuLA/view?usp=sharing", output="3Gram_love_data.txt", fuzzy=True)

In [None]:
# Load the tab-separated trigram file; no header row is read, the three
# columns are named here (first, second and third word of each trigram).
column_names = ['w1', 'w2', 'w3']
trigrams = pd.read_csv(
    '3Gram_love_data.txt',
    sep='\t',
    header=None,
    names=column_names,
)
print('shape of the data', trigrams.shape)
print('random sample:\n', trigrams.sample(10))

In [None]:
# Vocabulary: gather the distinct values of every column, then let np.unique
# de-duplicate across columns and sort the result.
unique_words = [
    word
    for column in trigrams.columns.values
    for word in pd.unique(trigrams[column])
]
unique_words = np.unique(unique_words)

print('count of unique words:', len(unique_words))
print('unique word list:', unique_words)

In [None]:
# Two lookup tables over the sorted vocabulary:
#   word_indices  : word  -> integer index
#   indices_words : index -> word (inverse mapping, used to decode predictions)
word_indices = {word: idx for idx, word in enumerate(unique_words)}
indices_words = dict(enumerate(unique_words))

print("word_indices dictionary\n", word_indices)
print("word_indices.keys\n", word_indices.keys())
print("word_indices.values\n", word_indices.values())
print("\n" + "#"*50)
print("indices_words dictionary\n", indices_words)
print("indices_words.keys\n", indices_words.keys())
print("indices_words.values\n", indices_words.values())

In [None]:
# Encode the two context words (w1, w2) of every trigram as vocabulary indices.
# .copy() makes w1_w2 an independent frame, so the column assignments below do
# not write into a slice of `trigrams` (which raises SettingWithCopyWarning).
w1_w2 = trigrams[['w1', 'w2']].copy()
for i in list(w1_w2.columns.values):
    w1_w2[i] = w1_w2[i].map(word_indices)
# Reshape to (n_trigrams, 2, 1): two time steps, one word index per step.
w1_w2 = np.array(w1_w2)
w1_w2 = np.reshape(w1_w2, (w1_w2.shape[0], 2, 1))
# One-hot encode every index over the full vocabulary.
w1_w2_hot = keras.utils.to_categorical(np.array(w1_w2), num_classes=len(word_indices))
print("word1_word2_onehot shape", w1_w2_hot.shape)

In [None]:
# Target: the third word of each trigram, as an index and then one-hot encoded
# over the same vocabulary as the inputs.
w3 = trigrams['w3'].map(word_indices)
w3_hot = keras.utils.to_categorical(
    w3.to_numpy(),
    num_classes=len(word_indices),
)
print("word3_onehot shape is", w3_hot.shape)

In [None]:
# Report the array dimensions that the word-level model below is sized from:
# axis 1 and axis 2 of the one-hot inputs, and axis 1 of the one-hot targets.
print("time steps", w1_w2_hot.shape[1])
print("input nodes", w1_w2_hot.shape[2])
print("output nodes", w3_hot.shape[1])

In [None]:
# Word-level next-word model: a 30-unit SimpleRNN over the one-hot context
# words, followed by a softmax with one output per column of w3_hot.
model_rnn = Sequential([
    SimpleRNN(30, input_shape=(w1_w2_hot.shape[1], w1_w2_hot.shape[2])),
    Dense(w3_hot.shape[1], activation='softmax'),
])
model_rnn.summary()

In [None]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model_rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train on the full trigram set for 20 epochs; no validation data is passed here.
model_rnn.fit(w1_w2_hot, w3_hot, epochs=20)

In [None]:
def rnn_word_pred(in_text):
    """Print the word `model_rnn` predicts to follow the given context words.

    Parameters
    ----------
    in_text : sequence of str
        Context words in order. Every word must be a key of `word_indices`,
        and the number of words must equal the time-step count of the
        training inputs (`w1_w2_hot.shape[1]`).

    Raises
    ------
    ValueError
        If the number of context words does not match the model's time steps.
    KeyError
        If any word is missing from the vocabulary.
    """
    print("input is", in_text)

    # Validate up front so that failures name the actual problem instead of
    # surfacing as an opaque reshape or dictionary-lookup error.
    n_steps = w1_w2_hot.shape[1]
    if len(in_text) != n_steps:
        raise ValueError(
            f"expected {n_steps} context words, got {len(in_text)}: {in_text!r}"
        )
    unknown = [w for w in in_text if w not in word_indices]
    if unknown:
        raise KeyError(f"words not in vocabulary: {unknown}")

    # Same encoding as the training inputs: indices -> (1, n_steps, 1) -> one-hot.
    encoded = np.array([word_indices[w] for w in in_text]).reshape(1, n_steps, 1)
    encoded = keras.utils.to_categorical(encoded, num_classes=len(word_indices))
    # argmax over the softmax output picks the most probable next word.
    yhat = np.argmax(model_rnn.predict(encoded, verbose=0))
    print("output -->", indices_words[yhat])

In [None]:
# Example: predict the word that follows the two-word context "hate you".
rnn_word_pred(['hate','you'])

In [None]:
# Download the long-sequence corpus from Google Drive and save it locally as
# Long_sequence_3gram.csv (read by the next cell).
gdown.download(url="https://drive.google.com/file/d/1OURGEflZRYGUwCxL-uwgKHBmK6aVQF99/view?usp=drive_link", output="Long_sequence_3gram.csv", fuzzy=True)

In [None]:
# Read the whole corpus into one lowercase string. The `with` block closes the
# file handle as soon as the read finishes instead of leaving it open.
with open('Long_sequence_3gram.csv') as corpus_file:
    longseq = corpus_file.read().lower()
# Peek at two arbitrary slices of the raw text.
print(longseq[495:801])
print(longseq[30615:31000])

In [None]:
# Normalise separators in a single pass: commas become spaces and carriage
# returns are dropped (newlines are kept; they delimit the lines later on).
longseq = longseq.translate(str.maketrans({',': ' ', '\r': None}))
print(longseq[495:801])
print(longseq[30615:31000])

In [None]:
# Character vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(longseq))
print('unique chars \n', chars)
# The newline only separates lines and is never fed to the model, so it is
# removed from the vocabulary.
chars.remove('\n')
print('unique after removing newline \n', chars)
print('overal char count', len(chars))

In [None]:
# Forward (char -> index) and inverse (index -> char) lookup tables.
char_indices = {ch: idx for idx, ch in enumerate(chars)}
print("chars to indices\n", char_indices)
indices_char = dict(enumerate(chars))
print("indices to chars\n", indices_char)

In [None]:
# One entry per corpus line, each terminated with a single space so that the
# last word of a line is followed by a separator character.
data = [line + ' ' for line in longseq.splitlines()]
# Same lines with every character replaced by its vocabulary index.
sentence = [[char_indices[ch] for ch in line] for line in data]

In [None]:
# Spot-check the encoding: print a raw line next to its index sequence.
print(data[0], sentence[0])
# NOTE(review): index 9000 assumes the corpus has more than 9000 lines — confirm.
print(data[9000], sentence[9000])
# `sentence` is built one-to-one from `data`, so both counts should be equal.
print("#data ", len(data))
print("#sentences ", len(sentence))

In [None]:
# Sliding windows over every encoded line: each sample is seq_len consecutive
# character indices and its target is the index that immediately follows.
# Lines with at most seq_len characters contribute no samples.
seq_len = 14
X = [
    line[start:start + seq_len]
    for line in sentence
    for start in range(len(line) - seq_len)
]
y = [
    line[start + seq_len]
    for line in sentence
    for start in range(len(line) - seq_len)
]
len(X), len(y)

In [None]:
# Show the first two raw lines and their index-encoded counterparts.
print("data[0:2]=", data[0:2])
print("sentence[0:2]=", sentence[0:2])

In [None]:
# Print the first 20 (window, next-character) training pairs.
for i in range(20):
    print("X[", i, "]=", X[i], "y[", i, "]", y[i])

In [None]:
# Turn the list of windows into a 2-D index array, append a trailing axis of
# size 1, then one-hot encode every index over the character vocabulary.
X = np.array(X)
X1 = X.reshape(X.shape[0], X.shape[1], 1)
X1 = keras.utils.to_categorical(X1, num_classes=len(char_indices))
print(X1.shape)

In [None]:
# One-hot encode the next-character targets over the character vocabulary.
y1 = keras.utils.to_categorical(
    np.array(y),
    num_classes=len(char_indices),
)
print(y1.shape)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# One-hot vector of the first character of the first training window.
print(X1[0, 0])

In [None]:
# Hold out 20% of the windows for validation. random_state pins the shuffle so
# the split (and every metric computed on it) is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Character-level next-character model.
# SimpleRNN(#hidden_nodes, input_shape=(timesteps, data_dim))
model_rnn2 = Sequential([
    SimpleRNN(16, input_shape=(X_train.shape[1], X_train.shape[2])),
    # One output per vocabulary character; softmax is kept as its own layer.
    Dense(len(char_indices)),
    keras.layers.Activation('softmax'),
])
model_rnn2.summary()

In [None]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model_rnn2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for 20 epochs; the held-out split is passed as validation data, so
# loss/accuracy on it are reported after every epoch.
model_rnn2.fit(X_train, y_train, epochs=20, verbose=1, validation_data=(X_test, y_test))

In [None]:
# Write the trained SimpleRNN weights to disk (weights only, via save_weights).
model_rnn2.save_weights("char_rnn_model_weights_v1.weights.h5")

In [None]:
def prepare_input(in_text, seq_len=14):
    """One-hot encode a text window for the character-level models.

    Parameters
    ----------
    in_text : str
        The window to encode. Strings are lowercased first, because the corpus
        was lowercased before `char_indices` was built.
    seq_len : int, default 14
        Required window length; the default is the length this function
        originally hard-coded.

    Returns
    -------
    The result of `keras.utils.to_categorical` applied to the index array of
    shape (1, seq_len, 1), with `len(char_indices)` classes.

    Raises
    ------
    ValueError
        If `in_text` does not contain exactly `seq_len` characters.
    KeyError
        If `in_text` contains a character missing from `char_indices`.
    """
    if isinstance(in_text, str):
        in_text = in_text.lower()
    # Fail with a message that names the problem rather than an opaque
    # reshape / dictionary-lookup error.
    if len(in_text) != seq_len:
        raise ValueError(
            f"expected {seq_len} characters, got {len(in_text)}: {in_text!r}"
        )
    unknown = sorted({c for c in in_text if c not in char_indices})
    if unknown:
        raise KeyError(f"characters not in vocabulary: {unknown}")

    encoded = np.array([char_indices[c] for c in in_text]).reshape(1, seq_len, 1)
    return keras.utils.to_categorical(encoded, num_classes=len(char_indices))

def complete_pred(in_text, model=None, max_len=20):
    """Greedily extend `in_text` one character at a time until a word ends.

    Parameters
    ----------
    in_text : str
        Seed window; it is passed to `prepare_input`, which requires 14
        characters from the vocabulary.
    model : optional
        Object with a Keras-style `predict(x, verbose=0)` method. Defaults to
        `model_rnn2`, the model this function originally hard-coded.
    max_len : int, default 20
        Generation stops once the returned string is longer than this.

    Returns
    -------
    str
        A single leading space followed by the generated characters.
        Generation stops after the first generated space or when the string
        exceeds `max_len` characters, whichever happens first.
    """
    if model is None:
        model = model_rnn2

    completion = ' '
    while True:
        x = prepare_input(in_text)
        # Greedy decoding: always take the most probable next character.
        pred = np.argmax(model.predict(x, verbose=0))
        next_char = indices_char[pred]
        # Slide the window: drop the oldest character, append the new one.
        in_text = in_text[1:] + next_char
        completion += next_char

        if len(completion) > max_len or next_char == ' ':
            return completion

In [None]:
# Example: complete the 14-character seed with the SimpleRNN model.
complete_pred('of particular ')

In [None]:
from keras.layers import LSTM, Activation

In [None]:
# Same character-level task as the SimpleRNN model, with a 128-unit LSTM as
# the recurrent layer.
model_lstm = Sequential([
    LSTM(128, activation='tanh', input_shape=(X_train.shape[1], X_train.shape[2])),
    # One output per vocabulary character; softmax is kept as its own layer.
    Dense(len(char_indices)),
    Activation('softmax'),
])
model_lstm.summary()

In [None]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for 20 epochs with batch_size=256; no validation data is passed in this call.
model_lstm.fit(X_train, y_train, epochs=20, verbose=1, batch_size=256)
# Write the trained LSTM weights to disk (weights only, via save_weights).
model_lstm.save_weights("char_lstm_model_weights_v2.weights.h5")

In [None]:
def prepare_input1(in_text):
    """One-hot encode a 14-character window for the LSTM model.

    The encoding is identical for both character-level models, so this is a
    thin alias of `prepare_input` (defined earlier in the notebook) rather
    than a second copy of the same code.

    Parameters
    ----------
    in_text : str
        The window to encode; see `prepare_input` for its requirements.

    Returns
    -------
    Whatever `prepare_input(in_text)` returns.
    """
    return prepare_input(in_text)

def complete_pred1(in_text, model=None, max_len=20):
    """Greedily extend `in_text` one character at a time until a word ends.

    Parameters
    ----------
    in_text : str
        Seed window; it is passed to `prepare_input1`, which requires 14
        characters from the vocabulary.
    model : optional
        Object with a Keras-style `predict(x, verbose=0)` method. Defaults to
        `model_lstm`, the model this function originally hard-coded.
    max_len : int, default 20
        Generation stops once the returned string is longer than this.

    Returns
    -------
    str
        A single leading space followed by the generated characters.
        Generation stops after the first generated space or when the string
        exceeds `max_len` characters, whichever happens first.
    """
    if model is None:
        model = model_lstm

    completion = ' '
    while True:
        x = prepare_input1(in_text)
        # Greedy decoding: always take the most probable next character.
        pred = np.argmax(model.predict(x, verbose=0))
        next_char = indices_char[pred]
        # Slide the window: drop the oldest character, append the new one.
        in_text = in_text[1:] + next_char
        completion += next_char

        if len(completion) > max_len or next_char == ' ':
            return completion

In [None]:
# Example: complete the 14-character seed with the LSTM model.
complete_pred1("advice is for ")