# Capital Letters Recognition

In [None]:
import os

import nltk
nltk.download('semcor')
nltk.download('punkt')
nltk.download('perluniprops')
from nltk.corpus import semcor
from nltk.tokenize.moses import MosesDetokenizer
from keras.layers import Embedding, LSTM, GRU, Conv1D, Dense, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import numpy as np

# Detokenization: turning tokens back into sentences.
# A single shared detokenizer instance is reused by the printing helpers below.
MDETOK = MosesDetokenizer()

### Load Brown / Semcor Corpus, select sentences with lots of capitalized words

In [None]:
def filter_titled_sents(sentences, u_case_min_nb=3):
    """Keep the sentences that contain enough title-cased tokens.

    A token counts when ``token.istitle()`` is true. A sentence is kept when
    it holds at least ``u_case_min_nb`` such tokens. The input order is
    preserved and a new list is returned.
    """
    return [
        tokens
        for tokens in sentences
        if sum(1 for tok in tokens if tok.istitle()) >= u_case_min_nb
    ]

In [None]:
# Load the tokenized sentences of the Semcor corpus (each sentence is a
# sequence of token strings).
sents = semcor.sents()  # loading tokenized sentences from Semcor corpus
print("number of sentences: %s" % len(sents))
# Keep only the sentences with at least 3 title-cased tokens.
sents = filter_titled_sents(sents, u_case_min_nb=3)
print("number of sentences after filtering: %s" % len(sents))
print("sample sentence:")
# NOTE(review): assumes more than 10000 sentences survive the filter;
# otherwise this lookup raises IndexError.
print(sents[10000])

In [None]:
# Only sentences of at most MAX_SEQUENCE_LENGTH tokens are kept.
MAX_SEQUENCE_LENGTH = 35
sents = [tokens for tokens in sents if len(tokens) <= MAX_SEQUENCE_LENGTH]
print("number of sentences after filtering: %s" % len(sents))

# Clean the dataset a bit: drop the quote tokens `` and '' from every sentence.
filter_out_toks = set(["``", "''"])
sents = [[tok for tok in tokens if tok not in filter_out_toks] for tokens in sents]

## Word Level Models

In this task we are going to use pre-trained "GloVe" word embeddings that can be downloaded from https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# Lowercased copy of every sentence: this is the input of the word-level models.
sents_lower = [[word.lower() for word in sent] for sent in sents]
# Per-token target: True when the original token is title-cased
# (str.istitle()), False otherwise.
capitalization_sent_tags = [[word.istitle() for word in sent] for sent in sents]
print("tokens anotated based on their first letter case:")
# zip() is lazy on Python 3, so materialize it to actually display the
# (token, tag) pairs; on Python 2 list(zip(...)) is simply the same list.
print(list(zip(sents_lower[-500], capitalization_sent_tags[-500])))

In [None]:
# Vocabulary: every distinct lowercased token that occurs in the dataset.
words = {token for tokens in sents_lower for token in tokens}
print("vocabulary size: %s" % len(words))

In [None]:
# Create a dictionary: an integer index for each word.
# Indexes start at 1 because pad_sequences() fills with 0 by default; starting
# at 0 would make one real word indistinguishable from padding. The embedding
# matrix and the Embedding layers are sized len(dictionary) + 1, so indexes
# 1..len(dictionary) all fit.
dictionary = {word: i for i, word in enumerate(words, start=1)}
# .get() so that a corpus without the word `hello` prints None instead of
# aborting the cell with a KeyError.
print("index of `hello`: %s" % dictionary.get("hello"))

# A mapping from indexes back to words.
idx2word = {i: word for word, i in dictionary.items()}
# Index 0 is the padding value: decode it to an empty token so that padded
# positions contribute nothing when predictions are printed.
idx2word[0] = ""

In [None]:
# Convert every lowercased sentence into the list of its word indexes,
# looked up in `dictionary`.
sequences = [[dictionary[word] for word in sent] for sent in sents_lower]
# Show the first (up to) 10 indexes of the first sentence as a sanity check.
print("sequence of word indexes for each sentence: %s" % sequences[0][:10])

In [None]:
# Pad the sequences with zeros to a common length of MAX_SEQUENCE_LENGTH: we
# need equal lengths for vectorized computations. padding='post' appends the
# zeros after the real tokens.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
# Notebook display of the first padded sequence.
data[0]

In [None]:
# Labels are padded with zeros like the inputs and then one-hot encoded by
# to_categorical: 0 (token not title-cased, also the padding value) becomes
# class 0 and 1 (token title-cased) becomes class 1. So for every position the
# first column is the "lowercase" score and the second the "capitalized" score.
labels = pad_sequences(capitalization_sent_tags, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = to_categorical(labels)
# Notebook display of the one-hot labels of the first 10 positions.
labels[0][:10]

In [None]:
# Our dataset will be split into a training part and a validation part, where
# we measure the model's performance during training. We further keep a
# testing part (TEST_SPLIT of all samples) to evaluate predictions.
TEST_SPLIT = .1
# int() truncates, so the test set never exceeds TEST_SPLIT of the samples.
nb_test_samples = int(TEST_SPLIT * data.shape[0])
print("number of test samples: %s" % nb_test_samples)

In [None]:
# Shuffle inputs and labels with the same permutation so that the train/test
# split is drawn uniformly from the whole dataset.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Split on an explicit boundary instead of negative slices: with
# nb_test_samples == 0, data[:-0] would be empty and data[-0:] would be the
# whole dataset, i.e. train and test would be swapped.
split_at = data.shape[0] - nb_test_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_test = data[split_at:]
y_test = labels[split_at:]

In [None]:
# Build a dictionary from each word of the pre-trained GloVe file to its
# embedding vector (a float32 numpy array).
GLOVE_DIR = "../glove.6B/"
embeddings_index = {}
# `with` guarantees the file is closed even if parsing a line fails.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        # Each line is the word followed by its vector coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))
print("embedding for the word `word`:")
print(embeddings_index.get("word"))

In [None]:
# Build the embedding matrix for our vocabulary: row i receives the
# pre-trained GloVe vector of the word whose index is i. The matrix has one
# more row than the dictionary has words.
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(dictionary) + 1, EMBEDDING_DIM))
for word, i in dictionary.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from the GloVe index are skipped here, so their rows
        # keep the all-zeros initialization.
        embedding_matrix[i] = embedding_vector

Checking how many words have no pre-trained GloVe word embeddings:

In [None]:
# A word is out of vocabulary when its embedding row is still all zeros.
# Only rows that belong to a word are inspected: the matrix has one more row
# than the dictionary has entries, and the row no word maps to is always zero,
# so counting zero rows over the whole matrix would overcount by one.
outta_vocab_words = [word for word, i in dictionary.items() if not embedding_matrix[i].any()]
outta_vocab_idxs = set(dictionary[word] for word in outta_vocab_words)
oov_percentage = 100. * len(outta_vocab_words) / len(dictionary)  # OOV portion
print("percentage of words out of vocabulary: %s percent" % oov_percentage)
print("examples of words without pre-trained GloVe embeddings:")
print(outta_vocab_words[:15])

In [None]:
def print_predictions(x_test, y_pred, idx2word):
    """Print each test sentence with the predicted capitalization applied.

    For every position the word is looked up in ``idx2word``; it is printed
    as stored when ``pred[0] > pred[1]`` and with ``str.capitalize()``
    applied otherwise. The tokens of a sentence are joined with the
    module-level ``MDETOK`` detokenizer and followed by a blank line.
    """
    for word_ids, scores in zip(x_test, y_pred):
        tokens = [
            idx2word[word_id] if score[0] > score[1] else idx2word[word_id].capitalize()
            for word_id, score in zip(word_ids, scores)
        ]
        text = MDETOK.detokenize(tokens, return_str=True)
        print(text.strip() + "\n")

### RNN Model

In [None]:
# Word-level tagger: frozen GloVe embeddings -> bidirectional LSTM -> softmax.
HIDDEN_SIZE_LSTM = EMBEDDING_DIM
BATCH_SIZE = 32
model = Sequential()
# The embedding layer is initialized from embedding_matrix and is not updated
# during training (trainable=False).
model.add(Embedding(input_dim=len(dictionary) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# return_sequences=True keeps one output per position, as needed for
# per-token labels.
model.add(Bidirectional(LSTM(HIDDEN_SIZE_LSTM, return_sequences=True), input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)))
# Two softmax outputs per position, matching the one-hot labels.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split=0.2 holds out 20% of the training data for validation.
model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=BATCH_SIZE)
# Final evaluation on the held-out test part.
score, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test accuracy:', acc)

In [None]:
# Predict per-token class scores for the test sentences with the RNN model
# and print the sentences with the predicted capitalization.
y_pred = model.predict(x_test)
print_predictions(x_test, y_pred, idx2word)

### CNN Model

In [None]:
# Word-level tagger: frozen GloVe embeddings -> two 1D convolutions -> softmax.
BATCH_SIZE = 32
# Kernel sizes of the first and second convolution.
WINDOW_SIZES = [3, 3]
model = Sequential()
# The embedding layer is initialized from embedding_matrix and is not updated
# during training (trainable=False).
model.add(Embedding(input_dim=len(dictionary) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# The first convolution uses 'causal' padding and the second 'same' padding;
# both keep the sequence length, so there is still one output per token.
model.add(Conv1D(filters=EMBEDDING_DIM, kernel_size=WINDOW_SIZES[0], activation='relu', padding='causal'))
model.add(Conv1D(filters=EMBEDDING_DIM, kernel_size=WINDOW_SIZES[1], activation='relu', padding='same'))
# Two softmax outputs per position, matching the one-hot labels.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split=0.2 holds out 20% of the training data for validation.
model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=BATCH_SIZE)
# Final evaluation on the held-out test part.
score, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test accuracy:', acc)

In [None]:
# Predict per-token class scores for the test sentences with the CNN model
# and print the sentences with the predicted capitalization.
y_pred = model.predict(x_test)
print_predictions(x_test, y_pred, idx2word)

### Simple baseline: A single fully connected layer

In [None]:
# Baseline: frozen GloVe embeddings fed straight into a per-token softmax
# layer, so each token is classified from its own embedding only.
BATCH_SIZE = 32
# NOTE(review): WINDOW_SIZES is not used anywhere in this cell.
WINDOW_SIZES = [3, 3]
model = Sequential()
# The embedding layer is initialized from embedding_matrix and is not updated
# during training (trainable=False).
model.add(Embedding(input_dim=len(dictionary) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# This baseline is trained for 30 epochs (the RNN and CNN cells use 10).
model.fit(x_train, y_train, validation_split=0.2, epochs=30, batch_size=BATCH_SIZE)
# Final evaluation on the held-out test part.
score, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test accuracy:', acc)

In [None]:
# Predict per-token class scores for the test sentences with the baseline
# model and print the sentences with the predicted capitalization.
y_pred = model.predict(x_test)
print_predictions(x_test, y_pred, idx2word)

## Character Level Models

In [None]:
# Hyper-parameters of the character-level models. They replace the values
# used by the word-level section above (EMBEDDING_DIM was 100 there).
HIDDEN_SIZE_LSTM = 100
EMBEDDING_DIM = 20   # size of the learned character embeddings
BATCH_SIZE = 32
WIN_SIZE = 30        # number of characters per sliding window

In [None]:
# Our dataset will be split into a training part and a validation part, where
# we measure our model's performance.
VALIDATION_SPLIT = .2
nb_validation_samples = int(VALIDATION_SPLIT * len(sents))
# Shuffle sentence indexes, then use the last nb_validation_samples of them
# for validation and the rest for training.
# NOTE(review): assumes nb_validation_samples > 0; with 0 the negative slices
# below would give an empty training set.
indices = np.arange(len(sents))
np.random.shuffle(indices)
train_indices = indices[:-nb_validation_samples]
val_indices = indices[-nb_validation_samples:]


# Turn the token lists back into plain sentence strings, in original case and
# in lowercase.
whole_sents_train = [MDETOK.detokenize(sents[index], return_str=True) for index in train_indices]
whole_sents_val = [MDETOK.detokenize(sents[index], return_str=True) for index in val_indices]
whole_sents_lower_train = [sent.lower() for sent in whole_sents_train]
whole_sents_lower_val = [sent.lower() for sent in whole_sents_val]

# The same data as one continuous text per split: sentences joined by a
# single space (used by the sliding-window models).
whole_text_train = " ".join(whole_sents_train)
whole_text_val = " ".join(whole_sents_val)
whole_text_lower_train = whole_text_train.lower()
whole_text_lower_val = whole_text_val.lower()

# Create a vocabulary: every distinct lowercased character of both splits.
chars_vocab = set(whole_text_lower_train).union(whole_text_lower_val)
print("vocabulary size: %s" % len(chars_vocab))

# Create a dictionary: an index for each character.
chars_dictionary = dict()
for i, char in enumerate(chars_vocab):
    chars_dictionary[char] = i

# A mapping from indexes back to characters.
idx2char = {}
for char, i in chars_dictionary.items():
    idx2char[i] = char

### Looking at the whole sentence characters in a single prediction

In [None]:
# Per-character targets: True where the character of the original-case
# sentence is uppercase (str.isupper()).
capitalization_char_tags_train = [[char.isupper() for char in sent] for sent in whole_sents_train]
capitalization_char_tags_val = [[char.isupper() for char in sent] for sent in whole_sents_val]

# Convert the lowercased sentences into sequences of character indexes.
sequences_train = [[chars_dictionary[char] for char in sent] for sent in whole_sents_lower_train]
sequences_val = [[chars_dictionary[char] for char in sent] for sent in whole_sents_lower_val]

In [None]:
# Every sentence is padded (or truncated) to this many characters.
MAX_SEQUENCE_LENGTH = 200

# Inputs are padded with the index of the space character, so padded
# positions decode back to blanks.
data_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=chars_dictionary[" "])
data_val = pad_sequences(sequences_val, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=chars_dictionary[" "])

# Labels are binary uppercase flags trained with a sigmoid output, so the
# padding must be 0 ("not uppercase"). Padding them with the space index, as
# the inputs are, would inject an arbitrary integer as a target.
# The trailing np.newaxis gives one label per position, matching Dense(1).
labels_train = pad_sequences(capitalization_char_tags_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=0)[:, :, np.newaxis]
labels_val = pad_sequences(capitalization_char_tags_val, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=0)[:, :, np.newaxis]

#### RNN Model

In [None]:
# Character-level tagger over whole padded sentences:
# trainable character embeddings -> bidirectional LSTM -> sigmoid.
# NOTE: this overrides the HIDDEN_SIZE_LSTM set in the constants cell above
# with the value of EMBEDDING_DIM.
HIDDEN_SIZE_LSTM = EMBEDDING_DIM
BATCH_SIZE = 32
model = Sequential()
# No pre-trained weights here: the character embeddings are learned.
model.add(Embedding(input_dim=len(chars_dictionary) + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True))
# return_sequences=True keeps one output per character position.
model.add(Bidirectional(LSTM(HIDDEN_SIZE_LSTM, return_sequences=True), input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)))
# One sigmoid output per character: the score that it should be uppercase.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# The explicit validation split built above is used instead of validation_split.
model.fit(data_train, labels_train, validation_data=(data_val, labels_val), epochs=10, batch_size=BATCH_SIZE)

In [None]:
def print_char_predictions(x_test, y_pred, idx2char):
    """Print each character sequence with the predicted uppercasing applied.

    For every position the character is looked up in ``idx2char``; it is
    uppercased when its prediction exceeds 0.5 and left as stored otherwise.
    Each decoded sequence is printed followed by a blank line.
    """
    for char_ids, scores in zip(x_test, y_pred):
        decoded = [
            idx2char[char_id].upper() if score > .5 else idx2char[char_id]
            for char_id, score in zip(char_ids, scores)
        ]
        print("".join(decoded) + "\n")

In [None]:
# Predict uppercase scores for the validation sentences and print them with
# the predicted capitalization.
y_pred = model.predict(data_val)
print_char_predictions(data_val, y_pred, idx2char)

### Using a smaller sliding window on a continuous text

In [None]:
# Per-character targets over the whole continuous text of each split:
# True where the original-case character is uppercase.
capitalization_char_tags_train = [char.isupper() for char in whole_text_train]
capitalization_char_tags_val = [char.isupper() for char in whole_text_val]

# Convert each whole lowercased text into one long sequence of character
# indexes (no longer one sequence per sentence).
sequence_train = [chars_dictionary[char] for char in whole_text_lower_train]
sequence_val = [chars_dictionary[char] for char in whole_text_lower_val]

In [None]:
def window_batch_generator(sequence, labels, win_size=30, batch_size=32):
    """Endlessly yield batches of overlapping windows with per-position labels.

    Windows of ``win_size`` consecutive items start at every index in
    ``range(len(sequence) - win_size)``. After the last window the generator
    starts over from the beginning, as Keras generators are expected to.

    Args:
        sequence: indexable sequence of input ids.
        labels: per-item labels aligned with ``sequence``.
        win_size: number of items per window.
        batch_size: maximum number of windows per batch.

    Yields:
        ``(x, y)`` where ``x`` is an array of shape ``(batch, win_size)`` and
        ``y`` a float64 array of shape ``(batch, win_size, 1)``. The last
        batch of a pass may hold fewer than ``batch_size`` windows.

    Raises:
        ValueError: on first iteration, if ``sequence`` is too short to
            produce any window (previously this spun forever without
            yielding).
    """
    nb_windows = len(sequence) - win_size
    if nb_windows <= 0:
        raise ValueError(
            "sequence of length %d yields no window of size %d" % (len(sequence), win_size))
    while True:
        # Start every pass with empty buffers: the partial tail batch of the
        # previous pass must not leak into (and be repeated in) the next one.
        x_batch, y_batch = [], []
        for i in range(nb_windows):
            x_batch.append(sequence[i:i + win_size])
            y_batch.append(labels[i:i + win_size])
            if len(x_batch) == batch_size:
                yield np.array(x_batch), np.array(y_batch, dtype="float64")[:, :, np.newaxis]
                x_batch, y_batch = [], []
        if x_batch:
            # Partial batch holding the remaining windows of this pass.
            yield np.array(x_batch), np.array(y_batch, dtype="float64")[:, :, np.newaxis]

#### RNN Model

In [None]:
# Endless batch generators over the continuous training and validation texts,
# using the generator's default window and batch sizes (30 and 32).
data_gen_train = window_batch_generator(sequence_train, capitalization_char_tags_train)
data_gen_val = window_batch_generator(sequence_val, capitalization_char_tags_val)

In [None]:
# Number of generator batches that make up one pass over the windows.
# The generators produce len(sequence) - win_size windows per pass in batches
# of BATCH_SIZE, the last one possibly partial, hence the ceiling division.
# Integer arithmetic also keeps the step counts ints on Python 3, where `/`
# would produce floats.
steps_per_epoch_train = (len(sequence_train) - WIN_SIZE + BATCH_SIZE - 1) // BATCH_SIZE
steps_per_epoch_val = (len(sequence_val) - WIN_SIZE + BATCH_SIZE - 1) // BATCH_SIZE

In [None]:
# Character-level tagger over sliding windows:
# trainable character embeddings -> bidirectional GRU -> sigmoid.
HIDDEN_SIZE_GRU = 10
EMBEDDING_DIM = 20
BATCH_SIZE = 32
WIN_SIZE = 30
NB_EPOCHS = 1
model = Sequential()
# Inputs are windows of WIN_SIZE character indexes.
model.add(Embedding(input_dim=len(chars_dictionary) + 1, output_dim=EMBEDDING_DIM, input_length=WIN_SIZE, trainable=True))
# return_sequences=True keeps one output per character of the window.
model.add(Bidirectional(GRU(HIDDEN_SIZE_GRU, return_sequences=True), input_shape=(WIN_SIZE, EMBEDDING_DIM)))
# One sigmoid output per character: the score that it should be uppercase.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train from the endless generators; the step counts computed earlier tell
# Keras how many batches make up one epoch / one validation run.
model.fit_generator(data_gen_train, validation_data=data_gen_val,
                    epochs=NB_EPOCHS, steps_per_epoch=steps_per_epoch_train, validation_steps=steps_per_epoch_val)

*We can take each overlapping window prediction and extract only its middle part to make sure we cover the neighboring characters from both sides.*

In [None]:
def print_mid_window_predictions(batch_samples, model, idx2char):
    """Print the text rebuilt from the middle character of every window.

    Each window contributes only its middle position (index
    ``win_size // 2``), where the model has context on both sides. The
    character is uppercased when its prediction exceeds 0.5.

    Args:
        batch_samples: sequence of 2-D arrays of character ids, one array of
            shape ``(batch, win_size)`` per batch.
        model: object whose ``predict_on_batch(batch)`` returns one
            prediction per position of every window.
        idx2char: mapping from character id to character.
    """
    if len(batch_samples) == 0:
        # Nothing to decode; print an empty line rather than fail on [0].
        print("")
        return
    # Use the argument that was passed in, not the notebook-global
    # `test_samples`, so the function works for any caller.
    win_size = batch_samples[0].shape[1]
    # Floor division: a float index would raise on Python 3.
    mid_win_idx = win_size // 2
    extacted_text = []
    for batch in batch_samples:
        predictions = model.predict_on_batch(batch)
        for chars, preds in zip(batch, predictions):
            char = idx2char[chars[mid_win_idx]]
            extacted_text.append(char.upper() if preds[mid_win_idx] > .5 else char)
    print("".join(extacted_text))

In [None]:
# Draw `steps` batches from the validation generator and keep only their
# inputs (element 0 of each (x, y) pair) for qualitative inspection.
steps = 100
# next(gen) works on Python 2.6+ and Python 3; gen.next() exists on Python 2 only.
test_samples = [next(data_gen_val)[0] for _ in range(steps)]

In [None]:
# Print the validation text rebuilt from the middle-of-window predictions.
print_mid_window_predictions(test_samples, model, idx2char)

#### Predicting only the first letter by learning to read backwards :D ... We won't be able to recognize the beginnings of sentences, but we might succeed in catching some entities. Let's give it a try:

In [None]:
def window_batch_generator_first_letter_out(sequence, labels, win_size=30, batch_size=32):
    """Endlessly yield batches of windows labelled by their first item only.

    Windows of ``win_size`` consecutive items start at every index ``i`` in
    ``range(len(sequence) - win_size)``; the target of a window is
    ``labels[i]``, the label of its first item. After the last window the
    generator starts over from the beginning.

    Args:
        sequence: indexable sequence of input ids.
        labels: per-item labels aligned with ``sequence``.
        win_size: number of items per window.
        batch_size: maximum number of windows per batch.

    Yields:
        ``(x, y)`` where ``x`` is an array of shape ``(batch, win_size)`` and
        ``y`` a float64 array of shape ``(batch, 1)``. The last batch of a
        pass may hold fewer than ``batch_size`` windows.

    Raises:
        ValueError: on first iteration, if ``sequence`` is too short to
            produce any window (previously this spun forever without
            yielding).
    """
    nb_windows = len(sequence) - win_size
    if nb_windows <= 0:
        raise ValueError(
            "sequence of length %d yields no window of size %d" % (len(sequence), win_size))
    while True:
        # Start every pass with empty buffers: the partial tail batch of the
        # previous pass must not leak into (and be repeated in) the next one.
        x_batch, y_batch = [], []
        for i in range(nb_windows):
            x_batch.append(sequence[i:i + win_size])
            y_batch.append(labels[i])
            if len(x_batch) == batch_size:
                yield np.array(x_batch), np.array(y_batch, dtype="float64")[:, np.newaxis]
                x_batch, y_batch = [], []
        if x_batch:
            # Partial batch holding the remaining windows of this pass.
            yield np.array(x_batch), np.array(y_batch, dtype="float64")[:, np.newaxis]

In [None]:
# Rebind the generators to the first-letter variant: each window is now
# labelled only by the case of its first character.
data_gen_train = window_batch_generator_first_letter_out(sequence_train, capitalization_char_tags_train)
data_gen_val = window_batch_generator_first_letter_out(sequence_val, capitalization_char_tags_val)

In [None]:
# Character-level classifier for the first character of a window:
# trainable character embeddings -> backwards GRU -> sigmoid.
HIDDEN_SIZE_GRU = 10
EMBEDDING_DIM = 20
BATCH_SIZE = 32
WIN_SIZE = 30
NB_EPOCHS = 3
model = Sequential()
# Inputs are windows of WIN_SIZE character indexes.
model.add(Embedding(input_dim=len(chars_dictionary) + 1, output_dim=EMBEDDING_DIM, input_length=WIN_SIZE, trainable=True))
# go_backwards=True feeds the window in reverse, so the first character is
# read last; return_sequences=False keeps only the final state.
model.add(GRU(HIDDEN_SIZE_GRU, return_sequences=False, go_backwards=True, input_shape=(WIN_SIZE, EMBEDDING_DIM)))
# A single sigmoid output per window: the score that its first character
# should be uppercase.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# steps_per_epoch_train / steps_per_epoch_val are reused from the earlier
# sliding-window cell.
model.fit_generator(data_gen_train, validation_data=data_gen_val,
                    epochs=NB_EPOCHS, steps_per_epoch=steps_per_epoch_train, validation_steps=steps_per_epoch_val)

In [None]:
def print_first_letter_predictions(batch_samples, model, idx2char):
    """Print the text rebuilt from the first character of every window.

    The model gives one prediction per window; the window's first character
    is uppercased when that prediction exceeds 0.5.

    Args:
        batch_samples: iterable of 2-D arrays of character ids, one array of
            shape ``(batch, win_size)`` per batch.
        model: object whose ``predict_on_batch(batch)`` returns one
            prediction per window.
        idx2char: mapping from character id to character.
    """
    extacted_text = []
    # Iterate over the argument that was passed in, not the notebook-global
    # `test_samples`, so the function works for any caller.
    for batch in batch_samples:
        predictions = model.predict_on_batch(batch)
        for chars, pred in zip(batch, predictions):
            char = idx2char[chars[0]]
            extacted_text.append(char.upper() if pred > .5 else char)
    print("".join(extacted_text))

In [None]:
# Draw `steps` batches from the validation generator, keep only their inputs
# (element 0 of each (x, y) pair) and print the first-letter predictions.
steps = 100
# next(gen) works on Python 2.6+ and Python 3; gen.next() exists on Python 2 only.
test_samples = [next(data_gen_val)[0] for _ in range(steps)]
print_first_letter_predictions(test_samples, model, idx2char)