# Formality Classifier
This is going to be used to classify whether a sentence should be included in the informal or formal corpus. This will work by selecting the probability of the sentence belonging to the corpus, and if the score exceeds a threshold it will be included. 

In [1]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
import pickle
import json
from datetime import datetime

from nltk.translate.bleu_score import sentence_bleu

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Static Variables

In [2]:
BATCH_SIZE = 64
EMBEDDING_DIM = 200

## Load Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# BASE_PATH = '../../Data'  # on local is path to directory
BASE_PATH = '/content/drive/MyDrive/Data/Data'

FORMAL_PATH_TRAIN = '{}/Supervised Data/Family_Relationships/S_Formal_FR_train.txt'.format(BASE_PATH)
INFORMAL_PATH_TRAIN = '{}/Supervised Data/Family_Relationships/S_Informal_FR_ValTest.txt'.format(BASE_PATH)

FORMAL_PATH_HOLDOUT = '{}/Supervised Data/Family_Relationships/S_Formal_FR_train.txt'.format(BASE_PATH)
INFORMAL_PATH_HOLDOUT = '{}/Supervised Data/Family_Relationships/S_Informal_FR_ValTest.txt'.format(BASE_PATH)

EMBEDDING_PATH = '{}/glove.6B.200d.txt'.format(BASE_PATH)

In [8]:
formal = open(FORMAL_PATH_TRAIN).read()
informal = open(INFORMAL_PATH_TRAIN).read()

formal_holdout = open(FORMAL_PATH_HOLDOUT).read()
informal_holdout = open(INFORMAL_PATH_HOLDOUT).read()

### Preprocess data

In [9]:
def process_sequence(seq):
    """This inserts a space in between the last word and a period"""
    s = re.sub('([.,!?()])', r' \1 ', seq)
    s = re.sub('\s{2,}', ' ', s)
    
    return '<start> ' + s + ' <end>'

In [10]:
f_corpus = [process_sequence(seq) for seq in formal.split('\n')]
if_corpus = [process_sequence(seq) for seq in informal.split('\n')]

f_holdout = [process_sequence(seq) for seq in formal_holdout.split('\n')]
if_holdout = [process_sequence(seq) for seq in informal_holdout.split('\n')]

In [11]:
input_corpus = f_corpus.copy()
input_corpus.extend(if_corpus)

input_labels = [True for _ in range(len(f_corpus))]
input_labels.extend([False for _ in range(len(if_corpus))])

In [12]:
holdout_corpus = f_holdout.copy()
holdout_corpus.extend(if_holdout)

holdout_labels = [True for _ in range(len(f_holdout))]
holdout_labels.extend([False for _ in range(len(if_holdout))])

### Tokenize

In [13]:
def tokenize(corpus, tokenizer=None, maxlen=None):
    """ Tokenize data and pad sequences """
    if not tokenizer: 
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', 
                              oov_token='<OOV>')
        tokenizer.fit_on_texts(corpus)
    
    seqs = tokenizer.texts_to_sequences(corpus)
    padded_seqs = pad_sequences(seqs, padding='post', maxlen=maxlen)

    return padded_seqs, tokenizer

In [14]:
train_set, tokenizer = tokenize(input_corpus)
test_set, _ = tokenize(holdout_corpus, tokenizer)

### Setup TF dataset

In [15]:
buffer_size = len(train_set)
steps_per_epoch = len(train_set) // BATCH_SIZE
vocab_size = len(tokenizer.word_index) + 1

train = tf.data.Dataset.from_tensor_slices((train_set, input_labels)).shuffle(buffer_size)
train = train.batch(BATCH_SIZE, drop_remainder=True)

test = tf.data.Dataset.from_tensor_slices((test_set, holdout_labels))
test = test.batch(BATCH_SIZE)

In [16]:
example_input_batch, example_target_batch = next(iter(train))

### Load Embedding Weights

In [17]:
def embedding_matrix(tokenizer, vocab_size, embedding_dim):
    embeddings_index = {}
    with open(EMBEDDING_PATH) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embeddings_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector

    return embeddings_matrix

In [18]:
E = embedding_matrix(tokenizer, vocab_size, EMBEDDING_DIM)

## Declare Model

In [28]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, weights=[E], mask_zero=True), 
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024, return_sequences=True)), 
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024)), 
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="softmax")
])

In [29]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [30]:
history = model.fit(train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
model.evaluate(test)



[0.0, 0.9270246028900146]

this probably won't need to be used

Instead of using keras sequential I developed the model through eager execution. It worked out better this way because I could directly control padding mask and loss function, which is crucial for defining the threshold for when to keep newly generated formal and informal sequences. 

In [None]:
class FormalityClassifier(tf.keras.Model):
    def __init__(self):
        super(FormalityClassifier, self).__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, weights=[E], mask_zero=True), 
        self.lstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)), 
        self.lstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)), 
        self.flatten = tf.keras.layers.Flatten()
        self.ff1 = tf.keras.layers.Dense(64, activation="relu"),
        self.ff2 = tf.keras.layers.Dense(2, activation="softmax")
    
    def call(self, x):
        x = self.embedding(x)
        
        x = self.lstm1(x)
        x = self.lstm2(x)
        
        x = self.flatten(x)
        
        x = self.ff1(x)
        x = self.ff2(x)
        return x


### Optimizer and Loss Function

Here we define the optimizer and the loss function. In our loss function we mask the zeros since that's the padding.

Also of note is in the loss function. The reduction argument at default does some really wonky things which threw off all results. Had to change the reduciton to none, which at default is auto. Not exactly sure what it does in this context but it tries to sum over batches. I didn't work with it because I wanted to control all loss calculation manually. 

In [None]:
optimizer = tf.keras.optimizers.Adam()
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [None]:
def loss_function(real, preds):
    """Calculate and return loss"""

    # caclulate loss
    loss = static_loss(real, preds)
    
    # create padding mask 
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=loss.dtype)
    
    # apply mask
    loss *= mask

    return tf.reduce_mean(loss)