# Formality Classifier
This is going to be used to classify whether a sentence should be included in the informal or formal corpus. This will work by selecting the probability of the sentence belonging to the corpus, and if the score exceeds a threshold it will be included. 

In [1]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
import pickle
import json
from datetime import datetime

from nltk.translate.bleu_score import sentence_bleu

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Static Variables

In [2]:
BATCH_SIZE = 64
EMBEDDING_DIM = 200

## Load Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# BASE_PATH = '../../Data'  # on local is path to directory
BASE_PATH = '/content/drive/MyDrive/Data/Data'

# FORMAL_PATH_TRAIN = '{}/Supervised Data/Family_Relationships/S_Formal_FR_train.txt'.format(BASE_PATH)
# INFORMAL_PATH_TRAIN = '{}/Supervised Data/Family_Relationships/S_Informal_FR_train.txt'.format(BASE_PATH)

# FORMAL_PATH_HOLDOUT = '{}/Supervised Data/Family_Relationships/S_Formal_FR_ValTest.txt'.format(BASE_PATH)
# INFORMAL_PATH_HOLDOUT = '{}/Supervised Data/Family_Relationships/S_Informal_FR_ValTest.txt'.format(BASE_PATH)

FORMAL_PATH = BASE_PATH + '/GYAFC_Corpus/Family_Relationships/train/formal'
INFORMAL_PATH = BASE_PATH + '/GYAFC_Corpus/Family_Relationships/train/informal'

EMBEDDING_PATH = '{}/glove.6B.200d.txt'.format(BASE_PATH)

In [5]:
formal = open(FORMAL_PATH).read()
informal = open(INFORMAL_PATH).read()

### Preprocess data

In [6]:
def process_sequence(seq):
    """This inserts a space in between the last word and a period"""
    s = re.sub('([.,!?()])', r' \1 ', seq)
    s = re.sub('\s{2,}', ' ', s)
    
    return '<start> ' + s + ' <end>'

In [7]:
f_corpus = [process_sequence(seq) for seq in formal.split('\n')]
if_corpus = [process_sequence(seq) for seq in informal.split('\n')]

print("There are {} sequences in total".format(len(f_corpus)))

f_val = f_corpus[:2000]
if_val = if_corpus[:2000]

if_holdout = if_corpus[2000:4000]
f_holdout = f_corpus[2000:4000]

f_corpus = f_corpus[4000:]
if_corpus = if_corpus[4000:]

print('Training on {} sequences'.format(len(f_corpus)))

There are 51968 sequences in total
Training on 47968 sequences


In [8]:
def split_corpora(formal, informal):
    corpus = formal.copy()
    corpus.extend(informal)

    corpus_labels = [True for _ in range(len(formal))]
    corpus_labels.extend([False for _ in range(len(informal))])

    return corpus, corpus_labels

In [9]:
input_corpus, input_labels = split_corpora(f_corpus, if_corpus)
holdout_corpus, holdout_labels = split_corpora(f_holdout, if_holdout)
val_corpus, val_labels = split_corpora(f_val, if_val)

### Tokenize

In [10]:
def tokenize(corpus, tokenizer=None, maxlen=None):
    """ Tokenize data and pad sequences """
    if not tokenizer: 
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', 
                              oov_token='<OOV>')
        tokenizer.fit_on_texts(corpus)
    
    seqs = tokenizer.texts_to_sequences(corpus)
    padded_seqs = pad_sequences(seqs, padding='post', maxlen=maxlen)

    return padded_seqs, tokenizer

In [11]:
train_set, tokenizer = tokenize(input_corpus)
val_set, _ = tokenize(val_corpus, tokenizer)
test_set, _ = tokenize(holdout_corpus, tokenizer)

### Setup TF dataset

In [12]:
buffer_size = len(train_set)
steps_per_epoch = len(train_set) // BATCH_SIZE
vocab_size = len(tokenizer.word_index) + 1

train = tf.data.Dataset.from_tensor_slices((train_set, input_labels)).shuffle(buffer_size)
train = train.batch(BATCH_SIZE, drop_remainder=True)

test = tf.data.Dataset.from_tensor_slices((test_set, holdout_labels))
test = test.batch(BATCH_SIZE)

val = tf.data.Dataset.from_tensor_slices((val_set, val_labels))
val = val.batch(BATCH_SIZE)

In [13]:
example_input_batch, example_target_batch = next(iter(train))

### Load Embedding Weights

In [14]:
def embedding_matrix(tokenizer, vocab_size, embedding_dim):
    embeddings_index = {}
    with open(EMBEDDING_PATH) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embeddings_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector

    return embeddings_matrix

In [15]:
E = embedding_matrix(tokenizer, vocab_size, EMBEDDING_DIM)

## Declare Model

In [19]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, weights=[E], mask_zero=True),
    tf.keras.layers.Dropout(0.8),  
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024)),
    tf.keras.layers.Dropout(0.8), 
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation='softmax')
])

In [20]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
history = model.fit(train, validation_data=val, epochs=2)

Epoch 1/2
Epoch 2/2


In [24]:
model.evaluate(test)



[0.3723843991756439, 0.8297500014305115]

Going to find the following data set 
$$ T_{avg} = \{(s_i, s_i^\prime)|P_+(s_i^\prime) - P_+(s_i) > \sigma \}$$

## Import Baseline Data

In [26]:
BASELINE_PATH = '{}/Supervised Data/Entertainment_Music/S_Informal_EM_Train.txt'.format(BASE_PATH)
ROUND_TRIP_PATH = '{}/Supervised Data/FD Data/informal_rt.txt'.format(BASE_PATH)
SAVE_PATH = '{}/Supervised Data/FD Data/discriminated_seqs.txt'.format(BASE_PATH)

with open(BASELINE_PATH) as f:
    if_raw = [process_sequence(seq) for seq in f.read().split('\n')]

with open(ROUND_TRIP_PATH) as f:
    if_rt = [process_sequence(seq) for seq in f.read().split('\n')]

if_rt = if_rt[:-1]  # blank line at end of file 

In [27]:
assert len(if_rt) == len(if_raw)

### Tokenize and Make Dataset

In [28]:
baseline_set, _ = tokenize(if_raw, tokenizer)
rt_set, _ = tokenize(if_rt, tokenizer)

In [29]:
def get_Tavg(raw, rt, sigma=0.6):
    """
    Take in raw informal sequences and round trip translations 
    """
    raw_pred, rt_pred = model.predict(raw), model.predict(rt)
    diff = rt_pred - raw_pred
    tavg = np.where(diff[:, 1] > sigma)
    return tavg[0]

In [30]:
tavg = get_Tavg(baseline_set, rt_set)

Going to save the addional files 

In [33]:
with open(ROUND_TRIP_PATH) as f:
    temp_rt = [seq for seq in f.read().split('\n')]

In [37]:
additional_seqs = np.array(temp_rt)[tavg]

In [42]:
print("Expanded dataset by {:2f}%".format(len(additional_seqs)/25000))

Expanded dataset by 0.044040%


In [44]:
with open(SAVE_PATH, 'w') as f:
    for seq in additional_seqs:
        f.write(seq + '\n')

In [49]:
model.save_weights(BASE_PATH + 'formality_classifier')