In [None]:
#Necessary import of libraries

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.optimizers import Adam
from keras.layers import Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.models import Model
from scipy import spatial
from keras import backend
import argparse
import pandas as pd
import numpy as np
import codecs
import csv
import json
import ast
import random

In [None]:
#Preprocessing datasets for simpler input format

def ExtractData(datasetName):
    """
    Returns training and testing dataset from dataset name ('gab', 'reddit', 'conan').

    For 'gab' / 'reddit' the CSV '../input/gab-data/<name>.csv' is read; rows whose
    'hate_speech_idx' is 'n/a' are skipped, and for every index listed in that column
    the matching line of 'text' is paired with the row's responses (entries equal to
    'n/a', case-insensitively, are dropped). For 'conan' the English entries
    (cn_id starting with 'EN') of './data/conan/CONAN.json' are used.

    Args:
        datasetName: one of 'gab', 'reddit', 'conan'.

    Returns:
        (trainHate, trainCounter, testHate, testCounter). The hate lists hold one
        string per sample. The counter lists are *flattened* lists of response
        strings, so they are generally longer than the hate lists and are not
        aligned with them index-by-index.

    Raises:
        ValueError: if datasetName is not one of the supported names.
    """
    #Training data percent
    pct = 0.80
    hateSpeech, counterSpeech = [], []

    if datasetName in ['reddit', 'gab']:
        # `with` guarantees the CSV handle is closed even if parsing fails.
        with open('../input/gab-data/' + datasetName + '.csv', 'r', encoding='utf-8') as dataFile:
            for row in csv.DictReader(dataFile):
                idx = row['hate_speech_idx']
                if idx == 'n/a':
                    continue
                hs = row['text'].strip().split('\n')
                cs = row['response']
                for i in idx.strip('[]').split(', '):
                    try:
                        # Drop the leading "<n>. " numbering of the referenced post.
                        hateSpeech.append('. '.join(hs[int(i) - 1].split('. ')[1:]).strip('\t'))
                    except (ValueError, IndexError):
                        # Some rows reference a post index that does not exist
                        # (or is not a number); skip those entries only.
                        continue
                    counterSpeech.append(
                        [resp for resp in splitResponse(cs) if resp.lower() != 'n/a']
                    )

    elif datasetName == 'conan':
        with open('./data/conan/CONAN.json', 'r', encoding='utf-8') as dataFile:
            fileText = [json.loads(line) for line in dataFile]

        for item in fileText[0]['conan']:
            if item['cn_id'][:2] == 'EN':
                hateSpeech.append(item['hateSpeech'].strip())
                counterSpeech.append([item['counterSpeech'].strip()])

    else:
        raise ValueError(
            "Unknown dataset %r; expected 'gab', 'reddit' or 'conan'" % (datasetName,)
        )

    hateCount = len(hateSpeech)

    # Random 80/20 split over sample positions. A set makes the membership
    # test O(1) instead of scanning a list for every sample.
    randomIndex = list(range(hateCount))
    random.shuffle(randomIndex)
    trainIndex = set(randomIndex[:int(pct * len(randomIndex))])

    trainHate, trainCounter, testHate, testCounter = [], [], [], []
    for i in range(hateCount):
        if i in trainIndex:
            trainHate.append(hateSpeech[i])
            trainCounter.append(counterSpeech[i])
        else:
            testHate.append(hateSpeech[i])
            testCounter.append(counterSpeech[i])

    #Flattening
    return trainHate, flatten(trainCounter), testHate, flatten(testCounter)

#Flatten counter speech
def flatten(lst):
    """Concatenate the elements of every sub-iterable of `lst` into one flat list (one level deep)."""
    return [entry for group in lst for entry in group]
                   

#Helper function for csvs
def splitResponse(strResp):
    """Parse a Python-literal sequence stored as text (e.g. "['a', 'b']") and return its items as a list."""
    parsed = ast.literal_eval(strResp)
    return list(parsed)


def main():
    """Run the extraction once on the 'gab' dataset; the four returned splits are discarded."""
    #Other datasets that can be passed instead: 'conan', 'reddit'
    hate_train, counter_train, hate_test, counter_test = ExtractData('gab')


if __name__ == "__main__":
    main()

In [None]:
#Train and test samples for counterspeech and hatespeech arguments.
#ExtractData shuffles with the `random` module, so the split differs between runs
#unless a seed is set beforehand.
trainhate, traincounter, testhate, testcounter = ExtractData('gab')

In [None]:
#Number of (flattened) counterspeech responses in the train and test splits
print(len(traincounter))
print(len(testcounter))

In [None]:
#Creating a train data frame: one column holding the training counterspeech strings.
#NOTE(review): this frame is not written to disk in the visible cells, while the
#training text is later read from TRAIN_DATA_FILE - confirm that CSV was produced
#from this frame.

train_gab = pd.DataFrame()
train_gab['counterspeech'] = traincounter
train_gab.head()

In [None]:
#Paths of the training sentences and of the pre-trained GloVe vectors

TRAIN_DATA_FILE = '../input/gpskaggle/train_gab.csv'
GLOVE_EMBEDDING = '../input/gpskaggle/glove.6B.50d.txt'

#Preprocessing hyperparameters; the argparse defaults defined later mirror these.
#VALIDATION_SPLIT is declared here but not referenced by the visible cells.
VALIDATION_SPLIT = 0.2
MAX_SEQUENCE_LENGTH = 15
MAX_NB_WORDS = 12000
EMBEDDING_DIM = 50

#Read the second CSV column of every data row (the first row is the header)
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    texts = [values[1] for values in reader]
print('Found %s texts in train.csv' % len(texts))

In [None]:
#Peek at the second loaded sentence as a sanity check
texts[1]

In [None]:
#Tokenizing sentences: fit a Tokenizer limited to MAX_NB_WORDS on the loaded texts,
#turn every text into an index sequence and pad/truncate it to MAX_SEQUENCE_LENGTH.
tokenizer = Tokenizer(MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index #The dict values start from 1 so this is fine with zeropadding
index2word = {v: k for k, v in word_index.items()}  #Reverse lookup: index -> word
print('Found %s unique tokens' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
data_1 = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data_1.shape)
NB_WORDS = (min(tokenizer.num_words, len(word_index)) + 1 ) #+1 for zero padding
#NOTE(review): the split bounds below are hard-coded row numbers rather than derived
#from VALIDATION_SPLIT; they assume data_1 has at least 33291 rows - TODO confirm.
#With fewer rows the slices silently come out shorter (or empty).
data_1_val = data_1[27291:33291] #Select 6000 sentences as validation data (0.2)
data_train = data_1[:27291]

In [None]:
#Shape of the padded training matrix
data_train.shape

In [None]:
#Shape of the padded validation matrix
data_1_val.shape

In [None]:
#Small helper function to pass sentences in sequential batches for better model fit

def sent_generator(TRAIN_DATA_FILE, chunksize, repeat=False):
    """Yield padded index batches read sequentially from a CSV file.

    The file is read in chunks of `chunksize` rows; the second column of each
    chunk is converted to strings, tokenized with the module-level `tokenizer`
    and padded/truncated to MAX_SEQUENCE_LENGTH.

    Args:
        TRAIN_DATA_FILE: path of the CSV file to read.
        chunksize: number of rows per yielded batch (the last chunk may be smaller).
        repeat: when True, restart from the top of the file after the last chunk
            so the generator never runs dry; the default (False) makes a single
            pass, as before.

    Yields:
        An `(inputs, targets)` tuple where both elements are the same padded
        array (the model reconstructs its own input).
    """
    while True:
        produced_any = False
        reader = pd.read_csv(TRAIN_DATA_FILE, chunksize=chunksize, iterator=True)
        for df in reader:
            rows = df.iloc[:,1:2].values.tolist()
            # str() guards against non-string cells (e.g. NaN read from empty fields).
            texts = [str(item) for sublist in rows for item in sublist]

            sequences = tokenizer.texts_to_sequences(texts)
            data_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
            produced_any = True
            # Keras documents generator output as an (inputs, targets) tuple.
            yield (data_train, data_train)
        # Stop after one pass unless asked to repeat; also stop on an empty file
        # so `repeat=True` cannot spin forever without yielding.
        if not (repeat and produced_any):
            return

In [None]:
#Using glove to generate embeddings for tokenized sentences

#Parse the GloVe text file: each line is "<word> <v1> <v2> ...".
embeddings_index = {}
with open(GLOVE_EMBEDDING, encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            #Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

#Fallback vector for words missing from GloVe: the embedding of 'unk'.
#If the file has no 'unk' entry, fall back to zeros rather than writing None
#into the float matrix.
unk_vector = embeddings_index.get('unk')
if unk_vector is None:
    unk_vector = np.zeros(EMBEDDING_DIM, dtype='float32')

#Row i of the matrix is the vector of the word with tokenizer index i; row 0 (padding) stays zero.
glove_embedding_matrix = np.zeros((NB_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            glove_embedding_matrix[i] = embedding_vector
        else:
            # words not found in embedding index will be the word embedding of 'unk'.
            glove_embedding_matrix[i] = unk_vector
print('Null word embeddings: %d' % np.sum(np.sum(glove_embedding_matrix, axis=1) == 0))

In [None]:
#Hyperparameters passed as arguments

parser = argparse.ArgumentParser()

#(flag, type, default) for every valued option, registered in this order
_VALUED_OPTIONS = [
    ('--batch_size', int, 8),
    ('--n_vocab', int, 12000),
    ('--epochs', int, 100),  # Changed to 100 from 1000
    ('--n_hidden_G', int, 512),
    ('--n_layers_G', int, 2),
    ('--n_hidden_E', int, 512),
    ('--n_layers_E', int, 1),
    ('--n_z', int, 100),
    ('--word_dropout', float, 0.5),
    ('--rec_coef', float, 7),
    ('--lr', float, 0.00001),
    ('--gpu', int, 0),
    ('--n_highway_layers', int, 2),
    ('--n_embed', int, 50),  # Same as EMBEDDING_DIM
    ('--out_num', int, 30000),
    ('--unk_token', str, "<unk>"),
    ('--pad_token', str, "<pad>"),
    ('--start_token', str, "<sos>"),
    ('--end_token', str, "<eos>"),
    ('--dataset', str, "reddit"),
]
for _flag, _kind, _default in _VALUED_OPTIONS:
    parser.add_argument(_flag, type=_kind, default=_default)

#Boolean switches (False unless given on the command line)
for _flag in ('--training', '--resume_training'):
    parser.add_argument(_flag, action='store_true')

#parse_known_args tolerates the extra arguments a notebook kernel is started with
parameters, unknown = parser.parse_known_args()

In [None]:
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
 
#VAE model definition 

#Placeholder loss
def zero_loss(y, y_hat):
    """Placeholder loss: ignores `y` and returns zeros shaped like `y_hat`."""
    zeros = K.zeros_like(y_hat)
    return zeros

#Encoder: frozen GloVe embedding -> bidirectional LSTM -> dropout -> two dense heads
#producing the mean and log-variance of the latent code.
#The sequence length comes from MAX_SEQUENCE_LENGTH (the length pad_sequences uses)
#instead of a repeated literal 15.
x = Input(batch_shape=(None,MAX_SEQUENCE_LENGTH))
x_embed = Embedding(NB_WORDS,parameters.n_embed,weights=[glove_embedding_matrix],input_length=MAX_SEQUENCE_LENGTH,trainable=False)(x)
h = Bidirectional(LSTM(parameters.n_hidden_G,return_sequences=False,recurrent_dropout=0.2),merge_mode="concat")(x_embed)
h = Dropout(0.2)(h)
mu = Dense(parameters.n_z)(h)
log_var = Dense(parameters.n_z)(h)

def sample(args):
    """Reparameterisation step: draw z = mu + exp(0.5 * log_var) * eps with eps ~ N(0, 1).

    Args:
        args: pair `(mu, log_var)` of tensors with identical shape.

    Returns:
        A tensor shaped like `mu`.
    """
    print ("entered sample function")
    mu, log_var = args
    # Take the noise shape from `mu` itself so any batch size works (a short
    # last chunk, validation batches, predict), not only parameters.batch_size.
    eps = tf.random.normal(shape=tf.shape(mu),mean=0.0,stddev=1.0)
    return mu + tf.exp(0.5*log_var)*eps

#Decoder: the sampled latent vector is repeated once per time step, fed through an
#LSTM, and projected to n_vocab scores per step. The layer objects are kept in
#variables so they can be reused to build a stand-alone generator later.
#The number of time steps comes from MAX_SEQUENCE_LENGTH instead of a literal 15.
z = Lambda(sample,output_shape=(parameters.n_z,))([mu,log_var])
repeat_vector = RepeatVector(MAX_SEQUENCE_LENGTH)
decoder_h = LSTM(parameters.n_hidden_E,return_sequences=True,recurrent_dropout=0.2)
decoder_mu = TimeDistributed(Dense(parameters.n_vocab,activation='linear'))
decoded_h = decoder_h(repeat_vector(z))
decoded_mu = decoder_mu(decoded_h)
print ("functioning")
#NOTE(review): `logits` and `targets` are not referenced anywhere in the visible
#cells, and `proj_w` / `proj_b` are only used by sample_loss. `proj_w` is an
#n_vocab x n_vocab random constant (144M float32 values at the default n_vocab of
#12000) - consider removing these if the sampled-softmax path is not needed.
logits = tf.constant(np.random.randn(parameters.batch_size, MAX_SEQUENCE_LENGTH, parameters.n_vocab), tf.float32)
targets = tf.constant(np.random.randint(parameters.n_vocab, size=(parameters.batch_size, MAX_SEQUENCE_LENGTH)), tf.int32)
proj_w = tf.constant(np.random.randn(parameters.n_vocab, parameters.n_vocab), tf.float32)
proj_b = tf.constant(np.zeros(parameters.n_vocab), tf.float32)

def sample_loss(labels, logits):
    """Sampled-softmax loss against the module-level projection `proj_w` / `proj_b`.

    Args:
        labels: integer class ids; flattened to one id per row.
        logits: per-position feature vectors; flattened to rows whose width is
            the second dimension of `proj_w`.

    Returns:
        A float tensor with one loss value per flattened row.

    NOTE(review): this function is not called by the visible cells - confirm it
    is still needed.
    """
    print ("sample loss function entered")
    # tf.reshape (tensors have no .reshape method); sampled_softmax_loss expects
    # labels as int64 of shape [rows, num_true].
    labels = tf.reshape(tf.cast(labels, tf.int64), [-1, 1])
    # One row per label so `inputs` and `labels` agree on the leading dimension.
    logits = tf.reshape(tf.cast(logits, tf.float32), [-1, int(proj_w.shape[-1])])
    # The loss stays floating point: casting it to int32 would truncate it.
    return tf.nn.sampled_softmax_loss(
        weights=proj_w, biases=proj_b, labels=labels, inputs=logits,
        num_sampled=500, num_classes=parameters.n_vocab
    )

softmax_loss = sample_loss


#VAE layers, including custom loss
class VAELayer(Layer):
    """Layer whose only job is to attach the VAE loss to the model.

    It receives `[x, decoded_mu]`, registers reconstruction + KL loss through
    `add_loss`, and outputs zeros shaped like `x` so the compiled model can be
    trained with a placeholder loss.
    """

    def __init__(self, **kwargs):
        super(VAELayer,self).__init__(**kwargs)

    def vae_loss(self, x, decoded_mu):
        """Return the batch mean of (per-sample reconstruction loss + per-sample KL term).

        Args:
            x: input token ids, one row per sentence.
            decoded_mu: decoder scores per time step and vocabulary entry.

        The KL term reads the module-level `mu` and `log_var` tensors.
        """
        labels = tf.cast(x, tf.int32)
        # Uniform weights shaped like the labels, so the loss does not depend on
        # a fixed batch size.
        target_weights = tf.ones_like(labels, dtype=tf.float32)
        recreation_loss = backend.sum(tfa.seq2seq.sequence_loss(
            decoded_mu,labels,weights=target_weights,average_across_timesteps=False,
            average_across_batch=False),axis=-1)

        # Sum over the latent dimension only (axis=-1): one KL value per sample,
        # matching the per-sample reconstruction loss before the mean is taken.
        kl_loss = -0.5*backend.sum(1+log_var-backend.square(mu)-backend.exp(log_var),axis=-1)
        return backend.mean(kl_loss + recreation_loss)

    def call(self, inputs):
        """Register the VAE loss for `inputs = [x, decoded_mu]` and return zeros like `x`."""
        print ("call function entered")
        x = inputs[0]
        decoded_mu = inputs[1]
        loss = self.vae_loss(x,decoded_mu)
        self.add_loss(loss,inputs=inputs)
        return backend.zeros_like(x)

#Assemble the trainable model: the real objective is added inside VAELayer, so the
#compiled loss is only a zero placeholder.
loss_layer = VAELayer()([x,decoded_mu])
vae = Model(x,[loss_layer])
#`learning_rate` is the current keyword; `lr` is a deprecated alias.
opt = tf.keras.optimizers.Adam(learning_rate=parameters.lr) #Adam optimizer as per GPS
vae.compile(optimizer=opt,loss=[zero_loss])
vae.summary()

In [None]:
# Scratch check of true division vs. floor division for a step count.
# NOTE(review): 11000 is a literal here; it is not derived from the data in the visible cells.
# steps_per_epoch = int( np.ceil(train_gab.shape[0] / parameters.batch_size) )
print (11000/parameters.batch_size)
print (11000//parameters.batch_size)

In [None]:
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss')

#Model checkpoints after each epoch 
def create_model_checkpoint(dir, model_name):
    """Build a ModelCheckpoint callback that saves to `<dir>/<model_name>.h5`.

    The target directory is created if it does not exist yet.

    Args:
        dir: directory the checkpoint file is written to.
        model_name: file name without the ".h5" extension.

    Returns:
        A ModelCheckpoint callback (verbose, saving on every epoch).
    """
    import os  # local import: `os` is not imported in the notebook's import cell

    filepath = dir + '/' + model_name + ".h5" #-{epoch:02d}-{decoded_mean:.2f}
    directory = os.path.dirname(filepath)
    if directory:
        # Creates intermediate directories too and is a no-op when it already exists.
        os.makedirs(directory, exist_ok=True)
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=1, save_best_only=False)
    return checkpointer

checkpointer = create_model_checkpoint('models', 'vae_gps')

nb_epoch=70
#Integer division: steps_per_epoch is a count of batches, not a float.
n_steps = 30000 // parameters.batch_size

#Each fit() call below trains a single epoch with a fresh generator. An
#EarlyStopping callback passed to such a call is reset every time and can never
#stop the outer loop, so its monitor/patience settings are applied here instead.
#NOTE: with the callback's patience the loop may stop after very few epochs;
#raise `patience` on `early_stopping` if that is too eager.
best_monitored = float('inf')
stale_epochs = 0
for counter in range(nb_epoch):
    print('-------epoch: ',counter,'--------')
    history = vae.fit(sent_generator(TRAIN_DATA_FILE, parameters.batch_size),steps_per_epoch=n_steps, epochs=1, callbacks=[checkpointer],validation_data=(data_1_val, data_1_val))

    monitored_values = history.history.get(early_stopping.monitor)
    if not monitored_values:
        #Monitored metric not reported for this epoch: nothing to compare against
        continue
    current = monitored_values[-1]
    if current < best_monitored:
        best_monitored = current
        stale_epochs = 0
    else:
        stale_epochs += 1
        if stale_epochs >= early_stopping.patience:
            print('Early stopping at epoch', counter, '-', early_stopping.monitor, 'did not improve')
            break

In [None]:
#Save final model as .h5 file
#NOTE(review): the file name mentions "800k32dim96hid", which does not match the
#hyperparameters defined in this notebook (n_embed=50, n_hidden=512) - confirm the
#intended name.

vae.save('./vae_lstm800k32dim96hid.h5')

In [None]:
#Build a model to project sentences on the latent space (outputs the mean `mu`).
#NOTE: `x` must still be the Input tensor of the VAE here; later cells rebind the
#name `x` to a generated sentence, so re-running this cell after them fails.
encoder = Model(x, mu)

#Build a generator that can sample sentences from the learned distribution.
#It reuses the trained decoder layers (repeat_vector, decoder_h, decoder_mu) and
#adds a softmax so each time step yields a probability distribution over the vocabulary.
decoder_input = Input(shape=(parameters.n_z,))
_h_decoded = decoder_h(repeat_vector(decoder_input))
_x_decoded_mean = decoder_mu(_h_decoded)
_x_decoded_mean = Activation('softmax')(_x_decoded_mean)
generator = Model(decoder_input, _x_decoded_mean)

In [None]:
#Generating for validation sentences

index2word = {v: k for k, v in word_index.items()}
sent_encoded = encoder.predict(data_1_val, batch_size = 8)
x_test_reconstructed = generator.predict(sent_encoded)

#Compare one reconstructed validation sentence with its original
sent_idx = 672
#Most probable vocabulary index at every time step
reconstructed_indexes = np.argmax(x_test_reconstructed[sent_idx], axis=1)
#Plain dict lookups keep unknown/padding indexes as None. np.vectorize infers its
#output dtype from the first result and can turn later None values into the
#string 'None'.
word_list = [index2word.get(int(idx)) for idx in reconstructed_indexes]
print(word_list)
original_sent = [index2word.get(int(idx)) for idx in data_1_val[sent_idx]]
print(original_sent)

In [None]:
# Function to parse a sentence
def sent_parse(sentence, mat_shape):
    """Tokenize a list of sentences and pad/truncate each to MAX_SEQUENCE_LENGTH.

    `mat_shape` is accepted but not used.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentence),
        maxlen=MAX_SEQUENCE_LENGTH,
    )


# Input: latent sentence vector (n_z values)
# Output: sentence text
def print_latent_sentence(sent_vect):
    """Decode one latent vector into a sentence string.

    The vector is passed through `generator`, the most probable vocabulary index
    is taken at every time step, and the indexes are mapped back to words with
    `index2word`. Indexes without a word (e.g. the padding index 0) are skipped.

    Args:
        sent_vect: array holding parameters.n_z values (any shape that reshapes
            to (1, n_z)).

    Returns:
        The decoded words joined by single spaces.
    """
    sent_vect = np.reshape(sent_vect,[1,parameters.n_z])
    sent_reconstructed = generator.predict(sent_vect)
    sent_reconstructed = np.reshape(sent_reconstructed,[MAX_SEQUENCE_LENGTH,parameters.n_vocab])
    reconstructed_indexes = np.argmax(sent_reconstructed, axis=1)

    # Plain dict lookups: np.vectorize infers a string dtype from the first word
    # and then turns missing entries into the literal text 'None', which would
    # survive the emptiness filter below.
    words = (index2word.get(int(idx)) for idx in reconstructed_indexes)
    return ' '.join(w for w in words if w)

In [None]:
#Looping a subset of data for generation

#Generated sentences, one per test counterspeech entry
out = []
def convlist(lst):
    """Wrap every element of `lst` in its own single-item list."""
    return [[el] for el in lst]

inplist = convlist(testcounter)
for item in inplist:
    mysent = sent_parse(item, [15])
    mysent_encoded = encoder.predict(mysent, batch_size = 16)
    #Append directly instead of binding the result to `x`, which is the name of
    #the model's Input tensor in earlier cells.
    out.append(print_latent_sentence(mysent_encoded))

In [None]:
#Testing with a specific instance

sentence1=['Using words that insult one group while defending another group doesnt come across as helpful.']
mysent = sent_parse(sentence1, [15])
mysent_encoded = encoder.predict(mysent, batch_size = 16)
#Use a dedicated name: `x` is the model's Input tensor in earlier cells, and
#rebinding it breaks re-running the cell that builds the encoder.
generated_sentence = print_latent_sentence(mysent_encoded)
generated_sentence

In [None]:
#Writing generated counterspeech sentences into a text file
#Explicit UTF-8 so the output does not depend on the platform's default encoding;
#newline='' writes the '\r\n' separators exactly as given on every platform
#(text mode would otherwise expand them to '\r\r\n' on Windows).
with open('./gab.txt', 'w', encoding='utf-8', newline='') as f:
    f.write('\r\n'.join(out))