In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
# Load the labelled tweets; later cells read the 'text', 'tweet_id' and 'hate' columns.
d = pd.read_csv(filepath_or_buffer="labelled_dataset_temp.csv")

In [None]:
# Raw tweet texts as a plain Python list, in dataframe row order.
tweetlist = d.loc[:, 'text'].to_list()

In [None]:
import re

# Lookup table mapping English contractions to their expanded forms.
# Matching is case-sensitive: keys are lower-case except for the "I..." forms.
contractions = {
    "ain't": "am not", "aren't": "are not",
    "can't": "cannot", "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not",
    "he'd": "he had", "he'd've": "he would have",
    "he'll": "he will", "he'll've": "he will have", "he's": "he has",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have",
    "I'll": "I will", "I'll've": "I will have",
    "I'm": "I am", "I've": "I have",
    "isn't": "is not",
    "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have",
    "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not",
    "we'd": "we would", "we'd've": "we would have",
    "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have",
    "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have",
    "will've": "will have",
    "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have",
}


def _compile_contractions_re(mapping):
    """Build a single alternation regex that matches any key of ``mapping``.

    Regex alternation takes the first alternative that matches, so keys are
    tried longest-first; otherwise "can't" would win over "can't've" and leave
    a dangling "'ve". Keys are escaped so they are always matched literally.
    """
    longest_first = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in longest_first))


contractions_re = _compile_contractions_re(contractions)


def expand_contractions(s, contractions_dict=contractions):
    """Replace every known contraction in ``s`` with its expanded form.

    Args:
        s: Text to process.
        contractions_dict: Mapping from contraction to expansion. Defaults to
            the module-level ``contractions`` table, for which the precompiled
            ``contractions_re`` is reused; any other mapping gets a pattern
            built from its own keys.

    Returns:
        ``s`` with each matched contraction substituted. Matching is
        case-sensitive and not anchored to word boundaries.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string everywhere.
        return s
    if contractions_dict is contractions:
        pattern = contractions_re
    else:
        pattern = _compile_contractions_re(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, s)

In [None]:
from nltk.tokenize import RegexpTokenizer
import re

# Raw string: "\w" inside a plain string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r"[\w']+")


def clean(tweet):
    """Normalise a raw tweet and return its word tokens.

    Steps, in order: drop @mentions, a leading retweet marker, hashtags and
    URLs; normalise curly apostrophes; expand contractions; replace
    punctuation with spaces; collapse whitespace; drop the "amp" left behind
    by "&amp;"; lower-case and strip; tokenize; discard tokens containing a
    digit.

    Args:
        tweet: The tweet text as a string.

    Returns:
        A list of lower-case word tokens.
    """
    # remove @mentions (with an optional trailing colon, as in "RT @user:")
    tweet = re.sub(r'@\w+:?', ' ', tweet)
    # remove the retweet marker; \b keeps words that merely start with "RT" intact
    tweet = re.sub(r'^RT\b', ' ', tweet)
    # remove hashtags
    tweet = re.sub(r'#\w+', ' ', tweet)
    # remove URLs (both http and https)
    tweet = re.sub(r'http\S+', ' ', tweet)
    # replace ’ with '
    tweet = re.sub(r'’', "'", tweet)
    # expand contractions (must run before apostrophes are stripped below)
    tweet = expand_contractions(tweet)
    # replace every non-word, non-space character with a space
    tweet = re.sub(r'[^\w\s]', ' ', tweet)
    # collapse runs of whitespace
    tweet = re.sub(r'\s+', ' ', tweet)
    # remove the "amp" residue of "&amp;"; word boundaries also catch it at the
    # start/end of the tweet and in consecutive occurrences
    tweet = re.sub(r'\bamp\b', ' ', tweet)
    # convert to lower case and strip leading and trailing spaces
    tweet = tweet.lower().strip()
    # tokenizing
    words = tokenizer.tokenize(tweet)
    # remove words having numbers
    words = [w for w in words if re.search(r'[0-9]', w) is None]
    return words

In [None]:
# Clean every tweet and store the resulting token lists in a new 'cleaned' column.
cleaned_tweetlist = [clean(tweet) for tweet in tweetlist]
d['cleaned'] = cleaned_tweetlist

In [None]:
# Getting rid of very short tweets: keep only rows with more than 2 tokens.
# .copy() makes the filtered frame independent of the original, so adding the
# 'docvec' column to it later does not raise SettingWithCopyWarning.
d = d[d['cleaned'].apply(lambda x: len(x) > 2)].copy()
cleaned_tweetlist = d['cleaned'].tolist()

In [None]:
# from itertools import compress
# mask = list(map(lambda x: len(x)>10, cleaned_tweetlist))
# cleaned_tweetlist = list(compress(cleaned_tweetlist,mask))

In [None]:
# Learn 32-dimensional document embeddings for the cleaned tweets with gensim Doc2Vec.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Each token list is tagged with its position in cleaned_tweetlist.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(cleaned_tweetlist)
]
model_d2v = Doc2Vec(
    vector_size=32,
    window=3,
    min_count=4,
    workers=4,
    negative=10,
    epochs=40,
)
model_d2v.build_vocab(documents)
model_d2v.train(
    documents,
    total_examples=model_d2v.corpus_count,
    epochs=model_d2v.epochs,
)

In [None]:
# Infer one document vector per cleaned tweet and attach it as the 'docvec' column.
document_vectors_all = [model_d2v.infer_vector(tokens) for tokens in cleaned_tweetlist]
d['docvec'] = document_vectors_all

From here on, we split off 20% of the data and use it as the test set, since these are the labelled data.

From the rest of the data, the hate-speech tweets will be fed to a GAN to generate more document vectors.

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
test = d.sample(frac=0.2, random_state=42)
# Everything not sampled into `test`. Dropping by index label yields the exact
# complement; de-duplicating on tweet_id would also discard training rows that
# merely share a tweet_id with another row.
rest = d.drop(index=test.index)
# Document vectors of the training rows labelled as hate speech (hate == 1.0);
# these are the real samples the GAN is trained on.
document_vectors = rest.loc[rest['hate'] == 1.0, 'docvec'].tolist()

<h1>Generation of Document Vectors using GAN</h1>

In [None]:
# Generation of Document Vectors using GAN
import numpy as np
import tensorflow as tf

# Stack the hate-speech vectors into a (num_documents, 32, 1) array, matching
# the [32, 1] input shape of the discriminator.
train_documents = np.asarray(document_vectors).reshape(len(document_vectors), 32, 1)
# NOTE(review): this assumes every component already lies in [-1, 1] (the range
# of the generator's tanh output); nothing here enforces that - confirm.

In [None]:
# Shuffle-buffer size and mini-batch size of the GAN input pipeline.
s_buffer, s_batch = 1000, 256
train_dataset = (
    tf.data.Dataset.from_tensor_slices(train_documents)
    .shuffle(buffer_size=s_buffer)
    .batch(batch_size=s_batch)
)

In [None]:
# Now let us define the generator
# Generator will accept (100,) noise and will generate a document vector of length 32
from tensorflow.keras import layers
def gen_model():
    """Build the GAN generator.

    Maps a (100,) noise vector to a (32, 1) tensor: a dense projection
    reshaped to (8, 256), then three transposed 1-D convolutions that bring
    the length to 32. The last layer uses tanh, so outputs lie in [-1, 1].

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    net = tf.keras.Sequential()

    # Project the noise and fold it into 8 positions of 256 channels.
    net.add(layers.Dense(8 * 256, use_bias=False, input_shape=(100,)))
    net.add(layers.BatchNormalization())
    net.add(layers.LeakyReLU())
    net.add(layers.Reshape((8, 256)))
    assert net.output_shape == (None, 8, 256)

    # Two intermediate up-sampling stages: (filters, stride, shape after the conv).
    intermediate_stages = (
        (128, 1, (None, 8, 128)),
        (64, 2, (None, 16, 64)),
    )
    for n_filters, stride, expected_shape in intermediate_stages:
        net.add(layers.Conv1DTranspose(filters=n_filters, kernel_size=5, strides=stride,
                                       padding='same', use_bias=False))
        assert net.output_shape == expected_shape
        net.add(layers.BatchNormalization())
        net.add(layers.LeakyReLU())

    # tanh activation as output needs to be in range [-1,1]
    net.add(layers.Conv1DTranspose(filters=1, kernel_size=5, strides=2,
                                   padding='same', use_bias=False, activation='tanh'))
    assert net.output_shape == (None, 32, 1)

    return net


In [None]:
# Smoke-test the untrained generator: one noise vector in, one document vector out.
generator = gen_model()
random_noise = tf.random.normal(shape=[1, 100])
generated_document = generator(random_noise, training=False)
# Display the 32 components of the single generated sample.
generated_document[0, :, 0]

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([-0.06277785, -0.03147944, -0.1161528 ,  0.04554995,  0.02325949,
       -0.12488685, -0.09861955, -0.06390463, -0.12079295, -0.08284715,
       -0.04897938, -0.10832902,  0.0880328 , -0.02016436, -0.23562035,
       -0.01801194,  0.09127183,  0.18714127,  0.03508518, -0.01288858,
        0.03300442,  0.02713495, -0.04917263, -0.12127037,  0.07037178,
        0.12309948, -0.04171433, -0.02358885,  0.13083865,  0.11986753,
        0.02226896,  0.06582754], dtype=float32)>

In [None]:
# we will create a discriminator here
# the discriminator is a document classifier
def disc_model():
    """Build the GAN discriminator.

    Takes a (32, 1) document vector and passes it through two strided 1-D
    convolutions (64 then 128 filters), each followed by LeakyReLU and 30%
    dropout, then flattens into a single Dense unit with no activation, so
    the output is a raw score rather than a probability.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    return tf.keras.Sequential([
        layers.Conv1D(filters=64, kernel_size=5, strides=2, padding='same', input_shape=[32, 1]),
        layers.LeakyReLU(),
        layers.Dropout(0.3),
        layers.Conv1D(filters=128, kernel_size=5, strides=2, padding='same'),
        layers.LeakyReLU(),
        layers.Dropout(0.3),
        layers.Flatten(),
        layers.Dense(1),
    ])

In [None]:
# Smoke-test the untrained discriminator on the generated sample and print its
# raw output score.
discriminator = disc_model()
decision = discriminator(generated_document)
print(decision)

tf.Tensor([[0.00196937]], shape=(1, 1), dtype=float32)


In [None]:
# Binary cross-entropy used by both the generator and the discriminator loss.
# from_logits=True because the discriminator ends in Dense(1) with no activation.
cross_entropy_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [None]:
# discriminator loss
def discriminator_loss(real_output, fake_output):
    """Return the discriminator loss for one batch.

    Real samples are scored against a target of ones and generated samples
    against a target of zeros; the two cross-entropy terms are summed.

    Args:
        real_output: Discriminator outputs for real documents.
        fake_output: Discriminator outputs for generated documents.
    """
    loss_on_real = cross_entropy_loss(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy_loss(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake
# generator loss
def generator_loss(fake_output):
    """Return the generator loss for one batch.

    The generator is rewarded when the discriminator scores its samples as
    real, so the outputs are compared against a target of ones.

    Args:
        fake_output: Discriminator outputs for generated documents.
    """
    wanted_labels = tf.ones_like(fake_output)
    return cross_entropy_loss(wanted_labels, fake_output)

In [None]:
# Separate Adam optimizers (same learning rate) because the two networks are
# updated independently.
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [None]:
# GAN training settings.
epochs = 50          # passes over train_dataset
noise_dim = 100      # length of the generator's noise input
n_generate = 4000    # number of synthetic document vectors to produce
# Fixed noise batch fed to the generator to produce the synthetic vectors.
seed = tf.random.normal(shape=[n_generate, noise_dim])

In [None]:
@tf.function
def training_step(documents):
    """Run one simultaneous update of the generator and the discriminator.

    Args:
        documents: A batch of real document vectors from ``train_dataset``.
    """
    # Generate exactly as many fakes as there are real samples in this batch.
    # The batch may be smaller than s_batch (last batch, or fewer training
    # documents than s_batch), and a hard-coded s_batch would then unbalance
    # the real and fake halves of the discriminator update.
    batch_size = tf.shape(documents)[0]
    noise = tf.random.normal([batch_size, noise_dim])

    # Two tapes so each network's gradients come from its own loss.
    with tf.GradientTape() as gtape, tf.GradientTape() as dtape:
        generated_documents = generator(noise, training=True)
        real_output = discriminator(documents, training=True)
        fake_output = discriminator(generated_documents, training=True)
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    grad_gen = gtape.gradient(gen_loss, generator.trainable_variables)
    grad_disc = dtape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(grad_gen, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(grad_disc, discriminator.trainable_variables))

In [None]:
def generate_docvecs(model, epoch, test_input):
    """Run ``model`` on ``test_input`` in inference mode and return its output.

    Args:
        model: Callable accepting ``(inputs, training=...)``.
        epoch: Accepted for interface compatibility; not used.
        test_input: Input forwarded to ``model``.
    """
    return model(test_input, training=False)

In [None]:
# This is the training loop
import time
def train(dataset, epochs):
    """Train the GAN and return vectors generated from the fixed ``seed`` noise.

    Args:
        dataset: Iterable of real document batches; each is passed to
            ``training_step``.
        epochs: Number of full passes over ``dataset``.

    Returns:
        The generator's output for ``seed``, produced after the final epoch.
    """
    for epoch_number in range(1, epochs + 1):
        epoch_started = time.time()
        for document_batch in dataset:
            training_step(document_batch)
        elapsed = time.time() - epoch_started

        print ('Time for epoch {} is {} sec'.format(epoch_number, elapsed))

    # Generate after the final epoch
    return generate_docvecs(generator, epochs, seed)

In [None]:
# Train the GAN, then keep the vectors it generates from the fixed seed noise.
generated_document_vectors = train(dataset=train_dataset, epochs=epochs)

Time for epoch 1 is 1.8609263896942139 sec
Time for epoch 2 is 0.010711193084716797 sec
Time for epoch 3 is 0.010044336318969727 sec
Time for epoch 4 is 0.009728670120239258 sec
Time for epoch 5 is 0.01110076904296875 sec
Time for epoch 6 is 0.011414527893066406 sec
Time for epoch 7 is 0.010553836822509766 sec
Time for epoch 8 is 0.012979745864868164 sec
Time for epoch 9 is 0.009850740432739258 sec
Time for epoch 10 is 0.009933233261108398 sec
Time for epoch 11 is 0.00986170768737793 sec
Time for epoch 12 is 0.010763168334960938 sec
Time for epoch 13 is 0.009564399719238281 sec
Time for epoch 14 is 0.009765386581420898 sec
Time for epoch 15 is 0.009802579879760742 sec
Time for epoch 16 is 0.00973057746887207 sec
Time for epoch 17 is 0.009823083877563477 sec
Time for epoch 18 is 0.009493589401245117 sec
Time for epoch 19 is 0.010235786437988281 sec
Time for epoch 20 is 0.011646270751953125 sec
Time for epoch 21 is 0.014807939529418945 sec
Time for epoch 22 is 0.013098478317260742 sec
Ti

In [None]:
# Convert the generated tensor to NumPy and drop the trailing channel axis,
# leaving one 32-dimensional vector per row.
generated_document_vectors = generated_document_vectors.numpy().reshape((n_generate, 32))

In [None]:
# Label every generated vector as hate speech (1.0), append them to the real
# training rows, and keep the held-out rows as the test set.
data = list(zip(generated_document_vectors, np.ones(n_generate)))
generated = pd.DataFrame(data, columns=['docvec', 'hate'])
original = rest.loc[:, ['docvec', 'hate']]
training_dataset = pd.concat([original, generated])
test_dataset = test.loc[:, ['docvec', 'hate']]

<h1>Building a Classifier</h1>

In [None]:
# Features: document vectors shaped (num_samples, 1, 32) to match the
# classifier's input_shape. Labels: the 'hate' column as uint8.
x_train = np.asarray(training_dataset['docvec'].tolist()).reshape(len(training_dataset), 1, 32)
x_test = np.asarray(test_dataset['docvec'].tolist()).reshape(len(test_dataset), 1, 32)
y_train = np.asarray(training_dataset['hate'].tolist(), dtype="uint8")
y_test = np.asarray(test_dataset['hate'].tolist(), dtype="uint8")

In [None]:
# Sanity check: number of held-out test labels.
y_test.shape

(392,)

Now let us define the classification model

In [None]:
def classification_model():
    """Build the hate-speech classifier.

    A stack of fully connected ReLU layers (64, 96, 48, 24, 12 units) over a
    (1, 32) input, ending in one sigmoid unit.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    net = tf.keras.Sequential()
    # The first layer also declares the input shape.
    net.add(layers.Dense(units=64, input_shape=(1, 32), activation='relu'))
    for width in (96, 48, 24, 12):
        net.add(layers.Dense(units=width, activation='relu'))
    net.add(layers.Dense(units=1, activation='sigmoid'))
    return net

In [None]:
# Instantiate the classifier and print its layer-by-layer summary.
c_model = classification_model()
c_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 1, 64)             2112      
_________________________________________________________________
dense_3 (Dense)              (None, 1, 96)             6240      
_________________________________________________________________
dense_4 (Dense)              (None, 1, 48)             4656      
_________________________________________________________________
dense_5 (Dense)              (None, 1, 24)             1176      
_________________________________________________________________
dense_6 (Dense)              (None, 1, 12)             300       
_________________________________________________________________
dense_7 (Dense)              (None, 1, 1)              13        
Total params: 14,497
Trainable params: 14,497
Non-trainable params: 0
__________________________________________________

Defining a loss function, then compiling and training the classifier

In [None]:
# Binary cross-entropy on the sigmoid output; accuracy, precision and recall
# are tracked during training.
loss_fn = tf.keras.losses.BinaryCrossentropy()
c_model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)
c_model.fit(x=x_train, y=y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fdd39a0d5d0>

In [None]:
# Predicted probabilities come back as (num_samples, 1, 1); take the scalar per
# sample and threshold at 0.5 to get hard 0/1 labels.
y_pre = (c_model.predict(x_test)[:, 0, 0] > 0.5).astype(int)

In [None]:
# Rows are true labels, columns are predicted labels.
tf.math.confusion_matrix(labels=y_test, predictions=y_pre)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[309,  39],
       [ 29,  15]], dtype=int32)>

In [None]:
# Returns [loss, accuracy, precision, recall] on the held-out test set.
c_model.evaluate(x=x_test, y=y_test)



[0.6473866105079651,
 0.8265306353569031,
 0.2777777910232544,
 0.34090909361839294]