In [1]:
import numpy        as np
import pandas       as pd
import tensorflow   as tf

import datetime
import gc
import operator
import sklearn 
import re

from mlxtend.plotting                           import plot_confusion_matrix
from gensim.models                              import KeyedVectors
from tqdm                                       import tqdm
from sklearn                                    import metrics 
from sklearn.metrics                            import confusion_matrix
from sklearn.model_selection                    import train_test_split

from tensorflow                                 import keras
from tensorflow.keras                           import layers
from tensorflow.keras.layers                    import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D,GRU,Bidirectional, GlobalMaxPool1D 
from tensorflow.keras.models                    import Model,load_model, Sequential
from tensorflow.keras.preprocessing.sequence    import pad_sequences
from tensorflow.keras.utils                     import plot_model
from tensorflow.keras.callbacks                 import ModelCheckpoint,EarlyStopping
from tensorflow.keras.optimizers                import Adam
from tensorboard.plugins                        import projector

In [2]:
def Preprocess(doc):
    """Normalise every text in ``doc`` and return the cleaned texts as a list.

    Steps applied to each text, in order:

    1. Each whitespace-separated token is passed through ``contraction_fix``
       and the tokens are re-joined with single spaces.
    2. Every character outside ``[a-zA-Z0-9]`` is replaced by a space.
    3. Every run of digits is replaced by one ``#`` per digit, capped at five
       (``"7"`` -> ``"#"``, ``"2019"`` -> ``"####"``, ``"1234567"`` -> ``"#####"``).

    Args:
        doc: iterable of strings.

    Returns:
        list[str]: cleaned texts, in the same order as ``doc``.
    """
    corpus = []
    for text in tqdm(doc):
        # Contractions must be expanded before punctuation is stripped,
        # otherwise the apostrophes the lookup relies on are already gone.
        text = " ".join(contraction_fix(w) for w in text.split())
        text = re.sub(r'[^a-z0-9A-Z]', " ", text)
        # One length-aware pass. The previous chain started with
        # `[0-9]{1}` -> '#', which consumed every digit and left the
        # {2}/{3}/{4}/{5,} substitutions with nothing to match, so long
        # numbers were never capped at '#####'.
        text = re.sub(r'[0-9]+', lambda m: "#" * min(len(m.group()), 5), text)
        corpus.append(text)
    return corpus

def contraction_fix(word):
    """Return the expansion of ``word`` from ``contractions``, or ``word`` itself.

    ``contractions`` is looked up as a module-level name at call time; the
    match is exact (case- and apostrophe-sensitive).
    """
    return contractions.get(word, word)

def get_vocab(corpus):
    """Count whitespace-separated tokens over all texts in ``corpus``.

    Args:
        corpus: iterable of strings.

    Returns:
        dict: token -> occurrence count, ordered from most to least frequent.
        Tokens with equal counts keep their first-seen order (stable sort).
    """
    counts = {}
    for document in tqdm(corpus):
        for token in document.split():
            counts[token] = counts.get(token, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def get_word_index(vocab):
    """Map each key of ``vocab`` to its 1-based position in iteration order.

    Index 0 is never assigned here, leaving it free for callers to use as a
    padding / unknown-token id.
    """
    return {token: position for position, token in enumerate(vocab, start=1)}

def fit_one_hot(word_index,corpus):
    """Encode each text in ``corpus`` as a list of integer token ids.

    Despite the name, the result is an integer encoding, not one-hot vectors:
    every whitespace-separated token is replaced by ``word_index[token]``, and
    tokens missing from ``word_index`` become ``0``.

    Args:
        word_index: dict mapping token -> integer id.
        corpus: iterable of strings.

    Returns:
        list[list[int]]: one id list per text, in input order.
    """
    return [
        [word_index.get(token, 0) for token in document.split()]
        for document in tqdm(corpus)
    ]

In [4]:
# Contraction -> expansion table consumed by `contraction_fix`.
# It lives at module level because `contraction_fix` resolves `contractions`
# as a global name; when this dict was a local inside `eval_lstm` it was
# invisible to that helper (NameError on a fresh kernel).
contractions = {
    # --- straight-apostrophe forms and slang -------------------------------
    "I'm": 'I am', "I'm'a": 'I am about to', "I'm'o": 'I am going to',
    "I've": 'I have', "I'll": 'I will', "I'll've": 'I will have',
    "I'd": 'I would', "I'd've": 'I would have', 'Whatcha': 'What are you',
    "amn't": 'am not', "ain't": 'are not', "aren't": 'are not',
    "'cause": 'because', "can't": 'can not', "can't've": 'can not have',
    "could've": 'could have', "couldn't": 'could not',
    "couldn't've": 'could not have', "daren't": 'dare not',
    "daresn't": 'dare not', "dasn't": 'dare not', "didn't": 'did not',
    'didn’t': 'did not', "don't": 'do not', 'don’t': 'do not',
    "doesn't": 'does not', "e'er": 'ever', "everyone's": 'everyone is',
    'finna': 'fixing to', 'gimme': 'give me', "gon't": 'go not',
    'gonna': 'going to', 'gotta': 'got to', "hadn't": 'had not',
    "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not',
    "he've": 'he have', "he's": 'he is', "he'll": 'he will',
    "he'll've": 'he will have', "he'd": 'he would', "he'd've": 'he would have',
    "here's": 'here is', "how're": 'how are', "how'd": 'how did',
    "how'd'y": 'how do you', "how's": 'how is', "how'll": 'how will',
    "isn't": 'is not', "it's": 'it is', "'tis": 'it is', "'twas": 'it was',
    "it'll": 'it will', "it'll've": 'it will have', "it'd": 'it would',
    "it'd've": 'it would have', 'kinda': 'kind of', "let's": 'let us',
    'luv': 'love', "ma'am": 'madam', "may've": 'may have', "mayn't": 'may not',
    "might've": 'might have', "mightn't": 'might not',
    "mightn't've": 'might not have', "must've": 'must have',
    "mustn't": 'must not', "mustn't've": 'must not have',
    "needn't": 'need not', "needn't've": 'need not have', "ne'er": 'never',
    "o'": 'of', "o'clock": 'of the clock', "ol'": 'old',
    "oughtn't": 'ought not', "oughtn't've": 'ought not have', "o'er": 'over',
    "shan't": 'shall not', "sha'n't": 'shall not', "shalln't": 'shall not',
    "shan't've": 'shall not have', "she's": 'she is', "she'll": 'she will',
    "she'd": 'she would', "she'd've": 'she would have',
    "should've": 'should have', "shouldn't": 'should not',
    "shouldn't've": 'should not have', "so've": 'so have', "so's": 'so is',
    "somebody's": 'somebody is', "someone's": 'someone is',
    "something's": 'something is', 'sux': 'sucks', "that're": 'that are',
    "that's": 'that is', "that'll": 'that will', "that'd": 'that would',
    "that'd've": 'that would have', 'em': 'them', "there're": 'there are',
    "there's": 'there is', "there'll": 'there will', "there'd": 'there would',
    "there'd've": 'there would have', "these're": 'these are',
    "they're": 'they are', "they've": 'they have', "they'll": 'they will',
    "they'll've": 'they will have', "they'd": 'they would',
    "they'd've": 'they would have', "this's": 'this is',
    "those're": 'those are', "to've": 'to have', 'wanna': 'want to',
    "wasn't": 'was not', "we're": 'we are', "we've": 'we have',
    "we'll": 'we will', "we'll've": 'we will have', "we'd": 'we would',
    "we'd've": 'we would have', "weren't": 'were not', "what're": 'what are',
    "what'd": 'what did', "what've": 'what have', "what's": 'what is',
    "what'll": 'what will', "what'll've": 'what will have',
    "when've": 'when have', "when's": 'when is', "where're": 'where are',
    "where'd": 'where did', "where've": 'where have', "where's": 'where is',
    "which's": 'which is', "who're": 'who are', "who've": 'who have',
    "who's": 'who is', "who'll": 'who will', "who'll've": 'who will have',
    "who'd": 'who would', "who'd've": 'who would have', "why're": 'why are',
    "why'd": 'why did', "why've": 'why have', "why's": 'why is',
    "will've": 'will have', "won't": 'will not', "won't've": 'will not have',
    "would've": 'would have', "wouldn't": 'would not',
    "wouldn't've": 'would not have', "y'all": 'you all',
    "y'all're": 'you all are', "y'all've": 'you all have',
    "y'all'd": 'you all would', "y'all'd've": 'you all would have',
    "you're": 'you are', "you've": 'you have', "you'll've": 'you shall have',
    "you'll": 'you will', "you'd": 'you would', "you'd've": 'you would have',
    # --- month abbreviations -----------------------------------------------
    'jan.': 'january', 'feb.': 'february', 'mar.': 'march', 'apr.': 'april',
    'jun.': 'june', 'jul.': 'july', 'aug.': 'august', 'sep.': 'september',
    'oct.': 'october', 'nov.': 'november', 'dec.': 'december',
    # --- typographic (curly) apostrophe forms ------------------------------
    'I’m': 'I am', 'I’m’a': 'I am about to', 'I’m’o': 'I am going to',
    'I’ve': 'I have', 'I’ll': 'I will', 'I’ll’ve': 'I will have',
    'I’d': 'I would', 'I’d’ve': 'I would have', 'amn’t': 'am not',
    'ain’t': 'are not', 'aren’t': 'are not', '’cause': 'because',
    'can’t': 'can not', 'can’t’ve': 'can not have', 'could’ve': 'could have',
    'couldn’t': 'could not', 'couldn’t’ve': 'could not have',
    'daren’t': 'dare not', 'daresn’t': 'dare not', 'dasn’t': 'dare not',
    'doesn’t': 'does not', 'e’er': 'ever', 'everyone’s': 'everyone is',
    'gon’t': 'go not', 'hadn’t': 'had not', 'hadn’t’ve': 'had not have',
    'hasn’t': 'has not', 'haven’t': 'have not', 'he’ve': 'he have',
    'he’s': 'he is', 'he’ll': 'he will', 'he’ll’ve': 'he will have',
    'he’d': 'he would', 'he’d’ve': 'he would have', 'here’s': 'here is',
    'how’re': 'how are', 'how’d': 'how did', 'how’d’y': 'how do you',
    'how’s': 'how is', 'how’ll': 'how will', 'isn’t': 'is not',
    'it’s': 'it is', '’tis': 'it is', '’twas': 'it was', 'it’ll': 'it will',
    'it’ll’ve': 'it will have', 'it’d': 'it would', 'it’d’ve': 'it would have',
    'let’s': 'let us', 'ma’am': 'madam', 'may’ve': 'may have',
    'mayn’t': 'may not', 'might’ve': 'might have', 'mightn’t': 'might not',
    'mightn’t’ve': 'might not have', 'must’ve': 'must have',
    'mustn’t': 'must not', 'mustn’t’ve': 'must not have',
    'needn’t': 'need not', 'needn’t’ve': 'need not have', 'ne’er': 'never',
    'o’': 'of', 'o’clock': 'of the clock', 'ol’': 'old',
    'oughtn’t': 'ought not', 'oughtn’t’ve': 'ought not have', 'o’er': 'over',
    'shan’t': 'shall not', 'sha’n’t': 'shall not', 'shalln’t': 'shall not',
    'shan’t’ve': 'shall not have', 'she’s': 'she is', 'she’ll': 'she will',
    'she’d': 'she would', 'she’d’ve': 'she would have',
    'should’ve': 'should have', 'shouldn’t': 'should not',
    'shouldn’t’ve': 'should not have', 'so’ve': 'so have', 'so’s': 'so is',
    'somebody’s': 'somebody is', 'someone’s': 'someone is',
    'something’s': 'something is', 'that’re': 'that are', 'that’s': 'that is',
    'that’ll': 'that will', 'that’d': 'that would',
    'that’d’ve': 'that would have', 'there’re': 'there are',
    'there’s': 'there is', 'there’ll': 'there will', 'there’d': 'there would',
    'there’d’ve': 'there would have', 'these’re': 'these are',
    'they’re': 'they are', 'they’ve': 'they have', 'they’ll': 'they will',
    'they’ll’ve': 'they will have', 'they’d': 'they would',
    'they’d’ve': 'they would have', 'this’s': 'this is',
    'those’re': 'those are', 'to’ve': 'to have', 'wasn’t': 'was not',
    'we’re': 'we are', 'we’ve': 'we have', 'we’ll': 'we will',
    'we’ll’ve': 'we will have', 'we’d': 'we would', 'we’d’ve': 'we would have',
    'weren’t': 'were not', 'what’re': 'what are', 'what’d': 'what did',
    'what’ve': 'what have', 'what’s': 'what is', 'what’ll': 'what will',
    'what’ll’ve': 'what will have', 'when’ve': 'when have',
    'when’s': 'when is', 'where’re': 'where are', 'where’d': 'where did',
    'where’ve': 'where have', 'where’s': 'where is', 'which’s': 'which is',
    'who’re': 'who are', 'who’ve': 'who have', 'who’s': 'who is',
    'who’ll': 'who will', 'who’ll’ve': 'who will have', 'who’d': 'who would',
    'who’d’ve': 'who would have', 'why’re': 'why are', 'why’d': 'why did',
    'why’ve': 'why have', 'why’s': 'why is', 'will’ve': 'will have',
    'won’t': 'will not', 'won’t’ve': 'will not have',
    'would’ve': 'would have', 'wouldn’t': 'would not',
    'wouldn’t’ve': 'would not have', 'y’all': 'you all',
    'y’all’re': 'you all are', 'y’all’ve': 'you all have',
    'y’all’d': 'you all would', 'y’all’d’ve': 'you all would have',
    'you’re': 'you are', 'you’ve': 'you have', 'you’ll’ve': 'you shall have',
    'you’ll': 'you will', 'you’d': 'you would', 'you’d’ve': 'you would have',
}


def eval_lstm(all_data):
    """Train a BiGRU + Conv1D binary text classifier and report its quality.

    The data is split 70/30 into ``train`` and a remainder, and the remainder
    is split 70/30 again into ``test`` and ``validation``. Note the naming:
    ``test`` is the set passed to Keras as ``validation_data`` (it drives
    early stopping and learning-rate reduction), while ``validation`` is the
    hold-out set the model never sees during fitting.

    Side effects: prints the model summary and metrics, shows a loss curve
    and a confusion matrix with matplotlib, and writes TensorBoard logs under
    ``logs/fit/<timestamp>``.

    Args:
        all_data: table accepted by ``train_test_split`` whose splits expose a
            ``text`` column (raw strings) and a ``target`` column (0/1
            labels) and can be merged with ``pd.merge`` on ``text``.

    Returns:
        None.
    """
    # matplotlib is not imported at notebook level, so bring it in here.
    import matplotlib.pyplot as plt

    MAX_FEATURES = 3000   # vocabulary size (most frequent training tokens)
    MAX_LENGTH = 40       # tokens per sequence after padding/truncation
    BATCH_SIZE = 512
    NUM_EPOCHS = 5
    FEATURE_VEC = 300     # embedding dimension
    VERBOSE = False       # print every misclassified hold-out example

    train, ds_set = train_test_split(all_data, test_size=0.3, random_state=42)
    test, validation = train_test_split(ds_set, test_size=0.3, random_state=42)

    train_text = Preprocess(train.text)
    test_text = Preprocess(test.text)
    validation_text = Preprocess(validation.text)

    # The vocabulary is built from the training split only so that no
    # information from the evaluation splits leaks into the encoding.
    vocab = get_vocab(train_text)
    top_feat = dict(list(vocab.items())[:MAX_FEATURES])
    word_index = get_word_index(top_feat)

    def encode(texts):
        """Integer-encode ``texts`` and pad/truncate them to MAX_LENGTH."""
        return pad_sequences(fit_one_hot(word_index, texts),
                             maxlen=MAX_LENGTH, padding="post")

    train_padded = encode(train_text)
    test_padded = encode(test_text)
    validation_padded = encode(validation_text)

    # +1 on the embedding input size: ids run from 1..MAX_FEATURES and 0 is
    # used for both padding and out-of-vocabulary tokens.
    inp = Input(shape=(MAX_LENGTH,))
    x = Embedding(MAX_FEATURES + 1, FEATURE_VEC)(inp)
    x = Bidirectional(GRU(128, return_sequences=True))(x)
    x = Conv1D(64, 5, activation="relu")(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.2)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)

    # summary() prints by itself and returns None; wrapping it in print()
    # only appended a stray "None" to the output.
    model.summary()

    opt = Adam(learning_rate=0.002)
    bin_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False, label_smoothing=0.2, name='binary_crossentropy')

    # Callbacks.
    # see: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, mode="min", restore_best_weights=True)

    # Reduce the learning rate when val_loss stops improving.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=2, verbose=1, mode="auto")

    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    my_callbacks = [early_stopping, reduce_lr, tensorboard_callback]
    model.compile(loss=bin_loss, optimizer=opt, metrics=['accuracy'])

    history = model.fit(train_padded, train.target,
                        batch_size=BATCH_SIZE,
                        epochs=NUM_EPOCHS,
                        validation_data=(test_padded, test.target),
                        callbacks=my_callbacks)

    # Plot the training history. The plotted series is the (label-smoothed)
    # binary cross-entropy loss, not MAE, so it is labelled as loss.
    plt.plot(history.history['loss'], label='Loss (training data)')
    plt.plot(history.history['val_loss'], label='Loss (test data)')
    plt.title('Loss for Clickbait')
    plt.ylabel('Loss value')
    plt.xlabel('No. epoch')
    plt.legend(loc="upper left")
    plt.show()

    score = model.evaluate(test_padded, test.target, verbose=1)
    print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

    # Check the model against the hold-out set.
    validation_results = model.predict(validation_padded)
    # predict() yields one column per output unit; take column 0 explicitly
    # instead of calling int() on a 1-element array (deprecated in NumPy).
    guesses = np.round(validation_results[:, 0]).astype(int)
    target = validation.target.to_numpy()
    total = len(validation)
    right = int(np.sum(guesses == target))
    wrong = total - right

    if VERBOSE:
        for i in np.flatnonzero(guesses != target):
            print(f"Text: {validation_text[i]}\n\tGuessed Wrong: {guesses[i]}({validation_results[i]}), Actual: {target[i]}")

    print(f'Right: {right}, Wrong: {wrong}, Accuracy {right*100/total:2.2f}%')

    # Leakage check: texts that occur in more than one split.
    s1 = pd.merge(train, validation, how='inner', on=['text'])
    s2 = pd.merge(validation, test, how='inner', on=['text'])
    print(f'Interractions between train and validation: {s1.size}')
    # Was printing s1.size a second time, so the test/validation overlap
    # was never actually reported.
    print(f'Interractions between test and validation: {s2.size}')

    preds = np.round(model.predict(test_padded)[:, 0]).astype(int)
    cm = confusion_matrix(test.target, preds)
    # No plt.figure() here: plot_confusion_matrix builds its own figure from
    # `figsize`, so an extra figure would only render as an empty canvas.
    plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
    plt.xticks(range(2), ['Not clickbait', 'Clickbait'], fontsize=16)
    plt.yticks(range(2), ['Not clickbait', 'Clickbait'], fontsize=16)
    plt.show()