# TCN Classification with Super Dataset

In [None]:
import tensorflow as tf
print(tf.__version__)
print(tf.config.list_physical_devices('GPU')) #"/GPU:0": The first GPU of machine that is visible to TensorFlow.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
import pandas as pd
import numpy as np
import re
import nltk
import random
# from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

# nltk.download('twitter_samples')
print ("Initiated")

## Load the Dataset

In [None]:
# Read the training CSV and give its two columns fixed names.
corpus = pd.read_csv("super23_train.csv", encoding='latin-1')
corpus.columns = ["sentence", "label"]
# Force the label column to an integer dtype.
corpus["label"] = corpus["label"].astype(int)
print(corpus.shape)
corpus

In [None]:
# Pull the two columns out as parallel Python lists, then peek at the first sentence
sentences = list(corpus["sentence"])
labels = list(corpus["label"])
sentences[0]

'YOU HAVE WON! As a valued Vodafone customer our computer has picked YOU to win a ?150 prize. To collect is easy. Just call 09061743386'

In [None]:
# Define a function to compute the max length of sequence
def max_length(sequences):
    '''
    Return the length of the longest sequence.

    input:
        sequences: an iterable of sized sequences (e.g. a 2D list of integer sequences)
    output:
        the largest len() found among them, or 0 when there are none
    '''
    return max((len(seq) for seq in sequences), default=0)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Both padding and truncation are requested at the end ('post') of a sequence;
# oov_tok is the token the Tokenizer is told to use for out-of-vocabulary words.
trunc_type='post'
padding_type='post'
oov_tok = "<UNK>"

print("Example of sentence: ", sentences[4])

# Cleaning and Tokenization: fit the vocabulary on the training sentences
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)

# Turn the text into sequence (one list of integer ids per sentence)
training_sequences = tokenizer.texts_to_sequences(sentences)
# The longest encoded sentence defines the common padded length
max_len = max_length(training_sequences)

print('Into a sequence of int:', training_sequences[4])

# Pad the sequence to have the same size
training_padded = pad_sequences(training_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
# print('Into a padded sequence:', training_padded[4])

In [None]:
word_index = tokenizer.word_index
# Show the first 10 (word, integer id) pairs of the vocabulary
for i, (word, idx) in enumerate(word_index.items()):
    print(word, idx)
    if i == 9:
        break
# One extra slot on top of the known words, as used for the Embedding input_dim later
vocab_size = len(word_index) + 1
print("Vocab Size: ",vocab_size)

# Model 1: Embedding Random
<hr>

In [None]:
# !pip install keras-tcn==3.3.0

In [None]:
from tcn import TCN, tcn_full_summary
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.models import Model

def define_model(kernel_size = 3, activation='relu', input_dim = None, output_dim=300, max_length = None ):
    '''
    Build and compile a TCN binary classifier whose embedding layer starts
    from Keras' default initialisation and is trained with the rest of the network.

    input:
        kernel_size: kernel size handed to both TCN layers; an int, or a
                     one-element list/tuple such as [3] (unwrapped here)
        activation: activation handed to both TCN layers
        input_dim: number of rows of the embedding table (vocabulary size)
        output_dim: dimensionality of each embedding vector
        max_length: length of the padded integer input sequences
    output:
        model: a compiled Keras Model (binary cross-entropy, Adam, accuracy metric)
    raises:
        ValueError: if kernel_size is a list/tuple that does not hold exactly one value
    '''
    # Callers in this notebook pass kernel_size wrapped in a one-element list ([3]);
    # unwrap it so the TCN layers receive a plain integer.
    if isinstance(kernel_size, (list, tuple)):
        if len(kernel_size) != 1:
            raise ValueError("kernel_size must be an int or a one-element list/tuple, got {!r}".format(kernel_size))
        kernel_size = kernel_size[0]

    inp = Input( shape=(max_length,))
    x = Embedding(input_dim=input_dim, output_dim=output_dim, input_length=max_length)(inp)
    x = SpatialDropout1D(0.1)(x)

    # kernel_size is now forwarded explicitly; previously it was accepted but
    # never used, so the layers silently ran with the library default.
    x = TCN(128, kernel_size=kernel_size, dilations = [1, 2, 4], return_sequences=True, activation = activation, name = 'tcn1')(x)
    x = TCN(64, kernel_size=kernel_size, dilations = [1, 2, 4], return_sequences=True, activation = activation, name = 'tcn2')(x)

    # Summarise the per-step TCN outputs with both average and max pooling
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    conc = concatenate([avg_pool, max_pool])
    conc = Dense(16, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    # Single sigmoid unit: one score in [0, 1] per input sequence
    outp = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile( loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

    return model

In [None]:
# Build a throw-away model with small placeholder sizes just to print its layer summary
model_0 = define_model(input_dim=1000, max_length=100)
print("Summary: ")
model_0.summary()

In [None]:
# tcn_full_summary(model_0)

In [None]:
# class myCallback(tf.keras.callbacks.Callback):
#     # Overide the method on_epoch_end() for our benefit
#     def on_epoch_end(self, epoch, logs={}):
#         if (logs.get('accuracy') > 0.93):
#             print("\nReached 93% accuracy so cancelling training!")
#             self.model.stop_training=True

# The fit() calls in this notebook pass no validation data, so `val_accuracy`
# is never logged and a callback monitoring it can never act. Monitor the
# training accuracy, which is the metric fit() actually reports here.
callbacks = tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0, 
                                             patience=10, verbose=2, 
                                             mode='auto', restore_best_weights=True)

## Train and Test the Model

In [None]:
import time
# Parameter Initialization
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<UNK>"
activation = "relu"
kernel_size = [3]

# Empty results table; nothing in this cell writes to it
columns = ['Activation', 'Filters', 'acc1', 'acc2', 'acc3', 'acc4', 'acc5', 'AVG']
record = pd.DataFrame(columns = columns)

# Separate the sentences and the labels
sentences = list(corpus["sentence"])
labels = list(corpus["label"])
exp = 0
train_x = sentences
# Turn the labels into a numpy array
train_y = np.array(labels)

# Fit a fresh tokenizer on the training text and encode it as integer sequences
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(train_x)
training_sequences = tokenizer.texts_to_sequences(train_x)
max_len = max_length(training_sequences)

# Pad the sequence to have the same size
Xtrain = pad_sequences(
    training_sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Build the model for this vocabulary size and sequence length
model = define_model(kernel_size, activation, input_dim=vocab_size, max_length=max_len)

# Train the model; only the fit() call is timed
start = time.time()
model.fit(Xtrain, train_y, batch_size=16, epochs=10, verbose=1, callbacks=[callbacks])
stop = time.time()
print(f"Training time per fold: {stop - start}s")

## Summary

In [None]:
# Read the test CSV with the same two-column layout used for the training data
corpus1 = pd.read_csv("super23_test.csv", encoding='latin-1')
corpus1.columns = ["sentence", "label"]
corpus1["label"] = corpus1["label"].astype(int)
print(corpus1.shape)

# Separate the sentences and the labels
sentences1 = list(corpus1["sentence"])
labels1 = list(corpus1["label"])
print(sentences1[0])

test_x1 = sentences1
test_y1 = labels1

# Encode with the tokenizer currently in scope and pad to the same max_len as Xtrain
test_sequences1 = tokenizer.texts_to_sequences(test_x1)
Xtest1 = pad_sequences(
    test_sequences1,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

print("Messages length", len(test_x1))

(13148, 2)
download whichapp for whatsapp friends to see your friends apps and also save battery by  you have  friend waiting httpbitlyogmdkv
Messages length 13148


In [None]:
# Time the prediction only; scores above 0.5 are mapped to class 1
start = time.time()
pred_lbl = (model.predict(Xtest1) > 0.5).astype("int32")
stop = time.time()
print(f"classification time: {stop - start}s")

# Write the predicted labels to disk, then read them back as a one-column frame
pd.DataFrame(pred_lbl).to_csv('cdar7.csv', index=False)
messages = pd.read_csv("cdar7.csv", encoding='latin-1')
messages.columns = ["labels"]
print (messages.tail(3))

from sklearn import metrics
# Per-class precision/recall/F1 followed by the raw confusion matrix
print(metrics.classification_report(labels1, messages["labels"]))
print(metrics.confusion_matrix(labels1, messages["labels"]))

# Printing the Overall Accuracy of the model
print("Accuracy of the model : {0:0.3f}".format(metrics.accuracy_score(labels1, messages["labels"])))

classification time: 4.84154748916626s
       labels
13145       0
13146       0
13147       0
              precision    recall  f1-score   support

           0       0.58      0.97      0.72      1918
           1       0.99      0.88      0.93     11230

    accuracy                           0.89     13148
   macro avg       0.79      0.92      0.83     13148
weighted avg       0.93      0.89      0.90     13148

[[1856   62]
 [1361 9869]]
Accuracy of the model : 0.892


In [None]:
# !pip install openpyxl

# Model 2: Word2Vec Static

__Using and updating pre-trained embeddings__
* In this part, we will create an Embedding layer in TensorFlow Keras using a pre-trained word embedding called Word2Vec (300-d) that has been trained on 100 billion words from Google News.
* In this part, we will keep the embeddings fixed (static) instead of updating them during training (dynamic).

1. __Load `Word2Vec` Pre-trained Word Embedding__

In [None]:
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
# !gunzip ./GoogleNews-vectors-negative300.bin.gz

In [None]:
# !pip install gensim

In [None]:
import gensim
from gensim.models import KeyedVectors
# Load the binary-format GoogleNews vectors from the working directory
# (the commented-out wget/gunzip cells above show where the file comes from).
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Access the dense vector value for a single word.
# get_vector is used (as in pretrained_embedding_matrix below) because
# word_vec is a deprecated alias in recent gensim releases.
# word2vec.get_vector('handsome') # 0.11376953
word2vec.get_vector('cool') # 1.64062500e-01

2. __Check number of training words present in Word2Vec__

In [None]:
def training_words_in_word2vector(word_to_vec_map, word_to_index):
    '''
    Print how many words of the training vocabulary have a pre-trained vector.

    input:
        word_to_vec_map: a word2vec GoogleNews-vectors-negative300.bin model loaded using gensim.models
                         (only `word in word_to_vec_map` is used)
        word_to_index: word to index mapping from training set
    output:
        None; the summary line is printed. The vocabulary size it reports is
        len(word_to_index) + 1.
    '''
    total = len(word_to_index) + 1
    # Count the vocabulary words that the pre-trained model knows
    found = sum(1 for word in word_to_index if word in word_to_vec_map)
    message = 'Found {} words present from {} training vocabulary in the set of pre-trained word vector'.format(found, total)
    return print(message)

In [None]:
# Separate the sentences and the labels
sentences, labels = list(corpus.sentence), list(corpus.label)

# Cleaning and Tokenization (re-fit on the training sentences; oov_tok comes from an earlier cell)
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)

# Report how many vocabulary words have a pre-trained word2vec vector
word_index = tokenizer.word_index
training_words_in_word2vector(word2vec, word_index)

Found 19806 words present from 51609 training vocabulary in the set of pre-trained word vector


3. __Define a `pretrained_embedding_matrix` function__

In [None]:
# Mean and standard deviation over every value of the pre-trained vectors;
# later passed to pretrained_embedding_matrix as the normal-distribution parameters.
emb_mean = word2vec.vectors.mean()
emb_std = word2vec.vectors.std()

In [None]:
from tensorflow.keras.layers import Embedding

def pretrained_embedding_matrix(word_to_vec_map, word_to_index, emb_mean, emb_std):
    '''
    Build the initial weight matrix for an Embedding layer from pre-trained vectors.

    Every row starts as a draw from a normal distribution N(emb_mean, emb_std);
    rows whose word exists in word_to_vec_map are then overwritten with that
    word's pre-trained vector. The global NumPy RNG is re-seeded with 2021 on
    every call, so the random rows are the same from call to call.

    input:
        word_to_vec_map: a word2vec GoogleNews-vectors-negative300.bin model loaded using gensim.models;
                         must expose `vector_size`, `get_vector(word)` and `word in model`
        word_to_index: word to index mapping from training set
        emb_mean: mean of the normal distribution used for the initial rows
        emb_std: standard deviation of that distribution
    output:
        embed_matrix: numpy array of shape (len(word_to_index) + 1, vector_size)
    '''
    np.random.seed(2021)

    # adding 1 to fit Keras embedding (requirement)
    vocab_size = len(word_to_index) + 1
    # Read the dimensionality from the model itself rather than looking up a
    # hard-coded word, which fails whenever that word is missing from the vectors.
    emb_dim = word_to_vec_map.vector_size

    # initialize the matrix with generic normal distribution values
    embed_matrix = np.random.normal(emb_mean, emb_std, (vocab_size, emb_dim))

    # Set each row "idx" of the embedding matrix to be
    # the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        if word in word_to_vec_map:
            embed_matrix[idx] = word_to_vec_map.get_vector(word)

    return embed_matrix

In [None]:
# Test the function
w_2_i = {'<UNK>': 1, 'handsome': 2, 'cool': 3, 'shit': 4 }
em_matrix = pretrained_embedding_matrix(word2vec, w_2_i, emb_mean, emb_std)
em_matrix

## TCN Model

In [None]:
def tcn_model(kernel_size = 3, activation='relu', input_dim = None, 
                   output_dim=300, max_length = None, emb_matrix = None):
    '''
    Build and compile a TCN binary classifier on top of a frozen (static)
    pre-trained embedding.

    input:
        kernel_size: kernel size handed to both TCN layers; an int, or a
                     one-element list/tuple such as [3] (unwrapped here)
        activation: activation handed to both TCN layers
        input_dim: number of rows of the embedding table (vocabulary size)
        output_dim: dimensionality of each embedding vector
        max_length: length of the padded integer input sequences
        emb_matrix: initial embedding weights, expected as an
                    (input_dim, output_dim) array; required
    output:
        model: a compiled Keras Model (binary cross-entropy, Adam, accuracy metric)
    raises:
        ValueError: if emb_matrix is None, or kernel_size is a list/tuple that
                    does not hold exactly one value
    '''
    # Fail early with a clear message instead of handing weights=[None] to Keras
    if emb_matrix is None:
        raise ValueError("emb_matrix is required: pass the pre-trained embedding matrix")

    # Callers in this notebook pass kernel_size wrapped in a one-element list ([3]);
    # unwrap it so the TCN layers receive a plain integer.
    if isinstance(kernel_size, (list, tuple)):
        if len(kernel_size) != 1:
            raise ValueError("kernel_size must be an int or a one-element list/tuple, got {!r}".format(kernel_size))
        kernel_size = kernel_size[0]

    inp = Input( shape=(max_length,))
    x = Embedding(input_dim=input_dim, 
                  output_dim=output_dim, 
                  input_length=max_length,
                  # Assign the embedding weight with word2vec embedding matrix
                  weights = [emb_matrix],
                  # Set the weight to be not trainable (static)
                  trainable = False)(inp)

    x = SpatialDropout1D(0.1)(x)

    # kernel_size is now forwarded explicitly; previously it was accepted but
    # never used, so the layers silently ran with the library default.
    x = TCN(128, kernel_size=kernel_size, dilations = [1, 2, 4], return_sequences=True, activation = activation, name = 'tcn1')(x)
    x = TCN(64, kernel_size=kernel_size, dilations = [1, 2, 4], return_sequences=True, activation = activation, name = 'tcn2')(x)

    # Summarise the per-step TCN outputs with both average and max pooling
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    conc = concatenate([avg_pool, max_pool])
    conc = Dense(16, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    # Single sigmoid unit: one score in [0, 1] per input sequence
    outp = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile( loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

    return model

In [None]:
# Build a throw-away static-embedding model with placeholder sizes and random
# weights just to print its layer summary
model_0 = tcn_model(input_dim=1000, max_length=100, emb_matrix=np.random.rand(1000, 300)) 
model_0.summary()

## Train and Test the Model

In [None]:
# class myCallback(tf.keras.callbacks.Callback):
#     # Overide the method on_epoch_end() for our benefit
#     def on_epoch_end(self, epoch, logs={}):
#         if (logs.get('accuracy') >= 0.9):
#             print("\nReached 90% accuracy so cancelling training!")
#             self.model.stop_training=True

# The fit() calls in this notebook pass no validation data, so `val_accuracy`
# is never logged and a callback monitoring it can never act. Monitor the
# training accuracy, which is the metric fit() actually reports here.
callbacks = tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0, 
                                             patience=10, verbose=2, 
                                             mode='auto', restore_best_weights=True)

In [None]:
import time
# Parameter Initialization
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<UNK>"
activation = 'relu'
print('Loading embedding statistics . . .')
# emb_mean and emb_std are used as-is from the cell that computed them earlier
print('Done!')
kernel_size = [3]

# Separate the sentences and the labels
sentences = list(corpus["sentence"])
labels = list(corpus["label"])
exp = 0

train_x = sentences
# Turn the labels into a numpy array
train_y = np.array(labels)

# Fit a fresh tokenizer on the training text and encode it as integer sequences
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(train_x)
training_sequences = tokenizer.texts_to_sequences(train_x)
max_len = max_length(training_sequences)

# Pad the sequence to have the same size
Xtrain = pad_sequences(
    training_sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
# Embedding weights: word2vec rows where available, random normal rows otherwise
emb_matrix = pretrained_embedding_matrix(word2vec, word_index, emb_mean, emb_std)

# Build the static-embedding model for this vocabulary size and sequence length
model2 = tcn_model(kernel_size, activation, input_dim=vocab_size, max_length=max_len, emb_matrix=emb_matrix)

# Train the model; only the fit() call is timed
start = time.time()
model2.fit(Xtrain, train_y, batch_size=50, epochs=7, verbose=1, callbacks=[callbacks])
stop = time.time()
print(f"Training time per fold: {stop - start}s")

In [None]:
# Read the test CSV with the same two-column layout used for the training data
corpus1 = pd.read_csv("super23_test.csv", encoding='latin-1')
corpus1.columns = ["sentence", "label"]
corpus1["label"] = corpus1["label"].astype(int)
print(corpus1.shape)

# Separate the sentences and the labels
sentences1 = list(corpus1["sentence"])
labels1 = list(corpus1["label"])
print(sentences1[0])

test_x1 = sentences1
test_y1 = labels1

# Encode with the tokenizer currently in scope and pad to the same max_len as Xtrain
test_sequences1 = tokenizer.texts_to_sequences(test_x1)
Xtest1 = pad_sequences(
    test_sequences1,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

print("Messages length", len(test_x1))

(13148, 2)
download whichapp for whatsapp friends to see your friends apps and also save battery by  you have  friend waiting httpbitlyogmdkv
Messages length 13148


In [None]:
# Time the prediction only; scores above 0.5 are mapped to class 1
start = time.time()
pred_lbl = (model2.predict(Xtest1) > 0.5).astype("int32")
stop = time.time()
print(f"classification time: {stop - start}s")

# Write the predicted labels to disk, then read them back as a one-column frame
pd.DataFrame(pred_lbl).to_csv('cdar5.csv', index=False)
messages = pd.read_csv("cdar5.csv", encoding='latin-1')
messages.columns = ["labels"]
print (messages.tail(3))

from sklearn import metrics
# Per-class precision/recall/F1 followed by the raw confusion matrix
print(metrics.classification_report(labels1, messages["labels"]))
print(metrics.confusion_matrix(labels1, messages["labels"]))

# Printing the Overall Accuracy of the model
print("Accuracy of the model : {0:0.3f}".format(metrics.accuracy_score(labels1, messages["labels"])))

# Model 3: Word2Vec - Dynamic

* In this part,  we will fine tune the embeddings while training (dynamic).

## TCN Model

In [None]:
def define_model_3(kernel_size = 3, activation='relu', input_dim = None, 
                   output_dim=300, max_length = None, emb_matrix = None):
    '''
    Build and compile a TCN binary classifier whose embedding is initialised
    from a pre-trained matrix and fine-tuned during training (dynamic).

    input:
        kernel_size: kernel size handed to both TCN layers; an int, or a
                     one-element list/tuple such as [3] (unwrapped here)
        activation: activation handed to both TCN layers
        input_dim: number of rows of the embedding table (vocabulary size)
        output_dim: dimensionality of each embedding vector
        max_length: length of the padded integer input sequences
        emb_matrix: initial embedding weights, expected as an
                    (input_dim, output_dim) array; required
    output:
        model: a compiled Keras Model (binary cross-entropy, Adam, accuracy metric)
    raises:
        ValueError: if emb_matrix is None, or kernel_size is a list/tuple that
                    does not hold exactly one value
    '''
    # Fail early with a clear message instead of handing weights=[None] to Keras
    if emb_matrix is None:
        raise ValueError("emb_matrix is required: pass the pre-trained embedding matrix")

    # Callers in this notebook pass kernel_size wrapped in a one-element list ([3]);
    # unwrap it so the TCN layers receive a plain integer.
    if isinstance(kernel_size, (list, tuple)):
        if len(kernel_size) != 1:
            raise ValueError("kernel_size must be an int or a one-element list/tuple, got {!r}".format(kernel_size))
        kernel_size = kernel_size[0]

    inp = Input( shape=(max_length,))
    x = Embedding(input_dim=input_dim, 
                  output_dim=output_dim, 
                  input_length=max_length,
                  # Assign the embedding weight with word2vec embedding matrix
                  weights = [emb_matrix],
                  # Keep the weights trainable so they are fine-tuned (dynamic)
                  trainable = True)(inp)

    x = SpatialDropout1D(0.1)(x)

    # kernel_size is now forwarded explicitly; previously it was accepted but
    # never used, so the layers silently ran with the library default.
    x = TCN(128, kernel_size=kernel_size, dilations = [1, 2, 4], return_sequences=True, activation = activation, name = 'tcn1')(x)
    x = TCN(64, kernel_size=kernel_size, dilations = [1, 2, 4], return_sequences=True, activation = activation, name = 'tcn2')(x)

    # Summarise the per-step TCN outputs with both average and max pooling
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    conc = concatenate([avg_pool, max_pool])
    conc = Dense(16, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    # Single sigmoid unit: one score in [0, 1] per input sequence
    outp = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile( loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

    return model

In [None]:
# Build a throw-away trainable-embedding model with placeholder sizes and random
# weights just to print its layer summary
model_0 = define_model_3( input_dim=1000, max_length=100, emb_matrix=np.random.rand(1000, 300))
model_0.summary()

## Train and Test the Model

In [None]:
class myCallback(tf.keras.callbacks.Callback):
    '''Callback that stops training once the logged 'accuracy' value exceeds 0.93.'''

    # Override the method on_epoch_end() for our benefit
    def on_epoch_end(self, epoch, logs=None):
        '''
        Request that training stop when this epoch's 'accuracy' is above 0.93.

        input:
            epoch: index of the epoch that just finished (unused)
            logs: dict of metric values for the epoch; may be None or lack 'accuracy'
        '''
        # Avoid a shared mutable default, and avoid comparing None with a float
        # when 'accuracy' was not logged.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > 0.93:
            print("\nReached 93% accuracy so cancelling training!")
            self.model.stop_training=True

# The fit() calls in this notebook pass no validation data, so `val_accuracy`
# is never logged and a callback monitoring it can never act. Monitor the
# training accuracy, which is the metric fit() actually reports here.
callbacks = tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0, 
                                             patience=10, verbose=2, 
                                             mode='auto', restore_best_weights=True)

In [None]:
import time
# Parameter Initialization
trunc_type='post'
padding_type='post'
oov_tok = "<UNK>"
activations = ['relu']
print('Loading embedding statistics . . .')
# emb_mean and emb_std are used as-is from the cell that computed them earlier
print('Done!')
kernel_sizes = [3]

# Separate the sentences and the labels
sentences, labels = list(corpus.sentence), list(corpus.label)
exp = 0

train_x = sentences
train_y = labels
# Turn the labels into a numpy array
train_y = np.array(train_y)
# encode data using
# Cleaning and Tokenization
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(train_x)
# Turn the text into sequence
training_sequences = tokenizer.texts_to_sequences(train_x)
max_len = max_length(training_sequences)

# Pad the sequence to have the same size
Xtrain = pad_sequences(training_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

word_index = tokenizer.word_index
vocab_size = len(word_index)+1
# Embedding weights: word2vec rows where available, random normal rows otherwise
emb_matrix = pretrained_embedding_matrix(word2vec, word_index, emb_mean, emb_std)

# Define the input shape
# Use the values declared in THIS cell (kernel_sizes / activations). The cell
# previously passed `kernel_size` and `activation`, names it never defines and
# that only existed as leftovers from the Model 2 section.
model3 = define_model_3(kernel_sizes[0], activations[0], input_dim=vocab_size, max_length=max_len, emb_matrix=emb_matrix)

start = time.time()
# Train the model
model3.fit(Xtrain, train_y, batch_size=50, epochs=7, verbose=1, callbacks=[callbacks])
stop = time.time()
print(f"Training time per fold: {stop - start}s")

## Testing the model

In [None]:
# Read the evaluation CSV with the same two-column layout used for the training data.
# NOTE(review): unlike the two earlier sections this reads "super23_legacy.csv", and
# the first row printed in the saved output is the same message as the first training
# sentence shown near the top of the notebook. Confirm this file does not overlap
# the training data.
corpus1 = pd.read_csv("super23_legacy.csv", encoding='latin-1')
corpus1.columns = ["sentence", "label"]
corpus1["label"] = corpus1["label"].astype(int)
print(corpus1.shape)

# Separate the sentences and the labels
sentences1 = list(corpus1["sentence"])
labels1 = list(corpus1["label"])
print(sentences1[0])

test_x1 = sentences1
test_y1 = labels1

# Encode with the tokenizer currently in scope and pad to the same max_len as Xtrain
test_sequences1 = tokenizer.texts_to_sequences(test_x1)
Xtest1 = pad_sequences(
    test_sequences1,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

print("Messages length", len(test_x1))

(37615, 2)
YOU HAVE WON! As a valued Vodafone customer our computer has picked YOU to win a ?150 prize. To collect is easy. Just call 09061743386
Messages length 37615


In [None]:
# Time the prediction only; scores above 0.5 are mapped to class 1
start = time.time()
pred_lbl = (model3.predict(Xtest1) > 0.5).astype("int32")
stop = time.time()
print(f"classification time: {stop - start}s")

# Write the predicted labels to disk, then read them back as a one-column frame
pd.DataFrame(pred_lbl).to_csv('cdar6.csv', index=False)
messages = pd.read_csv("cdar6.csv", encoding='latin-1')
messages.columns = ["labels"]
print (messages.tail(3))

from sklearn import metrics
# Per-class precision/recall/F1 followed by the raw confusion matrix
print(metrics.classification_report(labels1, messages["labels"]))
print(metrics.confusion_matrix(labels1, messages["labels"]))

# Printing the Overall Accuracy of the model
print("Accuracy of the model : {0:0.3f}".format(metrics.accuracy_score(labels1, messages["labels"])))

classification time: 12.64870023727417s
       labels
37612       0
37613       0
37614       0
              precision    recall  f1-score   support

           0       1.00      0.80      0.89     36925
           1       0.08      0.97      0.15       690

    accuracy                           0.81     37615
   macro avg       0.54      0.88      0.52     37615
weighted avg       0.98      0.81      0.88     37615

[[29655  7270]
 [   24   666]]
Accuracy of the model : 0.806
