<a href="https://colab.research.google.com/github/r-dube/CICIDS/blob/main/fj_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
# Load the modules used
import numpy as np
import scipy as sci
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Input
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant 
from keras.optimizers import Adam
from keras import metrics
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import tensorflow as tf

In [3]:
# NLTK to remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# list devices
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6379335709987962474
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 6935497038647223039
physical_device_desc: "device: XLA_CPU device"
]


In [5]:
# For reproducible results
# except for variability introduced by GPU
import random as rn
import os
os.environ['PYTHONHASHSEED'] = '42'
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # avoid using GPU for reproducible results
np.random.seed(42)
rn.seed(42)
tf.random.set_seed(42)

In [6]:
# For transformers
from tensorflow import keras
from tensorflow.keras import layers

In [7]:
# Set data_url, the location of the data
# Data is not loaded from a local file
# data_url="https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fj_small.csv"
# data_url="https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fj_medium.csv"
data_url="https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fake_job_postings.csv"

In [8]:
def fj_load_df_from_url():
    """
    Load dataframe from csv file
    Input:
        None
    Returns:
        dataframe
    """

    df = pd.read_csv(data_url)

    print ('Loaded dataframe shape', df.shape)

    counts = fj_label_stats(df)
    print ('Not fraudulent', counts[0], 'Fraudulent', counts[1])

    print(df.describe())

    print ('NAs/NANs in data =>')
    print(df.isna().sum())

    return df

def fj_label_stats(df):
    """
    Very basic label statistics
    Input: 
        Dataframe
    Returns:
        Number of samples with 0, 1 as the label
    """
    counts = np.bincount(df['fraudulent'])
    return counts

def fj_txt_only(df):
    """
    Combine all the text fields, discard everything else except for the label
    Input: 
        Dataframe
    Returns:
        Processed dataframe
    """
    
    df.fillna(" ", inplace = True)

    df['text'] = df['title'] + ' ' + df['location'] + ' ' + df['department'] + \
    ' ' + df['company_profile'] + ' ' + df['description'] + ' ' + \
    df['requirements'] + ' ' + df['benefits'] + ' ' + df['employment_type'] + \
    ' ' + df['required_education'] + ' ' + df['industry'] + ' ' + df['function'] 

    del df['title']
    del df['location']
    del df['department']
    del df['company_profile']
    del df['description']
    del df['requirements']
    del df['benefits']
    del df['employment_type']
    del df['required_experience']
    del df['required_education']
    del df['industry']
    del df['function']  
    
    del df['salary_range']
    del df['job_id']
    del df['telecommuting']
    del df['has_company_logo']
    del df['has_questions']

    return df

In [9]:
df = fj_load_df_from_url()
df = fj_txt_only(df)
print('Maximum text length', df['text'].str.len().max())

Loaded dataframe shape (17880, 18)
Not fraudulent 17014 Fraudulent 866
             job_id  telecommuting  ...  has_questions    fraudulent
count  17880.000000   17880.000000  ...   17880.000000  17880.000000
mean    8940.500000       0.042897  ...       0.491723      0.048434
std     5161.655742       0.202631  ...       0.499945      0.214688
min        1.000000       0.000000  ...       0.000000      0.000000
25%     4470.750000       0.000000  ...       0.000000      0.000000
50%     8940.500000       0.000000  ...       0.000000      0.000000
75%    13410.250000       0.000000  ...       1.000000      0.000000
max    17880.000000       1.000000  ...       1.000000      1.000000

[8 rows x 5 columns]
NAs/NANs in data =>
job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telec

In [10]:
# Utilities to clean text

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r"", string)

def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

In [11]:
stop = set(stopwords.words("english"))

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]

    return " ".join(text)

In [12]:
# clean text
df['text'] = df['text'].map(lambda x: remove_URL(x))
df['text'] = df['text'].map(lambda x: remove_html(x))
df['text'] = df['text'].map(lambda x: remove_emoji(x))
df['text'] = df['text'].map(lambda x: remove_punct(x))
df['text'] = df["text"].map(remove_stopwords)

In [13]:
# train-test split
train_text, test_text, train_labels , test_labels = train_test_split(df['text'], df['fraudulent'] , test_size = 0.15)

In [14]:
# Max number of words in a sequence
maxlen = 250

# embedding size to be created
# This depends on the GLOVE file loaded earlier
embed_dim = 50

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)

word_index = tokenizer.word_index
vocab_size = len(word_index)
print('Found %s unique tokens.' % vocab_size)
vocab_size = vocab_size + 1

train_sequences = tokenizer.texts_to_sequences(train_text)
train_padded = pad_sequences(
    train_sequences, maxlen=maxlen, padding="post", truncating="post"
)

test_sequences = tokenizer.texts_to_sequences(test_text)
test_padded = pad_sequences(
    test_sequences, maxlen=maxlen, padding="post", truncating="post"
)

print(f"Shape of train {train_padded.shape}")
print(f"Shape of test {test_padded.shape}")

Found 157210 unique tokens.
Shape of train (15198, 250)
Shape of test (2682, 250)


In [15]:
# Implement multi head self attention as a Keras layer
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

In [16]:
# Implement a Transformer block as a layer
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [17]:
# Implement embedding layer
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [18]:
# Create classifier model using transformer layer
# embed_dim = 32 # defined above  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model3 = keras.Model(inputs=inputs, outputs=outputs)

In [19]:
model3.compile("adam", "binary_crossentropy", metrics=["accuracy", metrics.FalsePositives(), metrics.FalseNegatives()])
model3.summary()
model3.fit(train_padded, train_labels, epochs=5)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 250)]             0         
_________________________________________________________________
token_and_position_embedding (None, 250, 50)           7873050   
_________________________________________________________________
transformer_block (Transform (None, 250, 50)           13682     
_________________________________________________________________
global_average_pooling1d (Gl (None, 50)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 20)                1020      
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)               

<tensorflow.python.keras.callbacks.History at 0x7fea471a0080>

In [20]:
pred_soft3 = model3.predict(test_padded)

In [21]:
# pred = np.around(pred_soft, decimals = 0)
pred3 = np.where(pred_soft3 > 0.50, 1, 0)

acc3 = accuracy_score(pred3, test_labels)
f13 = f1_score(pred3, test_labels)

cm3 = confusion_matrix(test_labels, pred3)
tn3 = cm3[0][0]
fn3 = cm3[1][0]
tp3 = cm3[1][1]
fp3 = cm3[0][1]

print('Accuracy score: {:.4f}'.format(acc3), 'F1 score: {:.4f}'.format(f13))
print('False Positives: {:.0f}'.format(fp3), 'False Negatives: {:.0f}'.format(fn3))
print('Confusion matrix:\n', cm3)

Accuracy score: 0.9851 F1 score: 0.8413
False Positives: 2 False Negatives: 38
Confusion matrix:
 [[2536    2]
 [  38  106]]


In [22]:
# model 2: the LSTM model
model2 = Sequential()

# embed_dim = 50
hidden_size = 32
model2.add(Embedding(vocab_size, embed_dim, input_length=maxlen))
model2.add(Bidirectional(LSTM(hidden_size, dropout=0.1, recurrent_dropout=0.1, return_sequences=True)))
model2.add(GlobalMaxPool1D())
model2.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.01)

model2.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', metrics.FalsePositives(), metrics.FalseNegatives()])

model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 50)           7860550   
_________________________________________________________________
bidirectional (Bidirectional (None, 250, 64)           21248     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 7,881,863
Trainable params: 7,881,863
Non-trainable params: 0
_________________________________________________________________


In [23]:
model2.fit(train_padded, train_labels, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fea43c8e6d8>

In [24]:
pred_soft2 = model2.predict(test_padded)

In [25]:
# pred = np.around(pred_soft, decimals = 0)
pred2 = np.where(pred_soft2 > 0.50, 1, 0)

acc2 = accuracy_score(pred2, test_labels)
f12 = f1_score(pred2, test_labels)

cm2 = confusion_matrix(test_labels, pred2)
tn2 = cm2[0][0]
fn2 = cm2[1][0]
tp2 = cm2[1][1]
fp2 = cm2[0][1]

print('Accuracy score: {:.4f}'.format(acc2), 'F1 score: {:.4f}'.format(f12))
print('False Positives: {:.0f}'.format(fp2), 'False Negatives: {:.0f}'.format(fn2))
print('Confusion matrix:\n', cm2)

Accuracy score: 0.9851 F1 score: 0.8485
False Positives: 8 False Negatives: 32
Confusion matrix:
 [[2530    8]
 [  32  112]]


In [26]:
# model 1: BOW + FCNN model
cv = CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english', dtype=np.int8) 
cv_train_sparse = cv.fit_transform(train_text)
cv_train_dense = sci.sparse.csr_matrix.todense(cv_train_sparse)

cv_test_sparse = cv.transform(test_text)
cv_test_dense = sci.sparse.csr_matrix.todense(cv_test_sparse)

print('BOW for cv_train:', cv_train_dense.shape)
print('BOW for cv_test:', cv_test_dense.shape)

BOW for cv_train: (15198, 150190)
BOW for cv_test: (2682, 150190)


In [27]:
"""
Fully connected NN model with two hidden layers 
"""
model1 = Sequential()
model1.add(Dense(units = 100 , activation = 'relu' , input_dim = cv_train_dense.shape[1]))
model1.add(Dropout(0.1))
model1.add(Dense(units = 10 , activation = 'relu'))
model1.add(Dropout(0.1))
model1.add(Dense(units = 1 , activation = 'sigmoid'))
model1.compile(optimizer = 'adam' , loss = 'binary_crossentropy' , metrics = ['accuracy', tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives()])
model1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 100)               15019100  
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 10)                1010      
_________________________________________________________________
dropout_5 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 11        
Total params: 15,020,121
Trainable params: 15,020,121
Non-trainable params: 0
_________________________________________________________________


In [28]:
model1.fit(cv_train_dense, train_labels, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fea41ba5710>

In [29]:
pred_soft1 = model1.predict(cv_test_dense)

In [30]:
# pred = np.around(pred_soft, decimals = 0)
pred1 = np.where(pred_soft1 > 0.50, 1, 0)

acc1 = accuracy_score(pred1, test_labels)
f11 = f1_score(pred1, test_labels)

cm1 = confusion_matrix(test_labels, pred1)
tn1 = cm1[0][0]
fn1 = cm1[1][0]
tp1 = cm1[1][1]
fp1 = cm1[0][1]

print('Accuracy score: {:.4f}'.format(acc1), 'F1 score: {:.4f}'.format(f11))
print('False Positives: {:.0f}'.format(fp1), 'False Negatives: {:.0f}'.format(fn1))
print('Confusion matrix:\n', cm1)

Accuracy score: 0.9862 F1 score: 0.8538
False Positives: 1 False Negatives: 36
Confusion matrix:
 [[2537    1]
 [  36  108]]


In [53]:
# Averaging ensemble prediction
pred_softa=(pred_soft1 + pred_soft2 + pred_soft3)/3

# Set probability to declare post as fraudulent
preda = np.where(pred_softa > 0.15, 1, 0)

acca = accuracy_score(preda, test_labels)
f1a = f1_score(preda, test_labels)

cma = confusion_matrix(test_labels, preda)
tna = cmf[0][0]
fna = cmf[1][0]
tpa = cmf[1][1]
fpa = cmf[0][1]

print('Accuracy score: {:.4f}'.format(acca), 'F1 score: {:.4f}'.format(f1a))
print('False Positives: {:.0f}'.format(fpa), 'False Negatives: {:.0f}'.format(fna))
print('Confusion matrix:\n', cma)

Accuracy score: 0.9881 F1 score: 0.8841
False Positives: 10 False Negatives: 22
Confusion matrix:
 [[2528   10]
 [  22  122]]


In [50]:
# stacking ensemble

# get prediction scores for train samples
train_soft1 = model1.predict(cv_train_dense)
train_soft2 = model2.predict(train_padded)
train_soft3 = model3.predict(train_padded)


In [51]:
stack_train = np.hstack([train_soft1, train_soft2, train_soft3])

stack_test = np.hstack([pred_soft1, pred_soft2, pred_soft3])

model_stack = LogisticRegression()
model_stack.fit(stack_train, train_labels)

pred_softs = model_stack.predict(stack_test)

In [52]:
# Stacking prediction

# Set probability to declare post as fraudulent
preds = np.where(pred_softs > 0.01, 1, 0)

accs = accuracy_score(preds, test_labels)
f1s = f1_score(preds, test_labels)

cms = confusion_matrix(test_labels, preds)
tns = cms[0][0]
fns = cms[1][0]
tps = cms[1][1]
fps = cms[0][1]

print('Accuracy score: {:.4f}'.format(accs), 'F1 score: {:.4f}'.format(f1s))
print('False Positives: {:.0f}'.format(fps), 'False Negatives: {:.0f}'.format(fns))
print('Confusion matrix:\n', cms)

Accuracy score: 0.9855 F1 score: 0.8446
False Positives: 1 False Negatives: 38
Confusion matrix:
 [[2537    1]
 [  38  106]]
