In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Load the job-postings CSV into a DataFrame and preview the first rows.
csv_path = '../input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv'
df = pd.read_csv(csv_path)
df.head()

In [1]:
# Summarize the raw frame (columns, dtypes, non-null counts) to spot missing values.
df.info()

Label encoding of the `fraudulent` column:

* Not fraudulent = 0
* Fraudulent = 1

In [1]:
# Drop postings without a description (the text the model is trained on).
# Reassign instead of mutating in place so re-running the cell is harmless, and
# reset the index so row labels are contiguous (0..n-1) and agree with
# positional indexing used later in the notebook.
df = df.dropna(subset=['description']).reset_index(drop=True)

In [1]:
# Re-check the frame after removing rows whose `description` was missing.
df.info()

In [1]:
# Model input text (`description`) and binary target (`fraudulent`).
sentences, labels = df['description'], df['fraudulent']

In [1]:
# Display the `description` column that serves as the model's input text.
sentences

In [1]:
# Display the `fraudulent` column that serves as the target.
labels

In [1]:
# Bar chart of how many postings fall into each `fraudulent` class.
class_counts = labels.value_counts()
class_counts.plot(kind='bar', title='Value Counts - Fraudulent')

**TensorFlow**

In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [1]:
# Hyperparameters for text preprocessing, the embedding model and the data split.
vocab_size = 3_000       # Tokenizer `num_words` and Embedding input dimension
embedding_dim = 16       # size of each learned word vector
max_length = 150         # token ids kept per description after padding/truncation
trunc_type = 'post'      # `truncating` mode passed to pad_sequences
padding_type = 'post'    # `padding` mode passed to pad_sequences
oov_tok = '<OOV>'        # placeholder token for out-of-vocabulary words
training_size = 11_000   # first N rows form the training split; the rest are held out

In [1]:
# Positional split: the first `training_size` rows train the model,
# the remaining rows are held out.
training_sentences = sentences.iloc[:training_size]
training_labels = labels.iloc[:training_size]
testing_sentences = sentences.iloc[training_size:]
testing_labels = labels.iloc[training_size:]

In [1]:
# Report how many examples ended up in the held-out split.
n_testing = len(testing_sentences)
print(n_testing)

In [1]:
# Display the training portion of the description texts.
training_sentences

In [1]:
# Fit the tokenizer on the training texts only, so the vocabulary is learned
# without looking at the held-out split.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(training_sentences)

# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [1]:
# print(word_index)  # uncomment to inspect the full word-to-index vocabulary

In [1]:
def _encode_and_pad(texts):
    """Convert texts to token-id sequences and pad/truncate them to `max_length`.

    Returns a `(sequences, padded)` pair: the raw token-id lists produced by the
    fitted tokenizer and the output of `pad_sequences` for those lists.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


# Apply the same encoding to both splits.
training_sequences, training_padded = _encode_and_pad(training_sentences)
testing_sequences, testing_padded = _encode_and_pad(testing_sentences)

In [1]:
# Show the first training description next to its token-id encoding.
# `.iloc[0]` selects by position, matching `training_sequences[0]` (a plain
# list); a bare `[0]` on the Series would look up the index *label* 0, which
# is not guaranteed to exist after rows have been dropped from the frame.
print(training_sentences.iloc[0])
print(training_sequences[0])

In [1]:
# First training example after padding/truncation to `max_length` token ids.
training_padded[0]

In [1]:
# Small bag-of-embeddings classifier: embed each token, average the embeddings
# over the sequence, then a hidden dense layer and a sigmoid output unit.
#
# The input shape is declared with an explicit `tf.keras.Input` rather than the
# Embedding layer's `input_length` argument: `input_length` is deprecated and
# ignored by Keras 3, which leaves the model unbuilt (no shapes or parameter
# counts) when `model.summary()` is called before training.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,), dtype='int32'),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output and 0/1 labels.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

In [1]:
# Print the layer-by-layer architecture of the compiled model.
model.summary()

In [1]:
# Halt training as soon as one epoch passes without `val_accuracy` improving
# by at least 0.01.
# NOTE(review): this threshold/patience combination is strict and may end
# training after very few epochs; confirm that this is intended.
early_stop = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.01,
    patience=1,
)

In [1]:
# Train for up to 30 epochs, evaluating on the held-out split after each epoch;
# the early-stopping callback may end training sooner.
# NOTE(review): the held-out split doubles as the validation set here, so the
# reported validation metrics also drive early stopping.
history = model.fit(
    training_padded,
    training_labels,
    epochs=30,
    validation_data=(testing_padded, testing_labels),
    callbacks=[early_stop],  # Keras documents `callbacks` as a list of callbacks
    verbose=2,
)

In [1]:
# Pull the per-epoch training and validation curves out of the fit history.
metrics_by_epoch = history.history
accuracy, val_accuracy = metrics_by_epoch['accuracy'], metrics_by_epoch['val_accuracy']
loss, val_loss = metrics_by_epoch['loss'], metrics_by_epoch['val_loss']

In [1]:
import matplotlib.pyplot as plt

# Plot the per-epoch training/validation accuracy and loss on one labelled
# figure so the chart can be read on its own.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(accuracy, label='accuracy')
ax.plot(val_accuracy, label='val_accuracy')
ax.plot(loss, label='loss')
ax.plot(val_loss, label='val_loss')
ax.set_title('Training history')
ax.set_xlabel('Epoch')
ax.set_ylabel('Metric value')
ax.grid(True)
ax.legend()
plt.show()