Read the data

In [None]:
import pandas as pd

In [None]:
# Load the job-postings CSV; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('fake_job_postings.csv')
# Preview the first five rows.
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(17880, 18)

Take the relevant columns 

In [None]:
# Keep only the free-text description (model input) and the fraud label (target).
new_df = df.loc[:, ['description', 'fraudulent']]
new_df.head()

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0


In [None]:
# Discard every row that has a missing value in either column, then report
# the remaining (rows, columns).
new_df = new_df.dropna(axis=0, how='any')
new_df.shape

(17879, 2)

In [None]:
# Display the cleaned two-column frame (description, fraudulent).
new_df

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0
...,...,...
17875,Just in case this is the first time you’ve vis...,0
17876,The Payroll Accountant will focus primarily on...,0
17877,Experienced Project Cost Control Staff Enginee...,0
17878,Nemsia Studios is looking for an experienced v...,0


Before data balancing

In [None]:
# Separate the label from the features and show how many postings fall in
# each class before any balancing.
y = new_df['fraudulent']
X = new_df.drop(columns=['fraudulent'])
y.value_counts()

0    17014
1      865
Name: fraudulent, dtype: int64

Retrieve the rows with fake job postings

In [None]:
# Select the fraudulent postings. The boolean mask is built from new_df itself
# so it always shares new_df's index; a mask taken from the raw df only works
# while the two indexes happen to line up and fails once new_df is re-indexed.
pos_df = new_df.loc[new_df['fraudulent'] == 1]
pos_df

Unnamed: 0,description,fraudulent
98,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...",1
144,The group has raised a fund for the purchase o...,1
173,Technician Instrument &amp; ControlsLocation D...,1
180,Sales Executive,1
215,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...",1
...,...,...
17827,Student Positions Part-Time and Full-Time.You ...,1
17828,LEARN TO EARN AN EXECUTIVE LEVEL INCOMEFULL TR...,1
17829,inFullMobile Sp. z o.o. is a mobile software d...,1
17830,JOB DESCRIPTIONWe are seeking a full time payr...,1


Install the library needed for data augmentation and download the NLTK components it requires

In [None]:
pip install nlpaug



In [None]:
# Download the NLTK resources used for augmentation: the averaged-perceptron
# POS tagger and the WordNet corpus. Each call returns True on success.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

For each row of the positive (fraudulent) class, generate 19 augmented variants of the text in its description column, then add the generated texts to the original data

In [None]:
import nlpaug
import nlpaug.augmenter.word as naw

# aug_max is the maximum number of words to replace with their WordNet synonyms
aug = naw.SynonymAug(aug_src='wordnet', aug_max=20)

# Collect the generated rows in a plain list and concatenate once at the end:
# growing the frame row-by-row with DataFrame.append copies the whole frame on
# every call, and DataFrame.append no longer exists in pandas 2.x.
augmented_rows = []
for index, row in pos_df.iterrows():
    # iterrows() yields a copy of each row, so work on a local string instead
    # of assigning back into the row.
    description = str(row['description'])
    # n is how many generated sentences we want per description
    aug_des = aug.augment(description, n=19)
    for i in aug_des:
        augmented_rows.append({'description': i, 'fraudulent': row['fraudulent']})

# Skip the concat when nothing was generated (e.g. pos_df is empty).
# NOTE: re-running this cell appends another batch of variants to new_df.
if augmented_rows:
    new_df = pd.concat([new_df, pd.DataFrame(augmented_rows)], ignore_index=True)

After data balancing

In [None]:
# Rebuild the label vector and feature frame from the augmented data and show
# the class distribution again.
y = new_df['fraudulent']
X = new_df.drop(columns=['fraudulent'])
y.value_counts()

1    17300
0    17014
Name: fraudulent, dtype: int64

Model creation

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
# Vocabulary size: used as the number of hash buckets when words are encoded
# as integers and as the input dimension of the Embedding layer.
voc_size=5000

In [None]:
# Work on an independent copy of the features with a fresh 0..n-1 index; the
# previous index is preserved as an 'index' column.
message = X.copy().reset_index()

In [None]:
import nltk
import re
from nltk.corpus import stopwords

# Download the NLTK stop-word lists read by the pre-processing step
# (stopwords.words('english')).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Data pre-processing

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# Load the stop-word list once and keep it in a set: calling
# stopwords.words('english') inside the comprehension re-reads the list and
# scans it linearly for every single token.
english_stopwords = set(stopwords.words('english'))
corpus = []
for i in range(0, len(message)):
    # Replace every non-letter with a space, lower-case, and split on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', message['description'][i])
    review = review.lower()
    review = review.split()

    # Drop stop words and reduce the remaining tokens to their Porter stems.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
# Inspect one cleaned document (stop words removed, tokens stemmed).
corpus[1]

'organis focus vibrant awesom passion custom servic slick type skill mayb account manag think administr cooler polar bear jetski need hear cloud video product servic opper glodal level yeah pretti cool seriou deliv world class product excel custom servic rapidli expand busi look talent project manag manag success deliveri video project manag client commun drive product process work coolest brand planet learn global team repres nz huge way enter next growth stage busi grow quickli intern therefor posit burst opportun right person enter busi right time second world cloud video product servic http url fbe afac cd c f b eef e e f ca dd second world cloud video product servic enabl brand agenc get high qualiti onlin video content shot produc anywher world fast afford manag seamlessli cloud purchas publish second remov hassl cost risk speed issu work regular video product compani manag everi aspect video project beauti onlin experi grow network rate video profession countri dedic product suc

In [None]:
# Hash every word of each cleaned document into an integer in [1, voc_size).
# one_hot() relies on Python's built-in hash(), which is salted per process
# for strings, so its indices change between kernel sessions. hashing_trick()
# with md5 applies the same tokenisation but keeps the word -> index mapping
# stable across runs.
from tensorflow.keras.preprocessing.text import hashing_trick

onehot_repr = [hashing_trick(words, voc_size, hash_function='md5') for words in corpus]
onehot_repr[1]

[3726,
 4528,
 1775,
 4373,
 31,
 2194,
 1110,
 658,
 2897,
 3922,
 711,
 4142,
 161,
 1427,
 3978,
 3718,
 3800,
 678,
 2858,
 2012,
 917,
 1231,
 3138,
 3760,
 1110,
 4128,
 1162,
 1525,
 2756,
 196,
 2618,
 4700,
 2515,
 2630,
 4126,
 3760,
 3296,
 2194,
 1110,
 1802,
 2572,
 2968,
 304,
 999,
 2545,
 161,
 161,
 1765,
 4234,
 3138,
 2545,
 161,
 2937,
 773,
 1204,
 3760,
 2255,
 4548,
 1868,
 641,
 1070,
 2640,
 1820,
 2507,
 4914,
 1329,
 3773,
 43,
 499,
 4629,
 4560,
 2839,
 2968,
 3730,
 2059,
 3164,
 4857,
 4016,
 3379,
 4418,
 2357,
 619,
 499,
 2968,
 2357,
 3774,
 171,
 2630,
 1231,
 3138,
 3760,
 1110,
 640,
 2328,
 3177,
 3873,
 2031,
 3925,
 1608,
 1354,
 4542,
 876,
 876,
 1608,
 440,
 3056,
 171,
 2630,
 1231,
 3138,
 3760,
 1110,
 1876,
 641,
 1477,
 2336,
 2280,
 1411,
 4877,
 3138,
 2008,
 2632,
 4350,
 4152,
 2630,
 1772,
 4809,
 161,
 977,
 1231,
 560,
 4567,
 171,
 1730,
 2306,
 2241,
 1571,
 2853,
 4822,
 4548,
 2964,
 3138,
 3760,
 4438,
 161,
 1679,
 4456,
 31

In [None]:
# Bring every encoded document to exactly sent_length tokens: shorter ones are
# left-padded with 0, longer ones are cut down by pad_sequences' default
# truncation.
sent_length = 40
embedded_docs = pad_sequences(
    sequences=onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[3076 2008  161 ...  498  141 4750]
 [1765 1788 1405 ... 3842 3337 4455]
 [   0    0    0 ... 2074 2118 1459]
 ...
 [   0    0    0 ...  968 3451 2847]
 [   0    0    0 ...  968 3451 2893]
 [   0    0    0 ... 1756  968 3451]]


In [None]:
# Inspect the fixed-length integer sequence of the first document.
embedded_docs[0]

array([3076, 2008,  161,  338, 4354,  990, 4877,  665,  977, 2988, 2341,
       2341,  977, 3417, 4769, 2909,  977,  524,  977,  106, 3615, 2866,
        617,   58, 2249, 3978, 4548, 2251, 1528, 2936, 1015, 1098, 4424,
        779,  624,   45, 2154,  498,  141, 4750], dtype=int32)

In [None]:
# Create the model

embedding_vector_features = 50
model = Sequential()
# Map each of the voc_size word indices to a trainable 50-dimensional vector.
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
# 1-D convolution over the token axis; 'same' padding keeps the sequence length.
model.add(Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'))
# Down-sample the sequence by a factor of 3 before the recurrent layer.
model.add(MaxPooling1D(pool_size=3))
model.add(Bidirectional(GRU(512))) #GRU can be changed to LSTM with the same parameter value
model.add(Dropout(0.3))
# Single sigmoid unit: output is the predicted probability of the positive class.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 40, 50)            250000    
                                                                 
 conv1d_11 (Conv1D)          (None, 40, 64)            16064     
                                                                 
 max_pooling1d_11 (MaxPoolin  (None, 13, 64)           0         
 g1D)                                                            
                                                                 
 bidirectional_8 (Bidirectio  (None, 1024)             1775616   
 nal)                                                            
                                                                 
 dropout_11 (Dropout)        (None, 1024)              0         
                                                                 
 dense_11 (Dense)            (None, 1)               

In [None]:
# Sanity check: the number of encoded documents should match the number of labels.
len(embedded_docs),y.shape

(34314, (34314,))

In [None]:
import numpy as np

# Materialise the padded sequences and the labels as NumPy arrays for
# scikit-learn and Keras, then peek at one encoded sample.
X_final, y_final = np.array(embedded_docs), np.array(y)
X_final[1]

array([1765, 1788, 1405, 1019, 4165, 1547, 1873,  171, 4350,  967, 3138,
       4419, 1820,  641, 3886, 2630,  952, 3886,  485, 1718, 2673, 4411,
       1859, 2249, 1994, 3239,  674, 3633, 2866, 3501, 1994, 2249, 2586,
       2357,  577, 4239, 1485, 3842, 3337, 4455], dtype=int32)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; random_state fixes the shuffle.
# NOTE(review): the augmented variants were appended to new_df before this
# split, so near-duplicate versions of one fraudulent posting can land on both
# sides. Scores measured on X_test may therefore be optimistic; consider
# splitting first and augmenting only the training portion.
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

In [None]:
# Train for 5 epochs in mini-batches of 64. The held-out split is passed as
# validation data, so the same samples are used for per-epoch monitoring and
# for the final evaluation.
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5,batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8f2db85b10>

In [None]:
# Model Performance and Accuracy

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Turn the sigmoid probabilities into hard class decisions at the 0.5 threshold.
y_pred = model.predict(X_test) > 0.5

# Print each headline metric with its label, in a fixed order.
for label, metric in (
    ('Accuracy : ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
    ('Conf_Matrix: ', confusion_matrix),
):
    print(label, metric(y_test, y_pred))

# Per-class precision / recall / F1 table.
print(classification_report(y_test, y_pred))

Accuracy :  0.9922774296954685
Precision:  0.996228604583696
Recall:  0.9884858952216465
F1 Score:  0.992342147088571
Conf_Matrix:  [[3376   13]
 [  40 3434]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3389
           1       1.00      0.99      0.99      3474

    accuracy                           0.99      6863
   macro avg       0.99      0.99      0.99      6863
weighted avg       0.99      0.99      0.99      6863

