<a href="https://colab.research.google.com/github/r-dube/fakejobs/blob/main/fj_fcnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the modules used
import numpy as np
import scipy as sci
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
# For reproducible results: pin every random number source used by this
# notebook (Python's random, NumPy, TensorFlow) to the same fixed seed.
import os
import random as rn

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts --
# confirm that setting it here has the intended effect for this process.
# NOTE(review): the empty CUDA_VISIBLE_DEVICES value is presumably meant to
# force a CPU-only run -- confirm.
os.environ.update({
    'PYTHONHASHSEED': '42',
    'CUDA_VISIBLE_DEVICES': '',
})

rn.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# data_url is the location of the csv data. The data is read from this
# remote URL, not from a local file.
# Alternative files in the same repository (uncomment one to use it instead):
# data_url = "https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fj_small.csv"
# data_url = "https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fj_medium.csv"
data_url = "https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fake_job_postings.csv"

In [None]:
def fj_load_df_from_url(url=None):
    """
    Load dataframe from csv file and print a short summary of it
    Input:
        url: optional location of the csv file, passed to pd.read_csv;
             when omitted, the module-level data_url is used
    Returns:
        dataframe
    """

    # Resolve the default at call time so a later change to data_url is seen
    if url is None:
        url = data_url

    df = pd.read_csv(url)

    print('Loaded dataframe shape', df.shape)

    # The count array can be shorter than two entries when a label never
    # occurs in the data; report 0 for a missing label instead of failing
    counts = fj_label_stats(df)
    not_fraudulent = counts[0] if len(counts) > 0 else 0
    fraudulent = counts[1] if len(counts) > 1 else 0
    print('Not fraudulent', not_fraudulent, 'Fraudulent', fraudulent)

    print(df.describe())

    print('NAs/NANs in data =>')
    print(df.isna().sum())

    return df

def fj_label_stats(df):
    """
    Very basic label statistics
    Input: 
        Dataframe with an integer 'fraudulent' label column
    Returns:
        Array of counts indexed by label value; always has at least two
        entries, so counts[0] and counts[1] are valid even when one of
        the labels does not occur in the data
    """
    # minlength=2 pads with zeros when label 1 (or every label) is absent
    counts = np.bincount(df['fraudulent'], minlength=2)
    return counts

def fj_txt_only(df):
    """
    Merge the text fields into one 'text' column and drop the other columns
    Input: 
        Dataframe (it is modified in place)
    Returns:
        The same dataframe, with the listed columns removed and 'text' added
    """

    # Fields concatenated, in this order, into the 'text' column
    merged_fields = [
        'title', 'location', 'department', 'company_profile', 'description',
        'requirements', 'benefits', 'employment_type', 'required_education',
        'industry', 'function',
    ]

    # Columns removed afterwards, in this order.
    # NOTE(review): 'required_experience' is removed here but is not part of
    # merged_fields, so its content never reaches 'text' -- confirm intended.
    removed_columns = [
        'title', 'location', 'department', 'company_profile', 'description',
        'requirements', 'benefits', 'employment_type', 'required_experience',
        'required_education', 'industry', 'function',
        'salary_range', 'job_id', 'telecommuting', 'has_company_logo',
        'has_questions',
    ]

    # Missing values become a single blank so the string concatenation works
    df.fillna(" ", inplace = True)

    # Join the fields left to right with a single space between them
    combined = df[merged_fields[0]]
    for field in merged_fields[1:]:
        combined = combined + ' ' + df[field]
    df['text'] = combined

    for column in removed_columns:
        del df[column]

    return df

In [None]:
# Load the csv data, then merge its text fields into a single 'text' column
df = fj_txt_only(fj_load_df_from_url())

In [None]:
"""
Use Count Vectorizer to convert text into bag of words
"""
# Hold out 15% of the samples for testing. No random_state is passed here.
# NOTE(review): the split presumably relies on the NumPy seed set earlier in
# the notebook for repeatability -- confirm.
train_text, test_text, train_category, test_category = train_test_split(
    df['text'], df['fraudulent'], test_size=0.15)

# Word counts are stored as int8.
# NOTE(review): a word occurring more than 127 times in one posting would not
# fit in int8 -- confirm this cannot happen for this data.
cv = CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english', dtype=np.int8)

# The vocabulary is fitted on the training text only; the test text is then
# transformed with that same vocabulary. toarray() is called on the sparse
# result itself, so it does not depend on the concrete sparse class, and it
# returns a plain ndarray rather than the legacy np.matrix type.
cv_train_sparse = cv.fit_transform(train_text)
cv_train_dense = cv_train_sparse.toarray()

cv_test_sparse = cv.transform(test_text)
cv_test_dense = cv_test_sparse.toarray()

print('BOW for cv_train:', cv_train_dense.shape)
print('BOW for cv_test:', cv_test_dense.shape)

In [None]:
"""
Fully connected NN model with two hidden layers 
"""
# Input width equals the number of bag-of-words features (columns of
# cv_train_dense); two relu hidden layers of 100 and 10 units, each followed
# by 10% dropout; a single sigmoid unit as the output.
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=cv_train_dense.shape[1]))
model.add(Dropout(0.1))
model.add(Dense(units=10, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation='sigmoid'))

# The metric objects are taken from the same keras package that provides
# Sequential/Dense/Dropout, instead of mixing in the tf.keras namespace.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.FalsePositives(), keras.metrics.FalseNegatives()],
)
model.summary()

In [None]:
# Train on the dense bag-of-words matrix and its labels for 5 epochs
model.fit(x=cv_train_dense, y=train_category, epochs=5)

In [None]:
# Predict on the held-out set, round the model outputs to hard 0/1 labels
# and flatten them to one dimension to match the label series
pred = model.predict(cv_test_dense)
pred = np.around(pred, decimals=0).ravel()

# scikit-learn metrics take the true labels first, then the predictions
acc = accuracy_score(test_category, pred)
f1 = f1_score(test_category, pred)
print('Accuracy score: {:.4f}'.format(acc), 'F1 score: {:.4f}'.format(f1))