<a href="https://colab.research.google.com/github/r-dube/fakejobs/blob/main/fj_bow_logistic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Load the modules used
import numpy as np
import scipy as sci
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [17]:
# For reproducible results
# except for variability introduced by GPU
import random as rn
import os
os.environ['PYTHONHASHSEED'] = '42'
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # avoid using GPU for reproducible results
np.random.seed(42)
rn.seed(42)
# tf.random.set_seed(42)

In [18]:
# Set data_url, the location of the data
# Data is not loaded from a local file
# data_url="https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fj_small.csv"
# data_url="https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fj_medium.csv"
data_url="https://raw.githubusercontent.com/r-dube/fakejobs/main/data/fake_job_postings.csv"

In [19]:
def fj_load_df_from_url():
    """
    Load dataframe from csv file
    Input:
        None
    Returns:
        dataframe
    """

    df = pd.read_csv(data_url)

    print ('Loaded dataframe shape', df.shape)

    counts = fj_label_stats(df)
    print ('Not fraudulent', counts[0], 'Fraudulent', counts[1])

    print(df.describe())

    print ('NAs/NANs in data =>')
    print(df.isna().sum())

    return df

def fj_label_stats(df):
    """
    Very basic label statistics
    Input: 
        Dataframe
    Returns:
        Number of samples with 0, 1 as the label
    """
    counts = np.bincount(df['fraudulent'])
    return counts

def fj_txt_only(df):
    """
    Combine all the text fields, discard everything else except for the label
    Input: 
        Dataframe
    Returns:
        Processed dataframe
    """
    
    df.fillna(" ", inplace = True)

    df['text'] = df['title'] + ' ' + df['location'] + ' ' + df['department'] + \
    ' ' + df['company_profile'] + ' ' + df['description'] + ' ' + \
    df['requirements'] + ' ' + df['benefits'] + ' ' + df['employment_type'] + \
    ' ' + df['required_education'] + ' ' + df['industry'] + ' ' + df['function'] 

    del df['title']
    del df['location']
    del df['department']
    del df['company_profile']
    del df['description']
    del df['requirements']
    del df['benefits']
    del df['employment_type']
    del df['required_experience']
    del df['required_education']
    del df['industry']
    del df['function']  
    
    del df['salary_range']
    del df['job_id']
    del df['telecommuting']
    del df['has_company_logo']
    del df['has_questions']

    return df

In [20]:
df = fj_load_df_from_url()
df = fj_txt_only(df)
print('Maximum text length', df['text'].str.len().max())

Loaded dataframe shape (17880, 18)
Not fraudulent 17014 Fraudulent 866
             job_id  telecommuting  ...  has_questions    fraudulent
count  17880.000000   17880.000000  ...   17880.000000  17880.000000
mean    8940.500000       0.042897  ...       0.491723      0.048434
std     5161.655742       0.202631  ...       0.499945      0.214688
min        1.000000       0.000000  ...       0.000000      0.000000
25%     4470.750000       0.000000  ...       0.000000      0.000000
50%     8940.500000       0.000000  ...       0.000000      0.000000
75%    13410.250000       0.000000  ...       1.000000      0.000000
max    17880.000000       1.000000  ...       1.000000      1.000000

[8 rows x 5 columns]
NAs/NANs in data =>
job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telec

In [21]:
# train-test split
train_text, test_text, train_labels , test_labels = train_test_split(df['text'], df['fraudulent'] , test_size = 0.15)

In [22]:
# model 1: BOW + logistic model
# dense representation is not imposed by logistic regression
# sparse representation is needed so that logistic regression does not run out of memory
cv = CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english', dtype=np.int8, binary=True) 
cv_train_sparse = cv.fit_transform(train_text)
# cv_train_dense = sci.sparse.csr_matrix.todense(cv_train_sparse)

cv_test_sparse = cv.transform(test_text)
# cv_test_dense = sci.sparse.csr_matrix.todense(cv_test_sparse)
# print('BOW for cv_train:', cv_train_dense.shape)
# print('BOW for cv_test:', cv_test_dense.shape)

In [23]:
type(cv_test_sparse)

scipy.sparse.csr.csr_matrix

In [24]:
MAX_ITER = 100
model1 = LogisticRegression(max_iter=MAX_ITER)
model1.fit(cv_train_sparse, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
lr_probs = model1.predict_proba(cv_test_sparse)
pred_soft1 = lr_probs[:, 1]

In [30]:
print(pred_soft1)

[2.22816095e-05 3.75057717e-03 4.88709043e-05 ... 6.81889461e-04
 3.76782525e-05 2.59394883e-03]


In [32]:
# pred = np.around(pred_soft, decimals = 0)
pred1 = np.where(pred_soft1 > 0.15, 1, 0)

acc1 = accuracy_score(pred1, test_labels)
f11 = f1_score(pred1, test_labels)

cm1 = confusion_matrix(test_labels, pred1)
tn1 = cm1[0][0]
fn1 = cm1[1][0]
tp1 = cm1[1][1]
fp1 = cm1[0][1]

print('Accuracy score: {:.4f}'.format(acc1), 'F1 score: {:.4f}'.format(f11))
print('False Positives: {:.0f}'.format(fp1), 'False Negatives: {:.0f}'.format(fn1))
print('Confusion matrix:\n', cm1)

Accuracy score: 0.9858 F1 score: 0.8652
False Positives: 16 False Negatives: 22
Confusion matrix:
 [[2522   16]
 [  22  122]]


In [33]:
# sanity check the model parameters
print (type(model1.coef_), model1.coef_.shape[0], model1.coef_.shape[1])
print (model1.coef_, model1.intercept_)

<class 'numpy.ndarray'> 1 96150
[[-2.98880470e-01  7.79473753e-01 -6.27442703e-03 ... -1.83999169e-05
  -1.83999169e-05 -1.83999169e-05]] [-1.55240758]


In [13]:
# print the top-k words with the largest coefficients
# these words contribute the most to a job description declared fraudulent
coef = model1.coef_.reshape(model1.coef_.shape[1])
k = 20
ind = np.argpartition(coef, -k)[-k:]
for i in range (k):
  print(ind[i], coef[ind[i]], list(cv.vocabulary_.keys())[list(cv.vocabulary_.values()).index(ind[i])])

8876 0.6258794722749936 balance
73348 0.6333128423457315 send
25611 0.6259285387198736 duration
38004 0.6434919556502703 hospital
90433 0.75822867725062 wages
51049 0.7556025983045982 money
54480 0.7290765727550464 oil
26515 0.7340161687538614 egovernment
939 0.6560803983381377 28
6562 0.7140744759683485 aptitude
90601 0.6543170280066289 warsaw
54398 0.7694429585002532 offshore
2587 1.0494498113942459 accounting
83983 0.8756074645931318 timejob
1 0.7794737530953701 000
45978 1.4066493742729074 link
70901 1.2020262877494345 rohan
3537 0.9737890429754346 administrative
8060 0.8073265859315331 au
25832 1.0480087026327678 earn


In [34]:
"""
# Uncomment to save results on drive to a csv file
df_results1 = pd.DataFrame(data=test_labels)
df_results1.reset_index(drop=True, inplace=True)
df_results2 = pd.DataFrame(data=pred_soft1, columns=['logistic'])
df_results = pd.concat([df_results1, df_results2], axis=1)

from google.colab import drive
drive.mount('/content/drive')
results_file='/content/drive/My Drive/Results/logistic.csv'

df_results.to_csv(results_file)
"""

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
