# Fake Job Postings

In [1]:
import os
import numpy as np
import pandas as pd


# Paths and Variables

In [2]:
# Short identifier for this dataset; reused below to build output folder and file names.
dataset_name = "fake_job_postings"

In [3]:
# Raw input lives in ./data; processed output goes to a per-dataset folder.
input_dir = './data'
output_dir = f'./../../processed/{dataset_name}/'

# Location for an (optional) uncompressed CSV copy of the prepared data.
outp_fname = os.path.join(output_dir, dataset_name + '.csv')

# Create the output folder up front; exist_ok=True makes re-running this cell safe.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Name of the raw CSV file expected inside `input_dir`.
inp_fname = "fake_job_postings.csv"

# Read data into a DataFrame

In [5]:
# Load the raw postings file and show its dimensions plus the first few rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
print(data.shape)
data.head()

(17880, 18)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


# Prepare Data

In [6]:
# Column that uniquely identifies a posting.
id_col = 'job_id'
# Label column to predict.
target_col = 'fraudulent'
# Name of the derived text column: company_profile, description and requirements
# from the original data, joined into a single string.
text_col = 'company_profile_description_requirements'

In [7]:
# Replace missing values in each free-text field with an explicit placeholder
# string, so every row has text in all three fields before they are joined.
nan_placeholders = {
    'company_profile': "company profile not available",
    'description': "description not available",
    'requirements': "requirements not available",
}
for field, placeholder in nan_placeholders.items():
    data[field] = data[field].fillna(placeholder)

In [8]:
# Count how many rows fall into each class of the target column.
target_counts = data[target_col].value_counts()
target_counts

fraudulent
0    17014
1      866
Name: count, dtype: int64

In [9]:
# Build the combined text column by joining the three free-text fields with
# single spaces.
#
# This uses vectorised Series concatenation instead of a row-wise
# `DataFrame.apply(..., axis=1)`, which invokes a Python lambda once per row.
# `astype(str)` mirrors the explicit `str(...)` casts of the row-wise version;
# the three fields have already had their missing values replaced with
# placeholder strings above, so every element is a real string here.
data[text_col] = (
    data['company_profile'].astype(str)
    + " " + data['description'].astype(str)
    + " " + data['requirements'].astype(str)
)

In [10]:
# Preview the first five rows, now including the combined text column.
data.head(n=5)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,company_profile_description_requirements
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,"We're Food52, and we've created a groundbreaki..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,"90 Seconds, the worlds Cloud Video Production ..."
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,Valor Services provides Workforce Solutions th...
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,Our passion for improving quality of life thro...
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,SpotSource Solutions LLC is a Global Human Cap...


# Shuffle Data

In [11]:
# Randomly permute all rows (frac=1 samples every row). The fixed seed keeps
# the order reproducible; original index labels are retained.
shuffle_seed = 42
data = data.sample(frac=1, random_state=shuffle_seed)
print(data.shape)
data.head()

(17880, 19)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,company_profile_description_requirements
4708,4709,Python Engineer,"GB, , London",,,company profile not available,Stylect is a dynamic startup that helps helps ...,We don’t care where you studied or what your G...,We are negotiable on salary and there is the p...,0,1,0,Full-time,Entry level,Unspecified,Apparel & Fashion,Information Technology,0,company profile not available Stylect is a dyn...
11079,11080,Entry Level Sales,"US, OH, Cincinnati",,55000-75000,company profile not available,General Summary: Achieves maximum sales profit...,requirements not available,Great Health and DentalFast Advancement Opport...,1,0,0,Full-time,Entry level,High School or equivalent,Financial Services,Sales,0,company profile not available General Summary:...
12357,12358,Agile Project Manager,"US, NY, New York",,,ustwo offers you the opportunity to be yoursel...,"At ustwo™ you get to be yourself, whilst deliv...",Skills• Experience interfacing directly with c...,,0,1,0,,,,,,0,ustwo offers you the opportunity to be yoursel...
14511,14512,Marketing Coordinator,"GB, GBN, London",Business:Marketing,,"We build software for fashion retailers, to he...",About EDITDEDITD runs the world's biggest appa...,Required Skills / Experience:Ability to analys...,,0,1,0,Full-time,,,,Marketing,0,"We build software for fashion retailers, to he..."
16691,16692,Full-stack Web Engineer,"US, CA, San Francisco",,,Runscope is building tools for developers work...,As a Web Engineer at Runscope you'll be respon...,Extensive front-end web experience (HTML/CSS/J...,Be a part of an experienced team who have work...,0,1,1,Full-time,Mid-Senior level,,,Engineering,0,Runscope is building tools for developers work...


# Insert Id Column

In [12]:
# If the data has no id column yet, add one as the first column,
# numbering rows 0..n-1 in their current (shuffled) order.
id_missing = id_col not in data.columns
if id_missing:
    data.insert(0, id_col, np.arange(len(data)))
    print(data.head())

In [13]:
# Keep only the columns needed downstream: id, label and combined text.
keep_cols = [id_col, target_col, text_col]
data = data[keep_cols]
data.head()

Unnamed: 0,job_id,fraudulent,company_profile_description_requirements
4708,4709,0,company profile not available Stylect is a dyn...
11079,11080,0,company profile not available General Summary:...
12357,12358,0,ustwo offers you the opportunity to be yoursel...
14511,14512,0,"We build software for fashion retailers, to he..."
16691,16692,0,Runscope is building tools for developers work...


In [14]:
# Number of distinct values in the id, target and text columns (in that order).
tuple(data[col].nunique() for col in (id_col, target_col, text_col))

(17880, 2, 15350)

# Utility to Save DF as a zipped file

In [15]:
def save_df_to_zipped_csv(df, ftype=None):
    """Write `df` as a CSV file inside a zip archive under `output_dir`.

    The archive is named ``{dataset_name}{suffix}.zip`` and the CSV inside it
    ``{dataset_name}{suffix}.csv``. The suffix is ``_{ftype}`` when `ftype`
    is given and empty when it is None. The DataFrame index is not written.

    Relies on the notebook-level variables `dataset_name` and `output_dir`.

    Args:
        df: DataFrame to save.
        ftype: Optional tag appended to the file stem (e.g. "train").
    """
    stem = dataset_name if ftype is None else f'{dataset_name}_{ftype}'
    zip_path = os.path.join(output_dir, f'{stem}.zip')
    df.to_csv(
        zip_path,
        index=False,
        # archive_name sets the name of the CSV stored inside the zip.
        compression={'method': 'zip', 'archive_name': f'{stem}.csv'},
    )

# Save Main Data File

In [16]:
# Persist the full prepared dataset (id, target, text) as a zipped CSV.
# An uncompressed copy could instead be written with
# `data.to_csv(outp_fname, index=False)`; that step is intentionally disabled.
save_df_to_zipped_csv(data, ftype=None)

# Train Test Split

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on the target although the classes
# are imbalanced (see the value counts above) - confirm this is intended.
test_size = 0.1
data_train, data_test = train_test_split(
    data, random_state=42, test_size=test_size
)

# The answer key keeps id + label; the test set itself is stripped of the label.
data_test_key = data_test.loc[:, [id_col, target_col]].copy()
data_test = data_test.drop(columns=target_col)
print(data_train.shape, data_test.shape, data_test_key.shape)

# Uncompressed CSV copies of the splits are not written; the zipped versions
# are saved in the next cell.


(16092, 3) (1788, 2) (1788, 2)


In [18]:
# Write each split to its own zipped CSV, tagged with the split name.
for split_df, split_tag in (
    (data_train, "train"),
    (data_test, "test"),
    (data_test_key, "test_key"),
):
    save_df_to_zipped_csv(split_df, split_tag)