In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('fake_job_postings.csv')

In [3]:
df.head(2)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [5]:
df.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [6]:
df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [7]:
df['department'] = df['department'].fillna(df['department'].mode()[0])

In [8]:
data_cat_imp=SimpleImputer(strategy="constant",fill_value="Missing")
cat_imp_feature=["title","location","department","salary_range","company_profile","description","requirements","benefits",
                 "employment_type","required_experience","required_education","industry","function"]

# Filling the Numerical values through existing value
data_num_imp=SimpleImputer(strategy="constant",fill_value=None)
num_imp_feature =["job_id","telecommuting","has_company_logo","has_questions","fraudulent"]

# Transforming into column
data_imp_trans=ColumnTransformer([("data_cat_imp",data_cat_imp,cat_imp_feature),
                                 ("data_num_imp",data_num_imp,num_imp_feature)])

# Transforming and assigning the data
transformed_data=data_imp_trans.fit_transform(df)

In [9]:
transformed_data

array([['Marketing Intern', 'US, NY, New York', 'Marketing', ..., 1, 0,
        0],
       ['Customer Service - Cloud Video Production', 'NZ, , Auckland',
        'Success', ..., 1, 0, 0],
       ['Commissioning Machinery Assistant (CMA)', 'US, IA, Wever',
        'Sales', ..., 1, 0, 0],
       ...,
       ['Project Cost Control Staff Engineer - Cost Control Exp - TX',
        'US, TX, Houston', 'Sales', ..., 0, 0, 0],
       ['Graphic Designer', 'NG, LA, Lagos', 'Sales', ..., 0, 1, 0],
       ['Web Application Developers', 'NZ, N, Wellington', 'Engineering',
        ..., 1, 1, 0]], dtype=object)

In [10]:
df_transformed=pd.DataFrame(transformed_data,
                         columns=["title","location","department","salary_range","company_profile","description",
                                  "requirements","benefits", "employment_type","required_experience","required_education",
                                  "industry","function","job_id","telecommuting","has_company_logo","has_questions",
                                  "fraudulent"])

In [11]:
df_transformed.head(2)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,employment_type,required_experience,required_education,industry,function,job_id,telecommuting,has_company_logo,has_questions,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,Missing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Missing,Other,Internship,Missing,Missing,Marketing,1,0,1,0,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Missing,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,Full-time,Not Applicable,Missing,Marketing and Advertising,Customer Service,2,0,1,0,0


In [12]:
df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                17880 non-null  object
 1   location             17880 non-null  object
 2   department           17880 non-null  object
 3   salary_range         17880 non-null  object
 4   company_profile      17880 non-null  object
 5   description          17880 non-null  object
 6   requirements         17880 non-null  object
 7   benefits             17880 non-null  object
 8   employment_type      17880 non-null  object
 9   required_experience  17880 non-null  object
 10  required_education   17880 non-null  object
 11  industry             17880 non-null  object
 12  function             17880 non-null  object
 13  job_id               17880 non-null  object
 14  telecommuting        17880 non-null  object
 15  has_company_logo     17880 non-null  object
 16  has_

In [21]:
X = df_transformed.drop('fraudulent', axis=1)
y = df_transformed['fraudulent']
y = y.astype('int')

In [14]:
X.head(2)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,employment_type,required_experience,required_education,industry,function,job_id,telecommuting,has_company_logo,has_questions
0,Marketing Intern,"US, NY, New York",Marketing,Missing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Missing,Other,Internship,Missing,Missing,Marketing,1,0,1,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Missing,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,Full-time,Not Applicable,Missing,Marketing and Advertising,Customer Service,2,0,1,0


In [22]:
X = pd.get_dummies(X[cat_imp_feature], drop_first=True)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [26]:
rf = RandomForestClassifier()

In [27]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [28]:
pred = rf.predict(X_test)

In [31]:
print("Fake Job RF Accuracy:", accuracy_score(y_test, pred)*100)


Fake Job RF Accuracy: 97.8523489932886


In [30]:
classification_report(y_test, pred)

'              precision    recall  f1-score   support\n\n           0       0.98      1.00      0.99      4275\n           1       1.00      0.51      0.67       195\n\n    accuracy                           0.98      4470\n   macro avg       0.99      0.75      0.83      4470\nweighted avg       0.98      0.98      0.98      4470\n'