**Random Forest CLassifier Model Implementation on the FakeVsReal Job postings Dataset**

Import libraries

In [0]:
#Importing all the required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score , classification_report, mean_squared_error, r2_score,accuracy_score,confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [0]:
#Importing the dataset
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/fake_job_postings.csv")

Data Exploration

In [3]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [5]:
df.dtypes.value_counts()

object    13
int64      5
dtype: int64

In [6]:
# Statistical Description
df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434
std,5161.655742,0.202631,0.403492,0.499945,0.214688
min,1.0,0.0,0.0,0.0,0.0
25%,4470.75,0.0,1.0,0.0,0.0
50%,8940.5,0.0,1.0,0.0,0.0
75%,13410.25,0.0,1.0,1.0,0.0
max,17880.0,1.0,1.0,1.0,1.0


In [7]:
#Generating the correlation matrix
corr_matrix = df.corr()
corr_matrix['job_id'].sort_values(ascending=False)

job_id              1.000000
fraudulent          0.079872
telecommuting      -0.004559
has_company_logo   -0.014539
has_questions      -0.087025
Name: job_id, dtype: float64

Handling missing values and NaNs

In [0]:
#Dropping all NaN values and unwanted column JOB_ID
df = df.dropna()
df = df.drop('job_id',axis=1)

In [0]:
#Dropping Text columns which can't be converted 
df = df.drop(['company_profile','description','benefits','requirements','salary_range'],axis=1)

In [10]:
#Making categorical variable to numbers
cat_var = ['title','location','department','employment_type','required_experience','required_education','industry','function']

le = LabelEncoder()
df[cat_var] = df[cat_var].apply(lambda x: le.fit_transform(x))

df[cat_var]

Unnamed: 0,title,location,department,employment_type,required_experience,required_education,industry,function
6,255,10,12,1,5,4,50,20
15,596,91,193,1,3,1,24,31
23,599,110,37,1,3,7,38,31
98,269,100,155,1,5,3,49,23
102,346,43,147,1,2,1,38,22
...,...,...,...,...,...,...,...,...
17734,138,197,66,1,2,3,55,7
17759,138,167,66,1,2,3,55,7
17813,138,178,66,1,2,3,55,7
17849,100,120,80,1,5,1,12,9


In [0]:
#Putting feature variable to X (target value)
X = df.drop('fraudulent',axis=1)

# Putting response variable to y (Fraudulent column is our target column)
y = df['fraudulent']

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [16]:
rfc = RandomForestClassifier()  
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
predictions = rfc.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       207
           1       1.00      0.65      0.79        26

    accuracy                           0.96       233
   macro avg       0.98      0.83      0.88       233
weighted avg       0.96      0.96      0.96       233



In [18]:
print(confusion_matrix(y_test,predictions))

[[207   0]
 [  9  17]]


In [19]:
print(accuracy_score(y_test,predictions))

0.9613733905579399
