# Interesting Problem
#### -How to predict whether a job posting is Fraudulent
#### -Which classifier (RandomForest, Logistic Regression or KNeighbours) is best for predicting if a job posting is fraudulent

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set()  # apply Seaborn's default theme to every later figure

import os

# Walk the dataset folder and print the full path of each file found in it
for folder, _, files in os.walk('../input/real-or-fake-fake-jobposting-prediction'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Read the job-postings CSV into the jobData frame
csv_path = '/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv'
jobData = pd.read_csv(csv_path)

# Preview the first rows
jobData.head()

In [None]:
# Print the per-column summary produced by DataFrame.info()
# (column names, non-null counts and dtypes)
jobData.info()

In [None]:
# Report the (rows, columns) shape of the dataset
data_dims = jobData.shape
print("Data dims is: ", data_dims)

In [None]:
# Flag rows that repeat an earlier row, then count the flags
duplicate_mask = jobData.duplicated()
duplicate_mask.sum()

In [None]:
# Number of missing values in each column
jobData.isna().sum()

In [None]:
# Substitute the placeholder string "NULL" for every missing value
jobData = jobData.fillna("NULL")

# Confirm that no missing values are left in any column
jobData.isna().sum()

In [None]:
# Class balance: how many postings carry each value of the fraudulent label
print("Number of real (label as 0) and fake jobs (label as 1) in the dataset :")

fraud_counts = jobData["fraudulent"].value_counts()
print(fraud_counts)

# The same counts drawn as a bar chart
sb.catplot(data=jobData, x="fraudulent", kind="count")

In [None]:
# Number of distinct values held by each column of interest
print("NUMBERS OF UNIQUE ELEMENTS IN EACH VARIABLES:")

# printed label -> column name, in the order the counts are reported
label_to_column = {
    "title": "title",
    "location": "location",
    "department": "department",
    "salary range": "salary_range",
    "company profile": "company_profile",
    "description": "description",
    "requirements": "requirements",
    "benefits": "benefits",
    "telecommuting": "telecommuting",
    "has company logo": "has_company_logo",
    "has questions": "has_questions",
    "employment type": "employment_type",
    "required experience": "required_experience",
    "required education": "required_education",
    "industry": "industry",
    "function": "function",
}

for label, column in label_to_column.items():
    print(f"{label}:", len(jobData[column].unique()))

# Analysis of Data


We decided not to use the variables title, location and department, as each of them has over 1000 categories, which makes them non-ideal and hard to work with.

-------------------------------------------------------------------------------

## Salary Range

In [None]:
# Separate two-column frame: salary_range together with the fraudulent label
salary_columns = ['salary_range', 'fraudulent']
salaryrange = jobData[salary_columns].copy()

Since there are 875 unique elements in this variable, we would like to know whether it will be useful to us by checking how many null and non-null values it contains.

In [None]:
# Collapse every value other than the 'NULL' placeholder into the single label "NOT NULL"
has_salary = salaryrange['salary_range'] != 'NULL'
salaryrange.loc[has_salary, 'salary_range'] = 'NOT NULL'

# Show how many postings fall into each of the two groups
salary_counts = salaryrange["salary_range"].value_counts()
print(salary_counts)


In [None]:
plt.figure(figsize=(15, 15))

# Grouped bar chart: salary present / absent, split by the fraudulent label
ax = sb.countplot(data=salaryrange, x="salary_range", hue="fraudulent")

# Chart title and enlarged y-axis label
ax.set_title("Number of null and non-null data in the variable salary range ", fontsize=20)
ax.set_ylabel("count", fontsize=20)

# Write each bar's height just above the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'\n{bar_height}',
        (bar.get_x() + .2, bar_height),
        ha='center', va='bottom', color='black', size=24,
    )

plt.show()

Since 15012 postings have a NULL salary_range and only 2868 have a non-null value, the variable salary_range will not be useful to us.

-------------------------------------------------------------------------------


For the variables company profile, description, requirements and benefits, we would like to compare the number of characters between the real and the fake job postings.

## Company Profile

In [None]:

fig, (fake_ax, real_ax) = plt.subplots(ncols=2, figsize=(17, 5), dpi=100)

# Character count of the company profile for each class of posting.
# NOTE: missing values were replaced by the string "NULL" earlier in the
# notebook, so a posting without a profile is counted as 4 characters.
fake_lengths = jobData.loc[jobData["fraudulent"] == 1, 'company_profile'].str.len()
real_lengths = jobData.loc[jobData["fraudulent"] == 0, 'company_profile'].str.len()

# Left panel: fraudulent postings
fake_ax.hist(fake_lengths, bins=20, color='orangered')
fake_ax.set_title('Fake Post')

# Right panel: genuine postings
real_ax.hist(real_lengths, bins=20)
real_ax.set_title('Real Post')

fig.suptitle('Characters in Company Profile')
plt.show()

We can see that fake postings have fewer characters in the variable company profile, and that a post with more than 1500 characters is likely to be a real post.

## Description

In [None]:
fig, (fake_ax, real_ax) = plt.subplots(ncols=2, figsize=(17, 5), dpi=100)

# Character count of the description for each class of posting
fake_lengths = jobData.loc[jobData["fraudulent"] == 1, 'description'].str.len()
real_lengths = jobData.loc[jobData["fraudulent"] == 0, 'description'].str.len()

# Left panel: fraudulent postings
fake_ax.hist(fake_lengths, bins=20, color='orangered')
fake_ax.set_title('Fake Post')

# Right panel: genuine postings
real_ax.hist(real_lengths, bins=20)
real_ax.set_title('Real Post')

fig.suptitle('Characters in Description')
plt.show()

The distributions of characters in the fake and the real job postings are similar, but some fake posts reach 6000 to 6500 characters.

## Requirements

In [None]:
fig, (fake_ax, real_ax) = plt.subplots(ncols=2, figsize=(17, 5), dpi=100)

# Character count of the requirements text for each class of posting
fake_lengths = jobData.loc[jobData["fraudulent"] == 1, 'requirements'].str.len()
real_lengths = jobData.loc[jobData["fraudulent"] == 0, 'requirements'].str.len()

# Left panel: fraudulent postings
fake_ax.hist(fake_lengths, bins=20, color='orangered')
fake_ax.set_title('Fake Post')

# Right panel: genuine postings
real_ax.hist(real_lengths, bins=20)
real_ax.set_title('Real Post')

fig.suptitle('Characters in Requirements')
plt.show()

The distribution for both real and fake job postings is similar

## Benefits

In [None]:
fig, (fake_ax, real_ax) = plt.subplots(ncols=2, figsize=(17, 5), dpi=100)

# Character count of the benefits text for each class of posting
fake_lengths = jobData.loc[jobData["fraudulent"] == 1, 'benefits'].str.len()
real_lengths = jobData.loc[jobData["fraudulent"] == 0, 'benefits'].str.len()

# Left panel: fraudulent postings
fake_ax.hist(fake_lengths, bins=20, color='orangered')
fake_ax.set_title('Fake Post')

# Right panel: genuine postings
real_ax.hist(real_lengths, bins=20)
real_ax.set_title('Real Post')

fig.suptitle('Characters in Benefits')
plt.show()

-------------------------------------------------------------------------------


For the variables telecommuting, has company logo, has questions, employment type, required experience and required education we would like to know the ratio of real vs fake job postings for each unique element.

## Telecommuting

In [None]:
# Grouped bar chart of telecommuting, split by the fraudulent label
plt.figure(figsize=(15, 15))
ax = sb.countplot(data=jobData, x="telecommuting", hue="fraudulent")
ax.set_title("Number of Real and Fake job posts in the variable telecommuting ", fontsize=20)

# Axis labels with an enlarged font
ax.set_xlabel("Telecommuting", fontsize=20)
ax.set_ylabel("count", fontsize=20)

# Write each bar's height just above the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'\n{bar_height}',
        (bar.get_x() + 0.2, bar_height),
        ha='center', va='bottom', color='black', size=24,
    )

plt.show()

## Has Company Logo

In [None]:
# Grouped bar chart of has_company_logo, split by the fraudulent label
plt.figure(figsize=(15, 15))
ax = sb.countplot(data=jobData, x="has_company_logo", hue="fraudulent")

# Title and axis labels with an enlarged font
ax.set_title("Number of Real and Fake job posts in the variable has company logo ", fontsize=20)
ax.set_xlabel("has company logo", fontsize=20)
ax.set_ylabel("count", fontsize=20)

# Write each bar's height just above the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'\n{bar_height}',
        (bar.get_x() + 0.2, bar_height),
        ha='center', va='bottom', color='black', size=24,
    )

plt.show()

## Has Questions

In [None]:
# Grouped bar chart of has_questions, split by the fraudulent label
plt.figure(figsize=(15, 15))
ax = sb.countplot(data=jobData, x="has_questions", hue="fraudulent")

# Title and axis labels with an enlarged font
ax.set_title("Number of Real and Fake job posts in the variable has questions ", fontsize=20)
ax.set_xlabel("has questions", fontsize=20)
ax.set_ylabel("count", fontsize=20)

# Write each bar's height just above the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'\n{bar_height}',
        (bar.get_x() + 0.2, bar_height),
        ha='center', va='bottom', color='black', size=24,
    )

plt.show()

From the variables telecommuting, has company logo and has questions, we can observe that fake job postings fall mostly into category '0' of these variables.

## Employment Type

In [None]:
# Grouped bar chart of employment_type, split by the fraudulent label
plt.figure(figsize=(15, 15))
ax = sb.countplot(data=jobData, x="employment_type", hue="fraudulent")

# Title and axis labels with an enlarged font
ax.set_title("Number of Real and Fake job posts in the variable employment type ", fontsize=20)
ax.set_xlabel("employment type", fontsize=20)
ax.set_ylabel("count", fontsize=20)

# Write each bar's height just above the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'\n{bar_height}',
        (bar.get_x() + .2, bar_height),
        ha='center', va='bottom', color='black', size=24,
    )

plt.show()

We can observe that most Fake job postings are in the elements Full-time and NULL

## Required Experience

In [None]:
# Horizontal grouped bar chart: one pair of bars per required-experience level,
# split by the fraudulent label
plt.figure(figsize=(15,15))
ax=sb.countplot(y ="required_experience", hue="fraudulent", data=jobData)

# Title and axis labels with an enlarged font
ax.set_title("Number of Real and Fake job posts in the variable required experience ", fontsize = 20)
plt.xlabel("count",fontsize=20)
plt.ylabel("required experience", fontsize=20)

# Enlarge the category names on the y axis. tick_params only changes the font
# size, so the tick labels themselves are never re-assigned.
ax.tick_params(axis='y', labelsize=15)

# Write the count at the end of each bar
for p in ax.patches:
    width = p.get_width()
    # int() raises on a non-finite width, and a zero-width bar has no count
    # worth printing, so both are skipped.
    if not np.isfinite(width) or width == 0:
        continue
    # bar_x / bar_y (not x / y) so the loop does not overwrite the names the
    # notebook later uses for the feature and target frames.
    bar_x, bar_y = p.get_xy()
    ax.annotate(str(int(width)),
                ((bar_x + width), bar_y),
                xytext = (40, -25),
                fontsize = 18,
                color = '#000000',
                textcoords = 'offset points',
                ha = 'right',
                va = 'center')

plt.show()

Most of the Fake job postings are in the elements NULL, Mid-senior level and Entry Level

## Required Education

In [None]:
# Horizontal grouped bar chart: one pair of bars per required-education level,
# split by the fraudulent label
plt.figure(figsize=(10,15))
ax=sb.countplot(y ="required_education", hue="fraudulent", data=jobData)

# Title and axis labels with an enlarged font
ax.set_title("Number of Real and Fake job posts in the variable required education ", fontsize = 20)
plt.xlabel("count",fontsize=20)
plt.ylabel("required education", fontsize=20)

# Put the legend out of the figure
plt.legend(bbox_to_anchor=(1, 1.1),title='Fraudulent', loc=2, borderaxespad=0.)

# Enlarge the category names on the y axis. tick_params only changes the font
# size, so the tick labels themselves are never re-assigned.
ax.tick_params(axis='y', labelsize=15)

# Write the count at the end of each bar
for p in ax.patches:
    width = p.get_width()
    # A bar with a non-finite or zero width has no count to show; skipping it
    # also lets the label be formatted as an integer instead of "123.0".
    if not np.isfinite(width) or width == 0:
        continue
    # bar_x / bar_y (not x / y) so the loop does not overwrite the names the
    # notebook later uses for the feature and target frames.
    bar_x, bar_y = p.get_xy()
    ax.annotate(str(int(width)),
                ((bar_x + width), bar_y),
                xytext = (40, -15),
                fontsize = 16,
                color = '#000000',
                textcoords = 'offset points',
                ha = 'center',
                va = 'center')

plt.show()

Most of the fake job postings are in the elements NULL, Bachelor's degree and High school or equivalent.

-------------------------------------------------------------------------------

For the variable industry, we would like to know which elements contain most of the fake job postings.

## Industry

In [None]:
# Keep only the rows labelled as fraudulent
fake_mask = jobData['fraudulent'] == 1
jobData_industry = jobData.loc[fake_mask]

# Raise pandas' row-display limit so the full list of counts is printed
pd.options.display.max_rows = len(jobData) + 1

# Number of fraudulent postings per 'industry' value
print("The number of fake job postings in the following elements are:\n")
industry_counts = jobData_industry['industry'].value_counts()
print(industry_counts)

 
 
Most of the fake job postings in industry lie in the elements NULL and Oil and energy.

In [None]:
# Keep only the rows labelled as fraudulent
jobData_function = jobData[jobData['fraudulent']== 1]

# Raise pandas' row-display limit so the full list of counts is printed
pd.set_option('display.max_rows', jobData.shape[0]+1)

# Number of fraudulent postings per 'function' value.
# Reads from jobData_function, the frame built in this cell, so the cell does
# not depend on a variable created by the industry cell.
print("The number of fake job postings in the following elements are:\n")
print(jobData_function['function'].value_counts())

Most of the fake job postings lie in the elements NULL, Administrative and Engineering.

# Prediction

From the analysis of the data, we feel that the following variables are useful in helping us with the prediction.

In [None]:
# Columns kept for modelling: eight predictors followed by the fraudulent label
model_columns = [
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
    'fraudulent',
]
predictionData = jobData[model_columns].copy()
predictionData.head()

### Preparing of data

In [None]:
# LabelEncoder maps each distinct value of a column to an integer code
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Re-fit the same encoder on every object-typed column and store the codes;
# columns of any other dtype are left untouched
for column in predictionData.columns:
    if predictionData[column].dtype != 'object':
        continue
    predictionData[column] = le.fit_transform(predictionData[column])

predictionData.head()

In [None]:
from sklearn.model_selection import train_test_split

# Predictors (x) and target (y); both are kept as DataFrames
x = pd.DataFrame(predictionData[['telecommuting','has_company_logo','has_questions','employment_type','required_experience','required_education','industry','function']])
y = pd.DataFrame(predictionData['fraudulent'])

# Split the Dataset into Train and Test (75% / 25%).
# random_state makes the split, and every score computed from it, reproducible
# on re-run; stratify=y keeps the share of fake postings the same in both
# splits, which matters because the fraudulent class is rare.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.25, random_state = 42, stratify = y)

# Check the sample sizes
print("Train Set :", y_train.shape, x_train.shape)
print("Test Set  :", y_test.shape, x_test.shape)



### Balancing our dataset with SMOTE()

In [None]:
#SMOTE (oversampling of the minority class, i.e. the fake job postings)
#to install new library imblearn
#!pip install imblearn 

#importing libraries needed for SMOTE
from sklearn.utils import resample
import imblearn
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic minority samples are the same on every run
sm = SMOTE(random_state=42)

# Oversample the training split only; x_test / y_test are not resampled
X , Y= sm.fit_resample(x_train,y_train)

# Class counts of the resampled training target
print("Number of real (label as 0) and fake jobs (label as 1) in the new dataset :")
print(Y["fraudulent"].value_counts())

# Bar chart of the class counts in the resampled training target Y
sb.catplot(x="fraudulent", data = Y, kind = "count")

## Prediction with Random Forest Classifier

### Fitting train data into random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the fitted forest, and the scores reported for it, are
# reproducible across runs
rfc=RandomForestClassifier(random_state=42)

# Fit on the resampled training data; the target is flattened to 1-D
rfc.fit(X,Y.values.ravel())

### Predicting train datas

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict the fraudulent label for the resampled training predictors
y_train_predrfc = rfc.predict(X)

# Accuracy on the training set
print("The accuracy score for Random Forest classifier for the train set is:" )
train_accuracy_rfc = accuracy_score(Y, y_train_predrfc)
print(train_accuracy_rfc)

# Per-class precision / recall / f1
print(classification_report(Y, y_train_predrfc))

# Confusion matrix for the training set, drawn as an annotated heatmap
train_cm_rfc = confusion_matrix(Y, y_train_predrfc)
sb.heatmap(train_cm_rfc, annot=True, fmt=".0f", annot_kws={"size": 18})

### Predicting test datas

In [None]:
# Predict the fraudulent label for the held-out test predictors
y_test_predrfc = rfc.predict(x_test)

# Accuracy on the test set
print("The accuracy score for Random Forest classifier for the test set is:" )
test_accuracy_rfc = accuracy_score(y_test, y_test_predrfc)
print(test_accuracy_rfc)

# Per-class precision / recall / f1
print(classification_report(y_test, y_test_predrfc))

# Confusion matrix for the test set, drawn as an annotated heatmap
test_cm_rfc = confusion_matrix(y_test, y_test_predrfc)
sb.heatmap(test_cm_rfc, annot=True, fmt=".0f", annot_kws={"size": 18})

## Prediction with Logistic Regression

In [None]:
# Logistic regression with the lbfgs solver, allowed up to 10000 iterations
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=10000, solver='lbfgs')

In [None]:
# Fit the logistic regression on the resampled training data
# (the target frame is flattened to a 1-D array first)
train_target = np.ravel(Y)
lr.fit(X, train_target)

### Predicting train datas

In [None]:
# Predict the fraudulent label for the resampled training predictors
y_train_predlr = lr.predict(X)

# Accuracy on the training set
print("The accuracy score for logistic regression for the train set is:" )
train_accuracy_lr = accuracy_score(Y, y_train_predlr)
print(train_accuracy_lr)

# Per-class precision / recall / f1
print(classification_report(Y, y_train_predlr))

# Confusion matrix for the training set, drawn as an annotated heatmap
train_cm_lr = confusion_matrix(Y, y_train_predlr)
sb.heatmap(train_cm_lr, annot=True, fmt=".0f", annot_kws={"size": 18})

### Predicting test datas

In [None]:
# Predict the fraudulent label for the held-out test predictors
y_test_predlr = lr.predict(x_test)

# Accuracy on the test set
print("The accuracy score for logistic regression for the test set is:" )
test_accuracy_lr = accuracy_score(y_test, y_test_predlr)
print(test_accuracy_lr)

# Per-class precision / recall / f1
print(classification_report(y_test, y_test_predlr))

# Confusion matrix for the test set, drawn as an annotated heatmap
test_cm_lr = confusion_matrix(y_test, y_test_predlr)
sb.heatmap(test_cm_lr, annot=True, fmt=".0f", annot_kws={"size": 18})

## Prediction with KNeighbours Classifier

In [None]:
# K-nearest-neighbours classifier with scikit-learn's default settings
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

# Fit on the resampled training data; the target frame is flattened to 1-D
knn.fit(X, Y.values.ravel())

### Predicting train datas

In [None]:
# Predict the fraudulent label for the resampled training predictors
y_train_predknn = knn.predict(X)

# Accuracy on the training set
print("The accuracy score for KNeighbours classifier for the train set is:" )
train_accuracy_knn = accuracy_score(Y, y_train_predknn)
print(train_accuracy_knn)

# Per-class precision / recall / f1
print(classification_report(Y, y_train_predknn))

# Confusion matrix for the training set, drawn as an annotated heatmap
train_cm_knn = confusion_matrix(Y, y_train_predknn)
sb.heatmap(train_cm_knn, annot=True, fmt=".0f", annot_kws={"size": 18})

### Predicting test datas

In [None]:
# Predict the fraudulent label for the held-out test predictors
y_test_predknn = knn.predict(x_test)

# Accuracy on the test set
print("The accuracy score for KNeighbours classifier for the test set is:" )
test_accuracy_knn = accuracy_score(y_test, y_test_predknn)
print(test_accuracy_knn)

# Per-class precision / recall / f1
print(classification_report(y_test, y_test_predknn))

# Confusion matrix for the test set, drawn as an annotated heatmap
test_cm_knn = confusion_matrix(y_test, y_test_predknn)
sb.heatmap(test_cm_knn, annot=True, fmt=".0f", annot_kws={"size": 18})

### Which classifier is best at predicting whether a job posting is fraudulent

Judging by the F1 score, the Random Forest Classifier gives us the best score, 0.76. It also has the highest accuracy, 93%. Thus, it is the best of the three classifiers.