In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Default (width, height) in inches for every figure that does not set its own size.
plt.rcParams.update({'figure.figsize': (10, 8)})

In [None]:
# Read the job-postings CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics, transposed so that each column becomes one row.
df.describe().transpose()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Visualise where values are missing: one heatmap row per record, row labels hidden.
missing_mask = df.isnull()
sns.heatmap(missing_mask, yticklabels=False)

In [None]:
# Total number of entries (missing values included) in the 'department' column.
df['department'].shape[0]

In [None]:
# Print the share of missing values for every column that has at least one.
for col in df.columns.tolist():
    n_missing = df[col].isnull().sum()
    if n_missing > 0:
        pct_missing = n_missing * 100 / len(df[col])
        print('Feature', col, ': {:.2f}%'.format(pct_missing))

In [None]:
# Frequency of each job title.
title_counts = df['title'].value_counts()
title_counts

In [None]:
# Frequency of each location value.
location_counts = df['location'].value_counts()
location_counts

In [None]:
# Frequency of each department value.
department_counts = df['department'].value_counts()
department_counts

In [None]:
# Class balance of the target column.
fraud_counts = df['fraudulent'].value_counts()
fraud_counts

**There are 866 fake job profiles**

In [None]:
# Bar chart of how many postings fall into each 'telecommuting' value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='telecommuting', ax=ax)

In [None]:
# 'telecommuting' counts, with bars split by the 'fraudulent' target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='telecommuting', hue='fraudulent', ax=ax)

**From the figure above, it is clear that most of the job postings that do not offer telecommuting are not fake.**

In [None]:
from collections import Counter

# Tally how many postings have each 'has_company_logo' value.
logo_counts = Counter(df['has_company_logo'].tolist())
logo_counts

In [None]:
# 'has_company_logo' counts, with bars split by the 'fraudulent' target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='has_company_logo', hue='fraudulent', ax=ax)

In [None]:
# Frequency of each 'has_questions' value.
questions_counts = df['has_questions'].value_counts()
questions_counts

In [None]:
# 'has_questions' counts, with bars split by the 'fraudulent' target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='has_questions', hue='fraudulent', ax=ax)

In [None]:
# Frequency of each employment type (missing values are not counted).
employment_counts = df['employment_type'].value_counts()
employment_counts

In [None]:
# How many postings have no employment type recorded.
df['employment_type'].isna().sum()

In [None]:
# Discard every row whose 'employment_type' is missing.
df = df.dropna(subset=['employment_type'])

In [None]:
# (rows, columns) after removing rows with a missing 'employment_type'.
df.shape

In [None]:
# Frequency of each required-experience level (missing values are not counted).
experience_counts = df['required_experience'].value_counts()
experience_counts

In [None]:
# Discard every row whose 'required_experience' is missing.
df = df.dropna(subset=['required_experience'])

In [None]:
# Frequency of each required-education level (missing values are not counted).
education_counts = df['required_education'].value_counts()
education_counts

In [None]:
# Discard every row whose 'required_education' is missing.
df = df.dropna(subset=['required_education'])

In [None]:
# Columns to be removed from the frame: the identifier, free-text fields and
# the other columns that are not used for modelling.
features = [
    'job_id',
    'title',
    'location',
    'department',
    'salary_range',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'industry',
    'function',
]


In [None]:
# Remove the columns listed in `features` from the frame.
df = df.drop(columns=features)

In [None]:
# (rows, columns) after the columns listed in `features` were dropped.
df.shape

In [None]:
# Preview the remaining columns.
df.head()

In [None]:
# dtype of each remaining column; object columns are label-encoded further down.
df.dtypes

In [None]:
# Names of all object-dtype columns, i.e. the ones that still need numeric encoding.
object_frame = df.select_dtypes(include='object')
categorical = list(object_frame.columns)
categorical

In [None]:
# Convert each categorical (object) column into integer codes.
#
# A separate LabelEncoder is fitted for every column and stored in
# `label_encoders`, keyed by column name. Re-fitting a single shared encoder
# (as before) keeps only the mapping of the last column, so the encoded values
# of the other columns could never be decoded with `inverse_transform`.
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
le = LabelEncoder()  # kept for backward compatibility; ends up bound to the last fitted encoder
for col in categorical:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Renumber rows 0..n-1; the old index (with gaps left by dropna) is discarded.
df = df.reset_index(drop=True)

In [None]:
# Preview the frame after label encoding and index reset.
df.head()

In [None]:
# (rows, columns) of the fully preprocessed frame.
df.shape

In [None]:
# Pairwise correlation of all columns, annotated with two-decimal coefficients.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f')

In [None]:
# Class balance of the target after all rows with missing values were dropped.
fraud_counts_clean = df['fraudulent'].value_counts()
fraud_counts_clean

**The dataset is imbalanced, so random oversampling (`RandomOverSampler`) should be applied to balance the classes.**

In [None]:
# Separate the target column ('fraudulent') from the predictor columns.
target_col = 'fraudulent'
y = df[target_col]
X = df.drop(columns=target_col)

In [None]:
# Estimate feature importances with an extra-trees ensemble.
#
# `random_state` is fixed because ExtraTreesClassifier draws random split
# thresholds; without a seed the importances (and possibly their ranking)
# change on every run of the notebook.
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(random_state=42)
etc.fit(X, y)

# Importance of each feature, labelled with the corresponding column name.
score = pd.Series(etc.feature_importances_, index=X.columns)

In [None]:
# Bar chart of the five largest importances (five is nlargest's default).
top_scores = score.nlargest(5)
top_scores.plot.bar()

In [None]:
# Standardise every feature to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(X)
X_norm = ss.transform(X)

# Random Over Sampling

In [None]:
# Randomly duplicate minority-class rows until both classes have the same size.
# `random_state` is fixed so that the same rows are duplicated on every run;
# without it the resampled data (and every metric derived from it) changes
# from run to run.
from imblearn.over_sampling import RandomOverSampler

rs = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_over, y_over = rs.fit_resample(X_norm, y)

In [None]:
# Class counts before (y) and after (y_over) oversampling.
for labels in (y, y_over):
    print(Counter(labels))

In [None]:
# Build the train/test partitions WITHOUT data leakage.
#
# Splitting data that was already oversampled puts exact copies of the same
# minority rows into both partitions, and scaling with statistics from the full
# dataset lets test information reach the training data. Both inflate the test
# metrics. So: split the raw data first, then fit the scaler and the
# oversampler on the training partition only.
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Stratify so the held-out set keeps the real-world class ratio.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Scaling statistics come from the training rows only and are reused for the test rows.
split_scaler = StandardScaler().fit(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Balance the classes in the training partition only; the test set stays untouched.
X_train, y_train = RandomOverSampler(
    sampling_strategy='minority', random_state=42
).fit_resample(split_scaler.transform(X_train_raw), y_train_raw)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Class counts in the held-out test labels.
Counter(y_test)

# Model Creation

In [None]:
# Decision tree using information gain (entropy) as the split criterion.
# `random_state` is fixed because scikit-learn randomly permutes the features
# at each split, so equally good splits can be resolved differently from run
# to run; seeding makes the fitted tree and its metrics reproducible.
from sklearn.tree import DecisionTreeClassifier

dct = DecisionTreeClassifier(criterion='entropy', random_state=42)
dct.fit(X_train, y_train)

# Predictions on the test set, scored in the cells that follow.
yhat = dct.predict(X_test)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Accuracy and F1 of the decision-tree predictions on the test set.
dct_accuracy = accuracy_score(y_test, yhat)
dct_f1 = f1_score(y_test, yhat)
print('Accuracy: ', dct_accuracy)
print('F1 score: ', dct_f1)

In [None]:
# Confusion matrix of the decision-tree predictions, drawn as an annotated heatmap.
dct_cm = confusion_matrix(y_test, yhat)
sns.heatmap(dct_cm, annot=True, cmap='Blues', fmt='.0f')

In [None]:
# Per-class precision / recall / F1 for the decision tree.
dct_report = classification_report(y_test, yhat)
print(dct_report)

In [None]:
# Precision and recall of the decision-tree predictions.
dct_precision = precision_score(y_test, yhat)
dct_recall = recall_score(y_test, yhat)
print('Precision Score: ', dct_precision)
print('Recall Score: ', dct_recall)

In [None]:
# Random forest with 250 trees and a fixed seed.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=250, random_state=1).fit(X_train, y_train)

# Predictions on the test set, scored in the cells that follow.
yhat = rfc.predict(X_test)

In [None]:
# Accuracy and F1 of the random-forest predictions on the test set.
rfc_accuracy = accuracy_score(y_test, yhat)
rfc_f1 = f1_score(y_test, yhat)
print('Accuracy: ', rfc_accuracy)
print('F1 score: ', rfc_f1)

In [None]:
# Confusion matrix of the random-forest predictions, drawn as an annotated heatmap.
rfc_cm = confusion_matrix(y_test, yhat)
sns.heatmap(rfc_cm, annot=True, cmap='Blues', fmt='.0f')

In [None]:
# Per-class precision / recall / F1 for the random forest.
rfc_report = classification_report(y_test, yhat)
print(rfc_report)

In [None]:
# Precision and recall of the random-forest predictions.
rfc_precision = precision_score(y_test, yhat)
rfc_recall = recall_score(y_test, yhat)
print('Precision Score: ', rfc_precision)
print('Recall Score: ', rfc_recall)

In [None]:
# Support-vector classifier with an RBF kernel.
from sklearn.svm import SVC

svc = SVC(kernel='rbf').fit(X_train, y_train)

# Predictions on the test set, scored in the cells that follow.
yhat = svc.predict(X_test)

In [None]:
# Accuracy and F1 of the SVC predictions on the test set.
svc_accuracy = accuracy_score(y_test, yhat)
svc_f1 = f1_score(y_test, yhat)
print('Accuracy: ', svc_accuracy)
print('F1 score: ', svc_f1)

In [None]:
# Confusion matrix of the SVC predictions, drawn as an annotated heatmap.
svc_cm = confusion_matrix(y_test, yhat)
sns.heatmap(svc_cm, annot=True, cmap='Blues', fmt='.0f')

In [None]:
# Per-class precision / recall / F1 for the SVC.
svc_report = classification_report(y_test, yhat)
print(svc_report)

In [None]:
# Precision and recall of the SVC predictions.
svc_precision = precision_score(y_test, yhat)
svc_recall = recall_score(y_test, yhat)
print('Precision Score: ', svc_precision)
print('Recall Score: ', svc_recall)