In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing the Data into pandas Data Frame**

In [None]:
# Location of the job-postings CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv'

# Read the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

**Remove Unrelated Attributes**

In [None]:
# List every column of the loaded frame before deciding which ones to drop.
df.columns

In [None]:
# Attributes that are discarded before modelling.
unused_columns = [
    'job_id', 'title', 'location', 'department', 'salary_range',
    'company_profile', 'requirements', 'benefits',
    'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
    'required_experience', 'required_education', 'industry', 'function',
]
df = df.drop(columns=unused_columns)

In [None]:
# Preview the frame after the column drop.
df.head()

**Drop Rows with Missing Values**

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

**Rename the Columns**

In [None]:
# Positional rename: the first remaining column becomes 'data', the second 'labels'.
new_names = ['data', 'labels']
df.columns = new_names
df.head()

**Lowercasing All Letters**

In [None]:
# Lowercase the text column, then show the entry stored under index label 0.
lowered = df['data'].str.lower()
df['data'] = lowered
df['data'][0]

**Text Pre-processing**

In [None]:
import nltk 

# Fetch the NLTK resources used by the preprocessing step.
# 'stopwords' backs stopwords.words("english"); 'punkt' is the tokenizer model
# used by nltk.word_tokenize on older NLTK releases.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (>= 3.8.2) load the tokenizer from 'punkt_tab' instead of
# the pickled 'punkt' model; without it word_tokenize raises LookupError.
# On older releases this call just reports that the package is unknown.
nltk.download('punkt_tab')

In [None]:
# Show the lowercased text stored under index label 1.
df['data'][1]

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Shared stemmer and English stopword set used by identify_tokens.
stemming = PorterStemmer()
stops = set(stopwords.words("english"))


def identify_tokens(row):
    """Clean one text and return it as a single space-separated string.

    The text is tokenised with ``nltk.word_tokenize``; tokens that are not
    purely alphabetic (punctuation, numbers, mixed tokens) or that appear in
    the English stopword set are discarded, and each remaining token is
    reduced to its Porter stem.

    Parameters
    ----------
    row : str
        The text to clean.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if none remain).
    """
    stemmed = []
    for token in nltk.word_tokenize(row):
        # Skip anything that is not a plain word, and skip stopwords.
        if not token.isalpha() or token in stops:
            continue
        stemmed.append(stemming.stem(token))
    return ' '.join(stemmed)

In [None]:
# Run the cleaning function over every text and store the result in a new column.
df['processed'] = df['data'].map(identify_tokens)

In [None]:
# Show the cleaned and stemmed text stored under index label 1.
df['processed'][1]

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Target vector: the label column as a NumPy array.
Y = np.asarray(df['labels'])

**TF-IDF Feature Extraction**

In [None]:
# Build the TF-IDF document-term matrix from the cleaned texts.
tfidf = TfidfVectorizer(decode_error='ignore')
corpus = df['processed'].values.astype('U')  # force a unicode string array
X = tfidf.fit_transform(corpus)
X.shape

In [None]:
# Label-based lookup: the index is not reset after dropna, so this shows the row whose
# original index label is 17876 (raises KeyError if that row was dropped).
df['processed'][17876]

**Train/Test Splitting**

In [None]:
# Hold out 30% of the samples for testing.
# random_state pins the shuffle so every model below is evaluated on the same split
# and the reported metrics are reproducible across runs; stratify=Y keeps the class
# proportions of the labels identical in the train and test parts.
xtrain,xtest,ytrain,ytest = train_test_split(X,Y,test_size=0.3,random_state=42,stratify=Y)

**Multinomial Naive Bayes**

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split (fit returns the estimator).
modelNB = MultinomialNB().fit(xtrain, ytrain)
modelNB

In [None]:
# Evaluate the Naive Bayes model on the held-out split.
NBPreds = modelNB.predict(xtest)
print('Confusion Matrix')
print(confusion_matrix(ytest, NBPreds))

print('Classification Report')
print(classification_report(ytest, NBPreds))

print("Accuracy:",metrics.accuracy_score(ytest, NBPreds))
# The previous code printed positive-class precision as "TNR" and positive-class
# recall as "NPV". The true definitions are computed on the negative class:
#   TNR (specificity) = TN / (TN + FP) -> recall of the negative class
#   NPV               = TN / (TN + FN) -> precision of the negative class
# NOTE(review): assumes the negative class is encoded as 0 — confirm against the labels.
print("TNR:",metrics.recall_score(ytest, NBPreds, pos_label=0))
print("NPV:",metrics.precision_score(ytest, NBPreds, pos_label=0))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

**Support Vector Machine**

In [None]:
# Train a support vector classifier with a linear kernel (fit returns the estimator).
clf = svm.SVC(kernel='linear').fit(xtrain, ytrain)
clf

In [None]:
# Predict labels for the held-out split with the fitted linear-kernel SVM.
SVMpred = clf.predict(xtest)

In [None]:
# Evaluate the SVM predictions on the held-out split.
print('Confusion Matrix')
print(confusion_matrix(ytest, SVMpred))
print('Classification Report')
print(classification_report(ytest, SVMpred))
print("Accuracy:",metrics.accuracy_score(ytest, SVMpred))
# The previous code printed positive-class precision as "TNR" and positive-class
# recall as "NPV". The true definitions are computed on the negative class:
#   TNR (specificity) = TN / (TN + FP) -> recall of the negative class
#   NPV               = TN / (TN + FN) -> precision of the negative class
# NOTE(review): assumes the negative class is encoded as 0 — confirm against the labels.
print("TNR:",metrics.recall_score(ytest, SVMpred, pos_label=0))
print("NPV:",metrics.precision_score(ytest, SVMpred, pos_label=0))

**Decision Tree**

In [None]:
# Fit a decision tree and predict on the held-out split.
# random_state fixes the tree's internal randomness so the fitted tree
# (and its reported metrics) are reproducible across runs.
DTC = DecisionTreeClassifier(random_state=42)
DTC = DTC.fit(xtrain,ytrain)
DTCPred = DTC.predict(xtest)

In [None]:
# Evaluate the decision-tree predictions on the held-out split.
print('Confusion Matrix')
print(confusion_matrix(ytest, DTCPred))
print('Classification Report')
print(classification_report(ytest, DTCPred))
print("Accuracy:",metrics.accuracy_score(ytest, DTCPred))
# The previous code printed positive-class precision as "TNR" and positive-class
# recall as "NPV". The true definitions are computed on the negative class:
#   TNR (specificity) = TN / (TN + FP) -> recall of the negative class
#   NPV               = TN / (TN + FN) -> precision of the negative class
# NOTE(review): assumes the negative class is encoded as 0 — confirm against the labels.
print("TNR:",metrics.recall_score(ytest, DTCPred, pos_label=0))
print("NPV:",metrics.precision_score(ytest, DTCPred, pos_label=0))

**K-Nearest Neighbors**

In [None]:
# Fit a 3-nearest-neighbours classifier and predict on the held-out split.
KNN = KNeighborsClassifier(n_neighbors=3).fit(xtrain, ytrain)
KNNPred = KNN.predict(xtest)

In [None]:
# Evaluate the k-nearest-neighbours predictions on the held-out split.
print('Confusion Matrix')
print(confusion_matrix(ytest, KNNPred))
print('Classification Report')
print(classification_report(ytest, KNNPred))
print("Accuracy:",metrics.accuracy_score(ytest, KNNPred))
# The previous code printed positive-class precision as "TNR" and positive-class
# recall as "NPV". The true definitions are computed on the negative class:
#   TNR (specificity) = TN / (TN + FP) -> recall of the negative class
#   NPV               = TN / (TN + FN) -> precision of the negative class
# NOTE(review): assumes the negative class is encoded as 0 — confirm against the labels.
print("TNR:",metrics.recall_score(ytest, KNNPred, pos_label=0))
print("NPV:",metrics.precision_score(ytest, KNNPred, pos_label=0))

**Random Forest**

In [None]:
# Fit a random forest and evaluate it on the held-out split.
# random_state fixes the bootstrap/feature sampling so the forest
# (and its reported metrics) are reproducible across runs.
RF=RandomForestClassifier(n_estimators=100, random_state=42) #default number of trees in the forest model
RF.fit(xtrain,ytrain)
RFPred=RF.predict(xtest)
print('Confusion Matrix')
print(confusion_matrix(ytest, RFPred))
print('Classification Report')
print(classification_report(ytest, RFPred))
print("Accuracy:",metrics.accuracy_score(ytest, RFPred))
# The previous code printed positive-class precision as "TNR" and positive-class
# recall as "NPV". The true definitions are computed on the negative class:
#   TNR (specificity) = TN / (TN + FP) -> recall of the negative class
#   NPV               = TN / (TN + FN) -> precision of the negative class
# NOTE(review): assumes the negative class is encoded as 0 — confirm against the labels.
print("TNR:",metrics.recall_score(ytest, RFPred, pos_label=0))
print("NPV:",metrics.precision_score(ytest, RFPred, pos_label=0))

**This is Paper-Related Code.**

Paper:

**Nasser, Ibrahim, and Amjad H. Alzaanin. "Machine Learning and Job Posting Classification: A Comparative Study." International Journal of Engineering and Information Systems (IJEAIS) ISSN (2020): 6-14.**