In [1]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from gensim import utils
import gensim.parsing.preprocessing as gsp
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

In [2]:
fake_real_data = pd.read_csv("fake_job_postings.csv")

In [3]:
fake_real_data.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
fake_real_data.iloc[0,6]

'Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systemsResearching blogs and websites for the Provisions by Food52 Affiliate ProgramAssisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiriesSupporting with PR &amp; Events when neededHelping with office administrative work, such as filing, mailing, and preparing for meetingsWorking with developers to document bugs and suggest improvements to the siteSupporting the marketing and executive staff'

In [5]:
fake_real_data = fake_real_data.dropna()

In [6]:
#Gensim preprocessing filters to remove numberc values, tags, punctuation, multiple whitespaces and stopwords
filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]
#Function to clean the description
def clean_text(data):
    data = data.lower()
    data = utils.to_unicode(data)
    for fil in filters:
        data = fil(data)
    return data

In [7]:
#Example run of clean text
clean_text(fake_real_data.iloc[0,6])

'respons manag english speak editori team build team best class editorsset content creation schedul ensur deadlin adher toresearch write latest tech topic new relat android ecosystemensur content site consist high qualityb face voic url adbddeccedeefeceeaa'

In [8]:
#Split the data into x and Y
train_data = fake_real_data['description']
test_data = fake_real_data['fraudulent']
print("len of df_x",len(train_data))
print("len of df_y",len(test_data))


len of df_x 774
len of df_y 774


In [9]:
#Vectorize the attributes using TF IDF format to weight each word in a document
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_transformer = TfidfVectorizer()
#Clean the text and remove stop words
train_data = train_data.apply(lambda x : clean_text(x))
#Fit the model and transform based of TFIDF vectorization
tfidf_vectors = tfidf_transformer.fit(train_data).transform(train_data)

In [10]:
tfidf_vectors.shape

(774, 6226)

In [12]:
#Use pipeline to classify documents based on the attributes and find the accuracy score with respect to data_y
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
predict_tf_idf = Pipeline(steps=[('tfidf',tfidf_transformer),
                         ('xgboost', XGBClassifier(objective='binary:logistic'))])
#cross validate the prediction of pipeline with test column
scores = cross_val_score(predict_tf_idf, train_data, test_data, cv=5)
print('Accuracy for Tf-Idf & XGBoost Classifier using description : ', scores.mean())

Accuracy for Tf-Idf & XGBoost Classifier using description :  0.9586761625471303


In [13]:
train_data = fake_real_data['company_profile']
test_data = fake_real_data['fraudulent']
print("len of df_x",len(train_data))
print("len of df_y",len(test_data))

len of df_x 774
len of df_y 774


In [14]:
tfidf_transformer = TfidfVectorizer()
#Clean the text and remove stop words
train_data = train_data.apply(lambda x : clean_text(x))
#Fit the model and transform based of TFIDF vectorization
tfidf_vectors = tfidf_transformer.fit(train_data).transform(train_data)

In [15]:
tfidf_vectors.shape

(774, 3025)

In [16]:
predict_tf_idf = Pipeline(steps=[('tfidf',tfidf_transformer),
                         ('xgboost', XGBClassifier(objective='binary:logistic'))])
scores = cross_val_score(predict_tf_idf, train_data, test_data, cv=5)
print('Accuracy for Tf-Idf & XGBoost Classifier using company profile : ', scores.mean())

Accuracy for Tf-Idf & XGBoost Classifier using company profile :  0.9935483870967742
