## Importing the libraries 

In [1]:
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
from nltk.tokenize import RegexpTokenizer
import nltk
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix 
from imblearn.combine import SMOTETomek
import string
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

nltk.download('stopwords')


stop_words = set(stopwords.words("english"))
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english')
default_tokenizer=RegexpTokenizer(r"\w+")

  from pandas import MultiIndex, Int64Index
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sadpa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preparation for modelling

In [2]:
data = pd.read_csv("fake_job_postings.csv")
data.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
data.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434
std,5161.655742,0.202631,0.403492,0.499945,0.214688
min,1.0,0.0,0.0,0.0,0.0
25%,4470.75,0.0,1.0,0.0,0.0
50%,8940.5,0.0,1.0,0.0,0.0
75%,13410.25,0.0,1.0,1.0,0.0
max,17880.0,1.0,1.0,1.0,1.0


In [4]:
data = data.replace(np.nan, '', regex=True)

In [5]:
data.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [6]:
data.dtypes

job_id                  int64
title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object

In [7]:
data['text'] = data[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

In [8]:
data.drop(['job_id', 'location','title','salary_range' ,'department','salary_range','company_profile','description','requirements','benefits'], axis=1, inplace=True)

In [9]:
data.columns
data_columns = data.columns.tolist()


In [10]:
from sklearn.preprocessing import LabelEncoder
label_columns = ['telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function']
lb_make = LabelEncoder()
for i in label_columns:
  data[i] = lb_make.fit_transform(data[i])

In [11]:
data_columns = data_columns[-1:] + data_columns[:-1]
data = data[data_columns]
data.head()

Unnamed: 0,text,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,"Marketing Intern Marketing We're Food52, and w...",0,1,0,3,5,0,0,23,0
1,Customer Service - Cloud Video Production Succ...,0,1,0,2,7,0,76,8,0
2,Commissioning Machinery Assistant (CMA) Valor...,0,1,0,0,0,0,0,0,0
3,Account Executive - Washington DC Sales Our pa...,0,1,0,2,6,2,23,32,0
4,Bill Review Manager SpotSource Solutions LLC ...,0,1,1,2,6,2,52,17,0


In [12]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [13]:
## Defining the utility functions

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',str(text))


def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', str(text))


def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',str(text))


def remove_punctuation(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def final_preprocess(text):
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    text = ' '.join(e for e in text.split() if e.lower() not in stopwords)
    text = text.lower()
    ps = PorterStemmer()
    text = ps.stem(text)
    return text
    

In [14]:
#Cleaning

data['text'] = remove_URL(str(data['text']))
data['text'] = remove_emoji(str(data['text']))
data['text'] = remove_html(str(data['text']))
data['text'] = remove_punctuation(str(data['text']))
data['text'] = final_preprocess(str(data['text']))

In [15]:
data.head()

Unnamed: 0,text,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,0 0 0 0 0 marketing 1 0 0 0 0 marketing 2 0 0 ...,0,1,0,3,5,0,0,23,0
1,0 0 0 0 0 marketing 1 0 0 0 0 marketing 2 0 0 ...,0,1,0,2,7,0,76,8,0
2,0 0 0 0 0 marketing 1 0 0 0 0 marketing 2 0 0 ...,0,1,0,0,0,0,0,0,0
3,0 0 0 0 0 marketing 1 0 0 0 0 marketing 2 0 0 ...,0,1,0,2,6,2,23,32,0
4,0 0 0 0 0 marketing 1 0 0 0 0 marketing 2 0 0 ...,0,1,1,2,6,2,52,17,0


Download glove.840B.300d.txt from https://www.kaggle.com/takuok/glove840b300dtxt

In [17]:
## Making a dictionary of the words and their vector representation

embeddings_index = {}
f = open('glove.840B.300d.txt', encoding = 'utf-8')
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vectors representing the embedding for the word
    embeddings_index[word] = coefs
f.close()


In [18]:
glove_words =  set(embeddings_index.keys())

'''
Below is a uliity function that takes sentenes as a input and return the vector representation of the same
Method adopted is similar to average word2vec. Where i am summing up all the vector representation of the words from the glove and 
then taking the average by dividing with the number of words involved
'''

def convert_sen_to_vec(sentence):
    vector = np.zeros(300) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence
    for word in sentence.split():
        if word in glove_words:
            vector += embeddings_index[word]
            cnt_words += 1
    if cnt_words != 0:
        vector /= cnt_words
    return vector

In [19]:
converted_data = []

for i in range(0, data.shape[0]):
    converted_data.append(convert_sen_to_vec(data['text'][i]))

# data['text'] = data['text'].apply(lambda x : convert_sen_to_vec(x))
# converted_data = data['text']

# df1 = pd.DataFrame(converted_data)
# #data.drop(["text"], axis=1, inplace=True)
# main_data = pd.concat([df1,data], axis=1)

_1 = pd.DataFrame(converted_data)

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

data[['required_education', 'required_experience', 'employment_type']] = StandardScaler().fit_transform(data[['required_education', 'required_experience', 'employment_type']])


In [21]:
data.drop(["text"], axis=1, inplace=True)
main_data = pd.concat([_1,data], axis=1)

In [22]:
main_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,299,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,-0.244736,0.450755,-0.129216,0.058018,-0.178774,-0.024379,-0.190768,-0.439917,0.358415,0.494303,...,0.18092,0,1,0,1.290338,0.965807,-0.756563,0,23,0
1,-0.244736,0.450755,-0.129216,0.058018,-0.178774,-0.024379,-0.190768,-0.439917,0.358415,0.494303,...,0.18092,0,1,0,0.32104,1.731617,-0.756563,76,8,0
2,-0.244736,0.450755,-0.129216,0.058018,-0.178774,-0.024379,-0.190768,-0.439917,0.358415,0.494303,...,0.18092,0,1,0,-1.617558,-0.948718,-0.756563,0,0,0
3,-0.244736,0.450755,-0.129216,0.058018,-0.178774,-0.024379,-0.190768,-0.439917,0.358415,0.494303,...,0.18092,0,1,0,0.32104,1.348712,-0.086046,23,32,0
4,-0.244736,0.450755,-0.129216,0.058018,-0.178774,-0.024379,-0.190768,-0.439917,0.358415,0.494303,...,0.18092,0,1,1,0.32104,1.348712,-0.086046,52,17,0


In [23]:
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

### Now since fraudenlent data is only 5%. So we need to upsample the data

In [25]:
from imblearn.over_sampling import SMOTE

In [28]:
smk  = SMOTETomek(random_state=42)
X_res,Y_res= smk.fit_resample(X,Y)

## Modelling and Measuring performance with F1 Score(Using XgBoost)

In [29]:

xgb_model = xgb.XGBClassifier(learning_rate=0.02, n_estimators=300)

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,10,20,40]
        }

clf = RandomizedSearchCV(xgb_model, param_distributions=params, 
                         n_iter=10, scoring='f1', cv=4)
clf.fit(X_res, Y_res)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




RandomizedSearchCV(cv=4,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=0.02,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                                           num_parallel_tree=None,
                                           predictor=None, random_state=None,
                                    

In [30]:
print("Average of the best f1-score in various folds during cross validation = ",clf.best_score_)
print("The best parameters found during k-fold cross validation is = ",clf.best_params_)

Average of the best f1-score in various folds during cross validation =  0.9328959563970404
The best parameters found during k-fold cross validation is =  {'subsample': 0.8, 'min_child_weight': 1, 'max_depth': 20, 'gamma': 1, 'colsample_bytree': 0.8}
