In [None]:
import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)



In [None]:
#general imports

from collections import Counter
from collections import defaultdict
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd
from PIL import Image
import re
import seaborn as sns
import string
from wordcloud import WordCloud

#SKL imports
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

#NLTK imports
import nltk
from nltk.collocations import *
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('names')

# Explore data

In [None]:
# Load the job postings and tabulate, per column, the percentage of missing values
# (sorted from the most complete column to the least complete one).
job_data = pd.read_csv('../input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv', sep=',')
n_postings = len(job_data)
percent_missing_values = job_data.isnull().sum() * 100 / n_postings
missing_values_df = (
    pd.DataFrame({'percent_missing': percent_missing_values})
    .sort_values('percent_missing')
)
missing_values_df

At first glance, quite a few variables have a large percentage of NaNs. Missing values in the text columns make any valid NLP hard. For that reason, I will only look at description, requirements and company profile, plus the variables without missing values.

In [None]:
# Preview the first rows of the raw postings to get a feel for the columns.
job_data.head()

In [None]:
# Report how many distinct values each categorical column takes.
# `.unique()` keeps NaN as a value of its own, so a column with missing
# entries counts one extra "category".
categorical_columns = (
    ('functions', 'function'),
    ('locations', 'location'),
    ('departments', 'department'),
    ('employment types', 'employment_type'),
    ('required education', 'required_education'),
    ('industries', 'industry'),
)
for label, column in categorical_columns:
    print(f'Number of unique {label} in this dataset {len(job_data[column].unique())}')

# Clean text columns and create bigram columns

In [None]:
# Compiled once: matches any ASCII punctuation character or a carriage
# return / tab / newline. Every match is replaced by a single space.
_PUNCT_AND_CONTROL_RE = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')


def clean_text(text):
    """Lower-case a text, replace punctuation with spaces and lemmatize it.

    Parameters
    ----------
    text : str or None
        Raw text. Non-string values are converted with ``str()`` first.

    Returns
    -------
    str or None
        The lemmatized tokens joined by single spaces, or ``None`` when
        ``text`` is ``None`` or a float NaN (pandas' missing-value marker),
        so the function can be applied to a column that still has gaps.
    """
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return None

    # Convert before lower-casing so that non-string input cannot raise.
    no_punct = _PUNCT_AND_CONTROL_RE.sub(" ", str(text).lower())

    # NOTE(review): an earlier version also built an ASCII-only copy of the
    # text ('[^A-Za-z0-9]+' -> ' ') but never used it; tokenisation has always
    # run on the punctuation-stripped text, so non-ASCII characters are kept.
    # That behaviour is preserved here - confirm whether it is intended.
    lemmatizer = WordNetLemmatizer()
    word_list = nltk.word_tokenize(no_punct)
    return ' '.join(lemmatizer.lemmatize(w) for w in word_list)

def _clean_text_column(source, column, clean_column, bigram_column):
    """Build the cleaned-text frame for one free-text column.

    Keeps ``job_id``, ``column`` and ``fraudulent`` for the rows where
    ``column`` is present, then adds ``clean_column`` (output of
    ``clean_text``) and ``bigram_column`` (list of word bigrams of the
    cleaned text, split on single spaces).
    """
    # dropna returns a new frame, so the column assignments below do not
    # write into a view of `source`.
    frame = source[['job_id', column, 'fraudulent']].dropna(how='any', subset=[column])
    frame[clean_column] = frame[column].apply(clean_text)
    frame[bigram_column] = frame[clean_column].apply(lambda row: list(nltk.ngrams(row.split(' '), 2)))
    return frame


# Number of missing columns per posting (used later as a feature).
job_data['nan_count'] = job_data.isnull().sum(axis=1)

#clean description
job_description = _clean_text_column(job_data, 'description', 'clean_description', 'bigrams')
#clean company profile
job_cp = _clean_text_column(job_data, 'company_profile', 'clean_company_profile', 'bigrams_cp')
#clean requirements
job_req = _clean_text_column(job_data, 'requirements', 'clean_requirements', 'bigrams_r')

# Combine the three frames. The label is part of the join key: merging on
# job_id alone and dropping the right-hand label left `fraudulent` empty for
# postings that only exist in the right-hand frame (e.g. a posting without a
# description but with a company profile), which in turn made astype(int) fail.
job_description = pd.merge(job_description, job_cp, on=['job_id', 'fraudulent'], how='outer')
job_description = pd.merge(job_description, job_req, on=['job_id', 'fraudulent'], how='outer')
job_description['fraudulent'] = job_description['fraudulent'].astype(int)

In [None]:
# Preview the merged frame holding the raw text, cleaned text and bigram columns.
job_description.head()

# Look at single grams

In [None]:
#look at the frequency distribution of single words
stop_words = stopwords.words('english')
word_count_all = Counter()
word_count_fraud = Counter()

# The outer merges can leave postings without a cleaned description (NaN);
# those rows are skipped because Counter.update cannot consume a float.
descriptions_all = job_description['clean_description'].dropna()
descriptions_fraud = job_description.loc[job_description['fraudulent'] == 1, 'clean_description'].dropna()

for tokens in descriptions_all.str.split():
    word_count_all.update(tokens)
for tokens in descriptions_fraud.str.split():
    word_count_fraud.update(tokens)

for word in stop_words:#delete all words that are in the stopword list
    # Deleting a missing key from a Counter is a no-op, so no membership check is needed.
    del word_count_all[word]
    del word_count_fraud[word]

f, axarr = plt.subplots(1,2, figsize=(20,20))

# Titles are set on the individual axes: plt.title() only targets the
# current (last created) axes, which left the first panel untitled.
wc_all = WordCloud(max_words=30,background_color="white").generate_from_frequencies(word_count_all)
axarr[0].imshow(wc_all)
axarr[0].set_title("Wordcloud all")

wc_fraud = WordCloud(max_words=30,background_color="white").generate_from_frequencies(word_count_fraud)
axarr[1].imshow(wc_fraud)
axarr[1].set_title("Wordcloud fraud ")

plt.show()


Well, the single words do not seem to show anything out of the ordinary, at least not in the description. So I will have a look at the bigrams and hopefully find some indicators for fraudulent job postings.

# Bigrams in the description field and company profile field

In [None]:
def _flatten_bigrams(bigram_column):
    """Concatenate the per-posting bigram lists of a column into one flat list.

    Rows without a bigram list (NaN, e.g. postings that entered the merged
    frame without that text field) are skipped.
    """
    return [bigram for row in bigram_column.dropna() for bigram in row]


# Boolean masks for the label in each of the frames used below.
description_is_fraud = job_description['fraudulent'] == 1
description_is_real = job_description['fraudulent'] == 0
cp_is_fraud = job_cp['fraudulent'] == 1
cp_is_real = job_cp['fraudulent'] == 0
req_is_fraud = job_req['fraudulent'] == 1
req_is_real = job_req['fraudulent'] == 0

# Description bigrams: all postings, real postings, fraudulent postings.
bigram_list_all = _flatten_bigrams(job_description['bigrams'])
bigram_dict_all = Counter(bigram_list_all)

bigram_list_non_fraud = _flatten_bigrams(job_description.loc[description_is_real, 'bigrams'])
bigram_dict_non_fraud = Counter(bigram_list_non_fraud)

bigram_list_fraud = _flatten_bigrams(job_description.loc[description_is_fraud, 'bigrams'])
bigram_dict_fraud = Counter(bigram_list_fraud)

# Company-profile bigrams.
bigram_list_non_fraud_cp = _flatten_bigrams(job_cp.loc[cp_is_real, 'bigrams_cp'])
bigram_dict_non_fraud_cp = Counter(bigram_list_non_fraud_cp)

bigram_list_fraud_cp = _flatten_bigrams(job_cp.loc[cp_is_fraud, 'bigrams_cp'])
bigram_dict_fraud_cp = Counter(bigram_list_fraud_cp)

# Requirements bigrams.
bigram_list_non_fraud_r = _flatten_bigrams(job_req.loc[req_is_real, 'bigrams_r'])
bigram_dict_non_fraud_r = Counter(bigram_list_non_fraud_r)

bigram_list_fraud_r = _flatten_bigrams(job_req.loc[req_is_fraud, 'bigrams_r'])
bigram_dict_fraud_r = Counter(bigram_list_fraud_r)


In [None]:
#print bigrams in description of non fraudulent postings
# Shows the 25 most frequent bigrams in which neither word is a stopword.
# most_common() yields (bigram, count) pairs from most to least frequent.
counter = 0
for element in bigram_dict_non_fraud.most_common():
    first_word, second_word = element[0]
    if first_word in stop_words or second_word in stop_words:
        continue
    print(element)
    counter += 1
    if counter >= 25:
        # Stop instead of scanning the remaining bigrams for nothing.
        break

In [None]:
#print bigrams in description of fraudulent postings
# Shows the 25 most frequent bigrams in which neither word is a stopword.
# most_common() yields (bigram, count) pairs from most to least frequent.
counter = 0
for element in bigram_dict_fraud.most_common():
    first_word, second_word = element[0]
    if first_word in stop_words or second_word in stop_words:
        continue
    print(element)
    counter += 1
    if counter >= 25:
        # Stop instead of scanning the remaining bigrams for nothing.
        break
            
#bigram_dict_non_fraud[('data', 'entry')]

By checking the most frequent bigrams, I discovered some that are only used in fraudulent text. The bigram 'Aker Solution' only appeared in fraudulent job postings; this is actually the firm name. A second bigram that pops up is '6 ultra', or its follow-up 'ultra luxury', which is used by a supposedly American cruise company. The third bigram I found was 'application onlyclick', which is an error, since 'only' and 'click' should have been written as two separate words. This type of error, where two words are run together, is very common and probably happened during the web crawl or when writing the data to a CSV. The description of this job is very suspicious and revealed another firm that only posts fraudulent jobs. Closely related to this bigram is another one, namely 'data entry', but this bigram is also found in non-fraudulent job postings, although relatively seldom. We might find something that has more discriminating power.

In [None]:
#print bigrams in company profile of non fraudulent postings
# Shows the 25 most frequent bigrams in which neither word is a stopword.
# most_common() yields (bigram, count) pairs from most to least frequent.
counter = 0
for element in bigram_dict_non_fraud_cp.most_common():
    first_word, second_word = element[0]
    if first_word in stop_words or second_word in stop_words:
        continue
    print(element)
    counter += 1
    if counter >= 25:
        # Stop instead of scanning the remaining bigrams for nothing.
        break

In [None]:
#print bigrams in company profile of fraudulent postings
# Shows the 25 most frequent bigrams in which neither word is a stopword.
# most_common() yields (bigram, count) pairs from most to least frequent.
counter = 0
for element in bigram_dict_fraud_cp.most_common():
    first_word, second_word = element[0]
    if first_word in stop_words or second_word in stop_words:
        continue
    print(element)
    counter += 1
    if counter >= 25:
        # Stop instead of scanning the remaining bigrams for nothing.
        break

Here again, a few very useful bigrams could be detected. The first bigram, which is used in 56 fake job postings and 0 real job postings, is 'signing bonus'. Apparently, mentioning this in the company profile is not a sign of a serious posting, or at least it is not done in the real job postings in this dataset. It also sounds like a way to attract people to apply. Furthermore, the bigrams 'represented candidate' and 'solid geographical' are only used in fake job postings.

In the requirements, a few bigrams were also spotted that are quite strong discriminators. 'amp personal' is a weird bigram, but it occurs mostly in fraudulent job postings. It definitely has something to do with HTML parsing, but it can still be seen as a 'pattern'. 'data entry' is also found here; it is a skill that is not often mentioned in non-fraudulent job postings. Another skill that is probably related to data entry is typing skill, which is simply not something that is written in a real job posting. Below, we will 'translate' these bigram features into features a machine learning algorithm can interpret. I excluded the code for printing the requirements bigrams since, as we will see below, the bigrams in the description and especially in the company profile already have the strongest relation to the target.

In [None]:
# let's check how many fraudulent job postings can be found only using these bigram rules:
# NOTE(review): the mask below is not restricted to fraudulent == 1, so the
# number counts every posting that matches a rule - confirm this is intended.
bigram_rules = [
    ('clean_company_profile', 'signing bonus'),
    ('clean_company_profile', 'represented candidate'),
    ('clean_company_profile', 'solid geographical'),
    ('clean_description', 'aker solution'),
    ('clean_description', 'good english'),
]

rule_hits = pd.Series(False, index=job_description.index)
for column, phrase in bigram_rules:
    # regex=False: the phrases are plain substrings, not patterns.
    # na=False: a posting without that text field simply does not match.
    rule_hits |= job_description[column].str.contains(phrase, regex=False, na=False)

int(rule_hits.sum())

Actually not that bad: by only introducing 5 rules or one-hot encoded variables, any classifier would find at least 181 fraudulent job postings. So the baseline model (which in this case is rule-based) actually reaches an accuracy of (17014 (number of real job postings) + 181 (rule-based identifiable fake job postings)) / 17880 = 96.2%. But, as mentioned before in other kernels, accuracy is an inappropriate measure of how good a model is when the dataset is highly unbalanced. We will look at precision and recall when comparing algorithms. Recall on fraudulent job postings in particular should be the main target, without sacrificing too much precision.

# Prepare dataframe for ML

In [None]:
def contains_bigram(description):
    """Return 1 if the description mentions one of the flagged phrases, else 0.

    The check is a case-insensitive substring search for 'aker solution'
    or 'good english'.
    """
    lowered = description.lower()
    flagged = 'aker solution' in lowered or 'good english' in lowered
    return 1 if flagged else 0
    
def contains_data_entry(description):
    """Return 1 if 'data entry' occurs in the description (case-insensitive), else 0."""
    return 1 if 'data entry' in description.lower() else 0

def cp_contains_bigram(description):
    """Return 1 if a company profile contains one of the flagged phrases, else 0.

    The phrases ('signing bonus', 'represented candidate',
    'solid geographical') are matched as case-insensitive substrings.
    """
    lowered = description.lower()
    flagged_phrases = ('signing bonus', 'represented candidate', 'solid geographical')
    return 1 if any(phrase in lowered for phrase in flagged_phrases) else 0

def bigram_r(requirements):
    """Return 1 if the requirements text contains one of the flagged phrases, else 0.

    Matching is a case-insensitive substring search.
    """
    lowered = requirements.lower()
    flagged_phrases = (
        'typing skill ability',  # yes, cheated here. I found this trigram while looking at the bigram
        'data entry',
        'qualification amp personal',
    )
    return 1 if any(phrase in lowered for phrase in flagged_phrases) else 0
    
def cap_word_count(description):
    """Return 1 if the text has more than two runs of 4+ consecutive capital letters, else 0."""
    capital_runs = re.findall(r'([A-Z]{4,})', description)
    return 1 if len(capital_runs) > 2 else 0

def descr_length(lenght):
    """Return 1 if the given length is at most 150, else 0."""
    return int(lenght <= 150)

def nan_counter(count):
    """Return 1 if the count is five or more, else 0."""
    return 1 if count >= 5 else 0

def bigram_occurence(bigrams, excluded_words=None):
    """Count the bigrams in which neither word is an excluded word.

    Parameters
    ----------
    bigrams : iterable of 2-item sequences
        The (first word, second word) pairs of one text.
    excluded_words : iterable of str, optional
        Words that disqualify a bigram. Defaults to the notebook-level
        ``stop_words`` list, which must then already be defined.

    Returns
    -------
    list of ((str, str), int)
        ``(bigram, count)`` pairs sorted by descending count; bigrams with
        equal counts keep the order in which they were first seen.
    """
    if excluded_words is None:
        excluded_words = stop_words
    # A set makes each membership test constant-time instead of a list scan.
    excluded = set(excluded_words)

    counts = Counter(
        (first, second)
        for first, second in bigrams
        if first not in excluded and second not in excluded
    )
    return counts.most_common()

def non_stopword_percentage(bigram_occ, bigrams):
    """Return the summed counts in ``bigram_occ`` as a rounded percentage of ``len(bigrams)``.

    ``bigram_occ`` is a sequence of ``(bigram, count)`` pairs. Returns the
    integer 0 when ``bigrams`` is empty, otherwise a float rounded to zero
    decimals.
    """
    num_bigrams = len(bigrams)
    if num_bigrams == 0:
        return 0
    kept_total = sum(entry[1] for entry in bigram_occ)
    return round(kept_total / num_bigrams * 100, 0)

def repeated_bigrams(bigram_occ):
    """Return how many ``(bigram, count)`` entries have a count greater than one."""
    return sum(1 for entry in bigram_occ if entry[1] > 1)
    
def rep_classes(repetition):
    """Bin a repetition count into class 1 (<= 3), 2 (== 4) or 3 (> 4).

    Values that satisfy none of the three comparisons (e.g. NaN) yield None.
    """
    if repetition > 4:
        return 3
    if repetition == 4:
        return 2
    if repetition <= 3:
        return 1
    return None
    
    
# Build one feature frame per text column (description, company profile,
# requirements) and join them back onto the raw postings.
#
# NOTE(review): the phrase features below are matched against the raw text,
# whereas the phrases themselves were read off bigrams of the cleaned,
# lemmatized text - confirm that every phrase can actually occur in raw text.


def _raw_bigrams(text):
    """Split a text on single spaces and return its word bigrams as a list."""
    return list(nltk.ngrams(text.split(' '), 2))


# Value used for a missing feature: 2 for the 'binary' variables, 4 for the categorical ones.
MISSING_FEATURE_CODES = {
    'contains_bigram': 2,
    'contains_bigram_cp': 2,
    'contains_data_entry': 2,
    'length_150': 2,
    'cap_word_count': 2,
    'contains_bigram_r': 2,
    'rep_class': 4,
    'rep_class_cp': 4,
}
BINARY_FEATURE_COLUMNS = [
    'contains_bigram',
    'contains_bigram_cp',
    'contains_data_entry',
    'length_150',
    'cap_word_count',
    'contains_bigram_r',
]

job_data['nan_5'] = job_data['nan_count'].apply(nan_counter)

#--------------------------------------------------------------------------------------------------------------------
# description features
description_var = job_data[['job_id', 'description']].dropna(how='any', subset=['description'])
description_var['char_length'] = description_var['description'].str.len()
description_var['length_150'] = description_var['char_length'].apply(descr_length)
description_var['contains_bigram'] = description_var['description'].apply(contains_bigram)
description_var['contains_data_entry'] = description_var['description'].apply(contains_data_entry)
description_var['cap_word_count'] = description_var['description'].apply(cap_word_count)
description_var['bigrams'] = description_var['description'].apply(_raw_bigrams)
description_var['bigram_occ'] = description_var['bigrams'].apply(lambda pairs: bigram_occurence(pairs))
description_var['non_stopword_bigrams%'] = description_var.apply(
    lambda row: non_stopword_percentage(row.bigram_occ, row.bigrams), axis=1
)
description_var['repetition'] = description_var['bigram_occ'].apply(repeated_bigrams)
description_var['rep_class'] = description_var['repetition'].apply(rep_classes)

#--------------------------------------------------------------------------------------------------------------------
# company profile features
cp_var = job_data[['job_id', 'company_profile']].dropna(how='any', subset=['company_profile'])
cp_var['char_length_cp'] = cp_var['company_profile'].str.len()
cp_var['contains_bigram_cp'] = cp_var['company_profile'].apply(cp_contains_bigram)
cp_var['bigrams_cp'] = cp_var['company_profile'].apply(_raw_bigrams)
cp_var['bigram_occ_cp'] = cp_var['bigrams_cp'].apply(lambda pairs: bigram_occurence(pairs))
cp_var['non_stopword_bigrams_cp%'] = cp_var.apply(
    lambda row: non_stopword_percentage(row.bigram_occ_cp, row.bigrams_cp), axis=1
)
cp_var['repetition_cp'] = cp_var['bigram_occ_cp'].apply(repeated_bigrams)
cp_var['rep_class_cp'] = cp_var['repetition_cp'].apply(rep_classes)

#--------------------------------------------------------------------------------------------------------------------
# requirements features
r_var = job_data[['job_id', 'requirements']].dropna(how='any', subset=['requirements'])
r_var['r_length'] = r_var['requirements'].str.len()
r_var['contains_bigram_r'] = r_var['requirements'].apply(bigram_r)
r_var['bigrams_r'] = r_var['requirements'].apply(_raw_bigrams)
r_var['bigram_occ_r'] = r_var['bigrams_r'].apply(lambda pairs: bigram_occurence(pairs))
r_var['non_stopword_bigrams_r%'] = r_var.apply(
    lambda row: non_stopword_percentage(row.bigram_occ_r, row.bigrams_r), axis=1
)
r_var['repetition_r'] = r_var['bigram_occ_r'].apply(repeated_bigrams)

#--------------------------------------------------------------------------------------------------------------------
# Join the three feature frames onto the postings; postings lacking a text
# field get NaN features, which are then replaced by the codes defined above.
result = job_data.merge(description_var, on='job_id', how='outer')
result = result.merge(cp_var, on='job_id', how='outer')
result = result.merge(r_var, on='job_id', how='outer')
result = result.fillna(MISSING_FEATURE_CODES)
result[BINARY_FEATURE_COLUMNS] = result[BINARY_FEATURE_COLUMNS].astype(int)

In the above cell I tried to translate some ideas on possible features into features that an ML algorithm can interpret. The first four functions check whether the previously identified bigrams are in the description, company profile or requirements text. Another feature that I thought could be interesting is the number of capitalised words <span style="color:blue">(cap_word_count)</span>. Since the word URL is contained in almost all descriptions, I only looked for words containing at least 4 characters. Since fraudulent job postings should attract as many people as possible, maybe they use more capitalisation to do so. I looked at the histograms and did not really find a pattern, but maybe somebody finds an appropriate binning strategy. Furthermore, I checked the length of the description <span style="color:blue">(descr_length)</span> and the number of columns per row with NaN <span style="color:blue">(nan_counter)</span>. I couldn't really find a pattern, so I made a boolean based on the histograms. These variables did quite a poor job in the prediction. But maybe somebody gets another creative idea, which is why I left them in this notebook. In the function <span style="color:blue">(bigram_occurence)</span> I looked at the number of bigrams which do not contain a stopword. This column is then used in the function <span style="color:blue">(non_stopword_percentage)</span> to compute how high the percentage of non-stopword bigrams is relative to all bigrams in the text. The histogram showed an almost identical pattern for fraudulent and non-fraudulent job postings. The last functions, <span style="color:blue">(repeated_bigrams)</span> & <span style="color:blue">(rep_classes)</span>, check how often bigrams are repeated, since I thought fraudulent job postings may be less creative and use the same phrases/words more often.
This last variable, repetition, is actually something interesting, since it would be a more global pattern. By that I mean that the contains_bigram functions are really good for this dataset, but would have absolutely no chance on datasets in different languages, whereas the repetition variable could also work in different languages. I did find a pattern, but it is not very strong. Nevertheless, I think it is worth using the variable, and maybe somebody finds a better binning strategy.

# Logistic Regression and Random Forest Classifier

In [None]:
#check influence categorical variable on target variable
# Fit a one-feature logistic regression on rep_class_cp and display its coefficient.
x = result[['rep_class_cp']].to_numpy()
clf = LogisticRegression(random_state=0)
clf.fit(x, result['fraudulent'])
clf.coef_

In [None]:
#Pick variables that could be used
# Keep only the target and the candidate feature columns.
model_columns = [
    'telecommuting',
    'has_company_logo',
    'fraudulent',
    'contains_bigram',
    'contains_data_entry',
    'contains_bigram_cp',
    'cap_word_count',
    'contains_bigram_r',
    'rep_class_cp',
    'rep_class',
]
result = result.loc[:, model_columns]


In [None]:
#do a simple logistic regression, just to see how it performs
# Hold out 25% of the postings for testing, fit on the rest, and report
# per-class precision and recall on the held-out part.
feature_columns = [
    'contains_bigram',
    'contains_data_entry',
    'contains_bigram_cp',
    'contains_bigram_r',
    'rep_class_cp',
    'rep_class',
]
X_train, X_test, y_train, y_test = train_test_split(
    result[feature_columns],
    result['fraudulent'],
    test_size=0.25,
    random_state=109,
)

clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

So the performance of a simple logistic regression is not that bad, with a recall of .327 on fraudulent job postings (rule-based = 181/866 = .209). But it is far from great.

In [None]:
#Use the grid search to find the best parameters for the rfc
rfc = RandomForestClassifier(random_state=24)

# 'auto' is no longer listed for max_features: for a random forest classifier
# it was an alias of 'sqrt' (so it only duplicated grid points), and recent
# scikit-learn releases reject it.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'criterion': ['gini'],
}

# 5-fold cross-validated search over every combination in param_grid,
# scored with the estimator's default scorer.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)

CV_rfc.best_params_

In [None]:
# Fit a random forest with fixed hyper-parameters, report per-class metrics
# on the held-out set and list each feature with its importance.
clf = RandomForestClassifier(
    n_estimators=150,
    criterion='gini',
    max_depth=4,
    n_jobs=2,
    random_state=0,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))
feature_importances = list(zip(X_train.columns, clf.feature_importances_))
print(feature_importances)

So the random forest does a better job, but in terms of recall it is only slightly better than the logistic regression. I am well aware that there are many more parameters to tune, but my foremost goal was to think about possible variables and see how they perform in simple ML algorithms. From my perspective, the most informative column is actually the company profile column, since I found the best discriminating variables by using this column. This is a surprise to me, since I expected to find the best information in the description column.

Thanks for reading this notebook, if you find any error, have any questions or any other feedback, please let me know!