In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

Welcome! In this notebook, I will be going through a complete data science workflow starting with data cleaning, EDA, and various model interations! Enjoy!

**Model Results**

|Model|AUC Score|
|---|---|
|Baseline - Logistic Regression using TFIDF data| 0.58|
|Logistic Regression using Count Vectorizer Data| 0.55|
|KNN| 0.58|
|SVC| 0.53|
|Random Forest| 0.52|
|Neural Network - MLPClassifier w/ 'lbfgs'| 0.69|
|**Neural Network - MLPClassifer w/ 'adam'**|**0.727**|

In [None]:
#importing the holy trinity of data science packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

#Other visualization packages
import seaborn as sns

#Importing NLP plugins
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer 
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Importing our Sklearn Plugins
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#importing our models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

#Model Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Part 1: Data Cleaning

In [None]:
df = pd.read_csv("../input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv")
df.shape

In [None]:
df.head(3)

## Data Dicitonary
There are 17880 rows with 18 features.

|Column/Feature|Discription|
|---|---|
|job_id|Unique Job ID|
|title|The title of the job ad entry.|
|location|Geographical location of the job ad.|
|department|Corporate department (e.g. sales).|
|salary_range|Indicative salary range (e.g. $50,000-$60,000)|
|company_profile|A brief company description.|
|description|The details description of the job ad.|
|requirment|Enlisted requirements for the job opening.|
|benefits|Enlisted offered benefits by the employer.|
|telecommuting|True for telecommuting positions.|
|has_company_logo|True if company logo is present.|
|has_questions|True if screening questions are present.|
|employment_type|Full-type, Part-time, Contract, etc.|
|required_experience|Executive, Entry level, Intern, etc.|
|required_education|Doctorate, Master’s Degree, Bachelor, etc.|
|industry|Automotive, IT, Health care, Real estate, etc.|
|function|Consulting, Engineering, Research, Sales etc.|
|fradulent|target - Classification attribute.|

**Target Variable** = fradulent (1 or 0) with 1 being fradulent

In [None]:
#Checking our Data Types
df.info()

In [None]:
#Check percentage of data missing for each feature/column
df.isna().sum()/len(df)

In [None]:
#Checking for unique elements for each column
df.nunique()

Just from a quick glance my data, it seems that there is quite a few features that have a lot of missing rows. As such, **I will delete the various columns:**

1. job_id because my DataFrame already has a built in index. 
2. salary_range because around 84% of the data is missing
3. department because around 65% of the data is missing
4. benefits because 40% of the data is missing
5. company_profile because I want to combine the description + requirements columns to one features, in order to perform a tfidf vectorizer on it later on. 

The rest of the columns will be filled out in a methedolical order. 

In [None]:
# Always good practice to make a copy of your dataframe ever so often,
# so you can roll back your mistakes much easier than rerunning your whole kernal again.
df_2 = df.copy()
df_2 = df_2.drop(labels = ['job_id','salary_range',
                    'department','benefits',
                    'company_profile'], axis = 1) #axis = 1 to refer droping columns

In [None]:
df_2.tail(3)

Filling missing values for **employment_type**, **required_experience**, **required_education**, **industry**, **function** using the pandas bfill function. I did this because these features had the fewest unique elements for a non-binary feature.

> *Pandas bfill is a function that is used with the fillna function to back fill the values in a dataframe. Thus, if there is a NaN cell then bfill will replace that NaN value with the next row or column based on the axis equaling to 0 or 1.*

In [None]:
df_2['employment_type'] = df_2['employment_type'].bfill(axis=0)
df_2['required_experience'] = df_2['required_experience'].bfill(axis = 0)
df_2['required_education'] = df_2['required_education'].bfill(axis = 0)
df_2['industry'] = df_2['industry'].bfill(axis = 0)
df_2['function'] = df_2['function'].bfill(axis = 0)

Next step is to append the *description* column and *requirments* column together into one column. However, before I do this, **I want to avoid the NaN values in both of these columns.** In order to do so since there is a small number of missing rows in the description column, I will drop those rows first. From there, I will fill in all NaN values in the *requirments* column with " " aka. blank string. 

In addition I will drop duplicated description columns as well, prior to the great concatenation. 

In [None]:
# Make Dataframe copy
df_3 = df_2.copy()

# Keeping non NaN rows in my new dataframe
df_3 = df_3[df_3['description'].notna()]

# Replacing NaNs with an empty string.
#df_3 = df_3.replace(np.nan, '', regex = True)

In [None]:
# For good measure let's drop any other Nans 
df_3 = df_3.dropna(axis = 0, how = 'any')

In [None]:
print(f'We currenlty have {len(df_3)} rows. However, let\'s drop duplicates and compare.')

In [None]:
# drop duplicates
df_3 = df_3.drop_duplicates(keep = 'first')

In [None]:
df_3.isna().sum()/len(df)

In [None]:
print(f'After dropping duplicates we have {len(df_3)} rows left. It seems there were 178 duplicate rows.')

In [None]:
# Make copy
df_4 = df_3.copy()

#concatenating our description and requirments columns
df_4['description'] = df_4['description'] + ' ' + df_4['requirements']
del df_4['requirements']

In [None]:
#Clean DataFrame
df_clean = df_4.copy()

display(df_clean.head(7))
print(df_clean.shape)

# Part 2 - Exploratory Data Analysis

In [None]:
#Ploting the Target variable
plt.figure(figsize = (10,5))
sns.countplot(x = df.fraudulent, data = df,palette="Set3")
plt.title('Fradulent (Target Variable) Count')
plt.show()

It is clear that our data is highly imbalanced. This may cause some difficulties when modeling with highly imbalanced data. 

In [None]:
#Stylistic Set
sns.set(style="whitegrid")

plt.figure(figsize = (14,11))

#fig 1
plt.subplot(2,2,1)
sns.countplot(y = df.employment_type, data = df,palette="Set3", 
              order = df.employment_type.value_counts().index)
plt.title("Employment Type Count")
plt.ylabel("")

#fig2
plt.subplot(2,2,2)
#matplotlib version
#df.required_experience.value_counts().plot(kind='barh')
#sns version
sns.countplot(y = df.required_experience, data = df,palette="Set3",
             order = df.required_experience.value_counts().index)
plt.title("Required Experience Count")
plt.ylabel("")

#fig 3
plt.subplot(2,2,3)
sns.countplot(y = df.required_education, data = df,palette="Set3",
             order = df.required_education.value_counts().index)
plt.title("Required Education Count")
plt.ylabel("")

plt.tight_layout()
plt.show()

In [None]:
industry = df.industry.value_counts()[:10]
function = df.function.value_counts()[:10]

plt.figure(figsize = (12,12))

plt.subplot(2,1,1)
industry.plot(kind = 'barh')
plt.title('Top 10 Industries Represented in this Dataset.')
plt.xlabel('Count')

plt.subplot(2,1,2)
function.plot(kind = 'barh')
plt.title('Top 10 Business Functions Represented in this Dataset.')
plt.xlabel('Count')

plt.tight_layout()
plt.show()

### EDA Insights

* Most job offers were Full-time, followed by Contract work.

* Most jobs required an experience of mid-senior level, followed closely by Entry Level and Associate Level. Which is similar. 

* Most education experience required is a Bachelor’s Degree, with very few requiring Master’s Degree. Which signals that **work experience matters more** than education experience, and that the bachelor degree is a piece paper that proves you’ve done something. 

* In this dataset, The top 3 Industries were all tech related.

* The top 3 business functions were Information Technology, Sales, and Engineering.


#### Future Plots
1. A couple of word cloud images, people for some reason love world clouds.
2. Plot of a map, showing the counts of jobs for each country. etc.

# Part 3 - Feature Engineering & Modeling

We need to do some feature engineering. I would like to one hot encode my categorical data, as well as fit a TFIDF Vectorizer to my text data column. Might do a Count Vectorizer as well, and see if that changes anything to my model. In addition, I probably want to fit a PCA to reduce computational time. 

**Next Steps:**

1. One Hot Encode Cateogrical Data
2. Fit in a TFIDF Vectorizer
3. Fit in a Count Vectorizer
4. Determine if using a PCA would help. 

In [None]:
#Make Copy
df_5 = df_clean.copy()

# One Hot Encoding using Pandas get dummies function
columns_to_1_hot = ['employment_type','required_experience','required_education',
                   'industry', 'function']

for column in columns_to_1_hot:
    encoded = pd.get_dummies(df_5[column])
    df_5 = pd.concat([df_5, encoded], axis = 1)


In [None]:
columns_to_1_hot += ['title', 'location']
    
#droping the original columns that we just one hot encoded from
df_5 = df_5.drop(columns_to_1_hot, axis = 1)

In [None]:
df_5.head()

### Handling the description column 

First of all we need to clean up our text data a little bit. Now let us creat some helper funcitons.

In [None]:
def tokenizer(text):
    
    #All characters in this string will be converted to lowercase
    text = text.lower()
    
    #Removing sentence punctuations
    for punctuation_mark in string.punctuation:
        text = text.replace(punctuation_mark,'')
    
    #Creating our list of tokens
    list_of_tokens = text.split(' ')
    #Creating our cleaned tokens list 
    cleaned_tokens = []
    #Intatiating our Lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    #Removing Stop Words in our list of tokens and any tokens that happens to be empty strings
    for token in list_of_tokens:
        if (not token in stop_words) and (token != ''):
            #lemmatizing our token
            token_lemmatized = lemmatizer.lemmatize(token)
            #appending our finalized cleaned token
            cleaned_tokens.append(token_lemmatized)
    
    return cleaned_tokens

## TfidfVectorizer

I will need to run a tfidf vectorizer on our description data, and append the results to our DataFrame. 

In [None]:
df_6 = df_5.copy()

#Instatiating our tfidf vectorizer
tfidf = TfidfVectorizer(tokenizer = tokenizer, min_df = 0.05, ngram_range=(1,3))
#Fit_transform our description 
tfidf_features = tfidf.fit_transform(df_6['description']) #this will create a sparse matrix

In [None]:
#I want to append this sparse matrix to the original pandas Dataframe
tfidf_vect_df = pd.DataFrame(tfidf_features.todense(), columns = tfidf.get_feature_names())

df_tfidf = pd.concat([df_6, tfidf_vect_df], axis = 1)

#Minor Cleaning steps after appending our tfidf results to our Dataframe, we will need to drop the description column. 
df_tfidf = df_tfidf.drop(['description'], axis = 1)
df_tfidf = df_tfidf.dropna()

In [None]:
df_tfidf.head(3)

## Count Vectorizer
Now let's do a similar procedure with a Count Vectorizer, so we can compare the two vectorizers in performance later on.

In [None]:
#Instatiating our CountVectorizer
count_vect = CountVectorizer(tokenizer = tokenizer, min_df = 0.05, ngram_range=(1,3))
#Fit_transform our description 
count_vect_features = count_vect.fit_transform(df_6['description']) #this will create a sparse matrix

count_vect_df = pd.DataFrame(count_vect_features.todense(), columns = count_vect.get_feature_names())

df_count_vect = pd.concat([df_6, count_vect_df], axis = 1)
df_count_vect = df_count_vect.drop(['description'], axis = 1)
df_count_vect = df_count_vect.dropna()

In [None]:
df_count_vect.head(3)

Great, we now have two different dataframes with two different vectorizers preprocessing our description data. I will hold out on the PCA to see if I need it. I will only do it if the modelimg takes too long. 

**I will conduct the following steps:**
1. Logistic Regression w/ Tfidf
2. Logistic Regression w/ Count Vectorizer
3. I will evaluate both models and determine which is better, and for simplicity stake pick the superior vectorizer for the other models I would like to run.

# Model 1 - Logistic Regresion w/ Tfidf

In [None]:
target = df_tfidf.fraudulent
features = df_tfidf.drop(['fraudulent'], axis = 1)

#Spliting our Data into train and holdout sets to test our models
X_train, X_hold, y_train, y_hold = train_test_split(features, target, test_size = 0.1,
                                                    stratify = target, random_state = 42)

In [None]:
#Intatiating our Logistic Regression Model
log_reg = LogisticRegression()
#I want to optimze the C-Value and penalty
c_values = [.00001, .0001, .001, .1, 1, 10, 100, 1000, 10000]
penalty_options = ['l1','l2']

param_grid = dict(C = c_values, penalty = penalty_options)

In [None]:
grid_tfidf = GridSearchCV(log_reg, param_grid= param_grid, cv = 10, scoring = 'roc_auc', n_jobs = -1)

In [None]:
grid_tfidf.fit(X_train, y_train)

In [None]:
print(grid_tfidf.best_score_)
print(grid_tfidf.best_params_)

In [None]:
log_reg_tfidf_pred = grid_tfidf.predict(X_hold)
print(roc_auc_score(y_hold, log_reg_tfidf_pred))
print(classification_report(y_hold, log_reg_tfidf_pred))

Interesting, using our holdout data our logistic regression with the tfidf data, had an AUC score of 0.58. Which is okay, that will be our baseline model. 

# Model 2 - Logistic Regression w/ Count Vectorizer

In [None]:
target_2 = df_count_vect.fraudulent
features_2 = df_count_vect.drop(['fraudulent'], axis = 1)

#Spliting our Data into train and holdout sets to test our models
X_train_2, X_hold_2, y_train_2, y_hold_2 = train_test_split(features_2, target_2, test_size = 0.1,
                                                    stratify = target_2, random_state = 42)

#Intiatiating our previous logistic regression model, using the count vectorizer dataset
grid_count_vect = GridSearchCV(log_reg, param_grid= param_grid, cv = 10, scoring = 'roc_auc', n_jobs = -1)

In [None]:
grid_count_vect.fit(X_train_2, y_train_2)
print(grid_count_vect.best_score_)
print(grid_count_vect.best_params_)

In [None]:
log_reg_pred_2 = grid_count_vect.predict(X_hold_2)
print(roc_auc_score(y_hold_2, log_reg_pred_2))
print(classification_report(y_hold_2, log_reg_pred_2))

The Count Vectorizer did not really improve from my previous model, it did worse by 3 percentage points. The AUC score on our holdout data was 0.55. Thus, I will stick using the tfidf data.

# Model 3 - KNearestNeighbors

In [None]:
# Model - KNearestNeighbors
knn = KNeighborsClassifier()

#The parameters we would like to optimize for
k_range = list(np.arange(2,23,2))
param_grid_knn = dict(n_neighbors=k_range)
print(param_grid_knn)

In [None]:
#Intatiate our knn gridsearch
grid_knn = GridSearchCV(knn, param_grid_knn, cv=10, scoring='roc_auc',
                        n_jobs = -1)

#Fit our grid_knn
grid_knn.fit(X_train, y_train)
print(grid_knn.best_score_)
print(grid_knn.best_params_)

In [None]:
#predicting on our holdout data
knn_pred = grid_knn.predict(X_hold)
#Printing out our evaluation metrics
print(roc_auc_score(y_hold, knn_pred))
print(classification_report(y_hold, knn_pred))

I'm a little bit dissapointed that the knn pediction on the holdout data was around the same too the original logistic regression.

# Model 4 - Support Vector Classification

In [None]:
#Intatiating our SVM model
svc = SVC(kernel = 'linear', gamma = 'auto' )

# I wont use a gridsearch because SVMs usually take a long looong time. I will just use a simple SVC
# and see how it plays out
svc.fit(X_train, y_train)

In [None]:
#predicting our holdout data
svc_pred = svc.predict(X_hold)

#Printing out our evaluation metrics
print(roc_auc_score(y_hold, svc_pred))
print(classification_report(y_hold, svc_pred))

Very Dissapointed that the SVC didn't do better either. Very dissapointed.

# Model 5 - Random Forest

In [None]:
#Instatiating our random forest

rf = RandomForestClassifier()

#The parameters we want to tune with our random forest
n_estimators_range = [1, 2, 4, 8, 16, 32, 64, 100, 200]

param_grid_rf = dict(n_estimators=n_estimators_range)

grid_rf = GridSearchCV(rf, param_grid_rf, cv=10, scoring='roc_auc',
                        n_jobs = -1)

In [None]:
grid_rf.fit(X_train, y_train)
print(grid_rf.best_score_)
print(grid_rf.best_params_)

In [None]:
rf_pred = grid_rf.predict(X_hold)
#Printing out our evaluation metrics
print(roc_auc_score(y_hold, rf_pred))
print(classification_report(y_hold, rf_pred))

Dissapointed.

# Model 6 - Neural Nets - MLPClassifier w/ solver = 'lbfgs' 

In [None]:
#Instatiatie our MLPClassifier
mlp = MLPClassifier(solver='lbfgs', 
                    activation = 'relu',
                   hidden_layer_sizes = (100,50,30), 
                    max_iter = 1000)

In [None]:
mlp.fit(X_train, y_train)

In [None]:
mlp_pred = mlp.predict(X_hold)

#Printing out our evaluation metrics
print(roc_auc_score(y_hold, mlp_pred))
print(classification_report(y_hold, mlp_pred))

Breath of fresh air! Finally a model that shows a **significant improvement from our baseline model.**

# Model 7 - Neural Nets - MLPClassifier w/ solver = 'adam'

In [None]:
#Instatiatie our MLPClassifier
mlp = MLPClassifier(solver='adam', 
                    activation = 'relu',
                   hidden_layer_sizes = (100,50,30), 
                    max_iter = 1000)

In [None]:
mlp.fit(X_train, y_train)

In [None]:
mlp_pred = mlp.predict(X_hold)

#Printing out our evaluation metrics
print(roc_auc_score(y_hold, mlp_pred))
print(classification_report(y_hold, mlp_pred))

Using the adam solver made our model perform even better! **an AUC score of 0.72! **