# Fake Job Description Prediction

## About Data:
This dataset contains about 18K job descriptions, of which about 800 are fake. The data consists of both textual information and meta-information about the jobs. The dataset can be used to build classification models that learn to identify fraudulent job descriptions.

### Importing basic libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Importing dataset 

In [None]:
# Load the job-postings CSV into a DataFrame.
# NOTE(review): absolute, machine-specific Windows path -- adjust it to wherever the CSV lives locally.
data = pd.read_csv("D:/dataforpython/fake_job_postings.csv")

### Overview of data

In [None]:
# Preview the first rows. Printed explicitly because a notebook cell only
# echoes its LAST expression; bare expressions in the middle show nothing.
print(data.head())
data = data.drop(["job_id"],axis=1)      # <----- Dropping job_id column as it is not required.

# Descriptive Statistics
print(data.describe())

# Shape of the data
print(data.shape)

# Info about data. `info` is a method: without the call parentheses the cell
# only shows the bound-method repr instead of the column/dtype summary.
data.info()

### Creating copy of data for EDA

In [None]:
# Independent (deep) copy used only for the EDA plots, so the fill-ins done
# while plotting never touch the frame that feeds the model.
data1 = data.copy(deep=True)

## EDA 
### Fake job postings based on employment type

In [None]:
# Count of real vs. fraudulent postings per employment type.
# (This cell previously plotted required_experience, duplicating the next cell.)
# Postings with no stated employment type are grouped under 'other'.
data1['employment_type'] = data1['employment_type'].fillna(value='other')
plt.figure(figsize=(10,8))
sns.countplot(hue='fraudulent',y='employment_type',data=data1)
plt.title("Fake job postings based on Employment Type")
plt.show()

### Fake job postings based on required experience

In [None]:
# Postings with no stated experience level are grouped under 'other'.
experience_filled = data1['required_experience'].fillna('other')
data1['required_experience'] = experience_filled

# Horizontal bars: one pair (real / fraudulent) per experience level.
plt.figure(figsize=(10, 8))
sns.countplot(data=data1, y='required_experience', hue='fraudulent')
plt.title("Fake job postings based on Required Experience")
plt.show()

### Fake job postings based on Required Education

In [None]:
# Postings with no stated education requirement are grouped under 'other'.
education_filled = data1['required_education'].fillna('other')
data1['required_education'] = education_filled

# Horizontal bars: one pair (real / fraudulent) per education level.
plt.figure(figsize=(10, 8))
sns.countplot(data=data1, y='required_education', hue='fraudulent')
plt.title("Fake job postings based on Required Education")
plt.show()

### Fake job postings based on industry

In [None]:
# Postings with no stated industry are grouped under 'other'.
industry_filled = data1['industry'].fillna('other')
data1['industry'] = industry_filled

# Only the 20 most frequent industries are drawn, most frequent first.
top_industries = data1['industry'].value_counts().head(20).index

plt.figure(figsize=(15, 8))
sns.countplot(data=data1, y='industry', hue='fraudulent', order=top_industries)
plt.title("Fake job postings based on Industry")
plt.show()

### NA values present in the data

In [None]:
# Number of missing values in each column of the modelling frame.
missing_per_column = data.isna().sum()
missing_per_column

The salary_range column has 15012 NA records, and we cannot reasonably impute a salary range,
because different companies may offer different salary ranges.
We therefore drop the salary_range column.

In [None]:
# salary_range is mostly missing, so the whole column is discarded.
data = data.drop(columns=["salary_range"])

### Every column containing text will be joined into a single string so that basic NLP techniques can convert it into numeric features. First, fill the NA values with a blank, i.e. " "

In [None]:
# Replace every missing value with a single space so the text columns can be
# concatenated without producing NaN.
data = data.fillna(" ")

In [None]:
# Sanity check: count the missing values left in each column after the fill.
remaining_missing = data.isna().sum()
remaining_missing

### Joining the strings present in each column together, except for the columns ("telecommuting", "has_company_logo", "has_questions", "fraudulent")

In [None]:
# Text columns merged, in this order, into the single "string" feature.
# 'required_experience' is included: it is dropped together with the other
# text columns afterwards, so leaving it out here silently discarded it.
text_columns = ['title', 'location', 'department', 'company_profile', 'description',
                'requirements', 'benefits', 'employment_type', 'required_experience',
                'required_education', 'industry', 'function']

# Concatenate the columns row by row with a single space between the parts.
data["string"] = data[text_columns[0]].str.cat(data[text_columns[1:]], sep=' ')

In [None]:
# Show the joined text of the row labelled 0 to eyeball the concatenation.
data["string"][0]

### After joining all the columns that contain text, we can drop the individual text columns.

In [None]:
# The individual text columns are no longer needed once "string" exists;
# remove them all in a single call.
joined_text_columns = [
    'title',
    'location',
    'department',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
]
data = data.drop(columns=joined_text_columns)

In [None]:
# First five rows of the reduced frame (remaining columns plus "string").
data.head(n=5)

### Checking if the data is imbalanced

In [None]:
# Rows per class of the target column, to see how imbalanced the data is.
class_counts = data["fraudulent"].value_counts()
class_counts

### Handling imbalanced data (downsampling the majority class and upsampling the minority class)

In [None]:
from sklearn.utils import resample

# Number of rows kept/generated for each class.
n_per_class = 6000

df_majority = data[data['fraudulent'] == 0]
df_minority = data[data['fraudulent'] == 1]

# Downsample the majority class with a seeded random draw (without replacement)
# instead of taking the first rows in file order, which is not a random sample.
df_majority = resample(df_majority, replace=False,
                       n_samples=min(n_per_class, len(df_majority)), random_state=0)

# Upsample the minority class by drawing rows with replacement.
upSample = resample(df_minority, replace=True, n_samples=n_per_class, random_state=0)

# NOTE(review): this oversampling happens before the train/test split further
# down, so copies of the same fraudulent posting can end up in both the train
# and the test set and inflate the reported accuracy. Consider splitting first
# and resampling only the training part.
data = pd.concat([df_majority, upSample])
data['fraudulent'].value_counts()

### Cleaning the text data from data["string"] column

In [None]:
# importing libraries for cleaning
import nltk
import re
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# creating object of porterstemmer, WordNetLemmatizer & stopwords
# NOTE(review): `stemmer` is not used by any cell visible in this notebook.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Read the corpus through `nltk.corpus.stopwords` rather than the bare name
# `stopwords`: this assignment rebinds `stopwords` to a set, so re-running the
# cell with `stopwords.words(...)` would fail with AttributeError.
# NOTE(review): requires the NLTK 'stopwords' resource; no nltk.download()
# call is visible in this notebook -- confirm it is installed.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
# checking length of stop words
# (number of entries in the stop-word set before punctuation is added)
len(stopwords)

In [None]:
# Add every punctuation character to the stop-word set.
# Iterating string.punctuation yields its characters one by one, so the
# string can be passed to update() directly.
stopwords.update(string.punctuation)
punctuation = list(string.punctuation)

In [None]:
# checking length after adding punctuation to stopwords
# (should have grown by the number of punctuation characters added)
len(stopwords)

In [None]:
# After resampling and concatenation the row labels are shuffled and repeated;
# renumber them 0..n-1 and discard the old labels.
data.reset_index(drop=True, inplace=True)

In [None]:
# Cleaning text
# For every row: lower-case, replace every non-letter with a space, tokenize,
# drop stop words, lemmatize, and re-join the tokens into one string.
# The values are iterated directly (no index-label lookup), so the result does
# not depend on the index having been reset. The cleaned documents are not
# printed one by one any more: that flooded the output with one long line per
# document.
non_letters = re.compile(r"[^a-zA-Z]")   # compiled once, reused for every row

doc = []
for raw_text in data["string"]:
    lowered = str(raw_text).lower()
    letters_only = non_letters.sub(" ", lowered)
    tokens = nltk.word_tokenize(letters_only)
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    doc.append(" ".join(kept))

In [None]:
# Number of rows in the frame (should equal the number of cleaned documents).
print(len(data))

In [None]:
# Store the cleaned documents back in the "string" column.
# The list is assigned directly, i.e. positionally: wrapping it in a DataFrame
# made pandas align on the index, which yields NaN whenever the index of
# `data` is not exactly 0..n-1.
data["string"] = doc

## Wordcloud

### Wordcloud where fraudulent = 0

In [None]:
# Creating wordcloud will show most frequently occurring words
import wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Cleaned text of every posting labelled fraudulent == 0.
doc_job_fraud_0 = data.loc[data["fraudulent"] == 0, "string"]

# WordCloud.generate() takes one string, so the postings are merged.
# They are joined with a space: joining with "" glued the last word of one
# posting to the first word of the next, creating words that never occurred.
doc_job_fraud_0 = " ".join(doc_job_fraud_0)

wordcloud = WordCloud(width=1000,height= 500,relative_scaling=1.0,max_words=3500,
                      background_color='black',
                      stopwords=stopwords,
                      min_font_size=10).generate(doc_job_fraud_0)

# plot the WordCloud image
plt.figure(figsize=(15,10), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()              # Shows the word cloud built from the fraudulent == 0 postings

### Wordcloud where fraudulent = 1

In [None]:
# Creating wordcloud will show most frequently occurring words
import wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Cleaned text of every posting labelled fraudulent == 1.
doc_job_fraud_1 = data.loc[data["fraudulent"] == 1, "string"]

# WordCloud.generate() takes one string, so the postings are merged.
# They are joined with a space: joining with "" glued the last word of one
# posting to the first word of the next, creating words that never occurred.
doc_job_fraud_1 = " ".join(doc_job_fraud_1)

wordcloud = WordCloud(width=1000,height= 500,relative_scaling=1.0,max_words=3500,
                      background_color='black',
                      stopwords=stopwords,
                      min_font_size=10).generate(doc_job_fraud_1)

# plot the WordCloud image
plt.figure(figsize=(15,10), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()              # Shows the word cloud built from the fraudulent == 1 postings

### Creating Independent & Target variable

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned documents, limited to 30000 terms.
# NOTE(review): .toarray() materialises a dense (n_documents x n_terms)
# matrix, which can be very large in memory.
cv = CountVectorizer(max_features=30000)
df = cv.fit_transform(doc).toarray()
print(df.shape)

# converting to DataFrame for concatenating (optional)
# The index is taken from `data` so the column-wise concat below lines the
# rows up explicitly instead of relying on both frames being numbered 0..n-1.
x_df = pd.DataFrame(df, index=data.index)

# Independent variables: remaining non-text columns followed by the term counts.
x = pd.concat([data.drop(["string","fraudulent"],axis=1),x_df],axis=1)

# The concat mixes string column names with the integer names 0..k-1 of the
# count matrix; recent scikit-learn versions refuse DataFrames whose column
# names are not all strings, so every name is converted to str.
x.columns = x.columns.astype(str)
print(x.shape)

# Target variable
y = data["fraudulent"]

### Splitting data into Train and Test (70:30)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.30, random_state=0)
x_train, x_test, y_train, y_test = split_parts

### Training model with Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, fitted on the training split.
classifier = MultinomialNB()
classifier.fit(X=x_train, y=y_train)

### Predicting test set results

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Predicted labels for the held-out rows.
y_pred = classifier.predict(x_test)

# Printed explicitly: as a bare expression in the middle of the cell the
# confusion matrix was computed but never displayed.
print(confusion_matrix(y_test,y_pred))

# Accuracy is computed once and reused in the message.
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy with MultinomialNB is {accuracy}")