In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers
!pip install tensorflow==2.1.0
!pip install simpletransformers
!pip install tokenizers==0.7.0

In [None]:
!export CUDA_HOME=/usr/local/cuda-10.1
!git clone https://github.com/NVIDIA/apex
%cd apex
!pip install -v --no-cache-dir ./

In [None]:
import logging

# Restrict the "transformers" logger to warnings and above, then configure the
# root logger at INFO level.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [None]:
import pandas as pd

# Read the job-postings CSV from the Kaggle input directory and preview the
# first rows.
dataset_path = "/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv"
df = pd.read_csv(dataset_path)
df.head()

Check for Missing Values

In [None]:
# Number of missing values in each column.
df.isna().sum()

Check Real vs Fake class distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Class balance of the target column: print the counts, then plot them.
class_counts = df["fraudulent"].value_counts()
print(class_counts)

# Bar labels and bar heights are both taken from the same Series, so every bar
# is guaranteed to show the count of its own class. Pairing unique() (order of
# first appearance) with value_counts() (sorted by frequency) can mislabel the
# bars. Newer seaborn releases also require x/y to be passed by keyword.
ax = sns.barplot(x=class_counts.index, y=class_counts.values)
ax.set(xlabel="fraudulent", ylabel="count", title="Class distribution");

We see that the data is highly imbalanced in favor of the real class

In [None]:
# Replace every missing value in the frame with a single-space string.
df = df.fillna(' ')

We will concatenate all the available text columns that could influence the prediction into a single field

In [None]:
# Text columns merged into the single 'text' field, in this order.
text_columns = ['title', 'department', 'company_profile', 'description',
                'requirements', 'benefits', 'function']

# Join the columns with single spaces. Written without backslash line
# continuations: the previous expression ended in a dangling "\" that only
# parsed because a blank line happened to follow it.
combined_text = df[text_columns[0]]
for column_name in text_columns[1:]:
    combined_text = combined_text + " " + df[column_name]

# The trailing space is kept so the resulting strings are unchanged.
df['text'] = combined_text + " "

Then we delete all the unnecessary columns

In [None]:
# Columns to discard; only the remaining columns are kept for modelling.
delete_list = [
    'job_id',
    'title',
    'location',
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'department',
    'salary_range',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
]

df = df.drop(columns=delete_list)
df.head()

In [None]:
import spacy, re
# Data cleanup

# The spaCy English model is loaded here only for its per-word stop-word flag.
nlp = spacy.load("en_core_web_sm")


def _clean_text(text):
    """Normalise one posting's text and remove stop words.

    Steps, in order:
      1. line breaks and tabs are replaced by a space,
      2. digits are removed,
      3. punctuation / bracket characters are replaced by a space,
      4. the text is lower-cased,
      5. the text is split on whitespace, stop words are dropped and the
         remaining words are re-joined with single spaces (this also
         collapses any run of repeated spaces).

    Args:
        text: the raw text of one row.

    Returns:
        The cleaned string; non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Use a space (not an empty string) so the words on either side of a line
    # break or tab are not fused into a single token.
    text = re.sub(r'[\n\r\t]', ' ', text)

    # This removes unwanted texts
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'[/(){}\[\]\|@,;.:-]', ' ', text)

    # Converting all upper case to lower case
    text = text.lower()

    # Remove stop words; split()/join also normalises the white space
    return ' '.join(word for word in text.split() if not nlp.vocab[word].is_stop)


df['text'] = df['text'].apply(_clean_text)

In [None]:
from sklearn.model_selection import train_test_split
def _as_two_column_frame(texts, labels):
    """Pack texts and labels into a frame with integer column names 0 and 1."""
    return pd.DataFrame({0: texts, 1: labels})


# Stratified 80/20 split on the fraudulent label with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df.fraudulent,
    test_size=0.20,
    stratify=df.fraudulent,
    random_state=777,
)

train_df = _as_two_column_frame(X_train, y_train)
test_df = _as_two_column_frame(X_test, y_test)

In [None]:
from simpletransformers.classification import ClassificationModel


# Options handed to ClassificationModel through its `args` parameter.
model_args = {
    'overwrite_output_dir': True,
    "train_batch_size": 64,
    "save_steps": 10000,
    "save_model_every_epoch": False,
    'num_train_epochs': 4,
}

# Two-label classifier built from the 'bert-base-uncased' checkpoint; CUDA use is requested.
model = ClassificationModel(
    'bert',
    'bert-base-uncased',
    num_labels=2,
    args=model_args,
    use_cuda=True,
)

In [None]:
# Train the classifier on the training split
# (train_df: column 0 holds the text, column 1 the fraudulent label).
model.train_model(train_df)

In [None]:
# Evaluate the model on the held-out test split (test_df).
# eval_model returns three values, unpacked here as `result`, `model_outputs`
# (consumed later to derive the predicted labels) and `wrong_predictions`.
# NOTE(review): exact contents of each value are not visible here - confirm
# against the simpletransformers documentation.
result, model_outputs, wrong_predictions = model.eval_model(test_df)

In [None]:
import numpy as np
# For every row of model outputs, keep the index of its largest value as the
# predicted label.
preds = []
for row_scores in model_outputs:
    preds.append(np.argmax(tuple(row_scores)))

In [None]:
from sklearn.metrics import f1_score

# F1 of the predictions against the test labels, micro-averaged first and
# macro-averaged second.
for averaging in ('micro', 'macro'):
    print(f1_score(test_df[1], preds, average=averaging))

In [None]:
from sklearn.metrics import classification_report

# Text report comparing the test labels with the predictions.
report_text = classification_report(test_df[1], preds)
print(report_text)