In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset
df = pd.read_csv('fake_job_postings.csv')

In [3]:
df['fraudulent'].value_counts()

fraudulent
0    17014
1      866
Name: count, dtype: int64

In [5]:
# # Now lets see how many jobs posted are fraud and real.
# sns.countplot(df.fraudulent)
# df.groupby('fraudulent').count()['title'].reset_index().sort_values(by='title',ascending=False)

In [4]:
# Define the IT-related keywords
it_keywords = ['Computer Software', 'Information Technology and Services', 'Online Media', 'Internet', 'Telecommunications']

In [5]:
df.head(5)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [6]:
df.drop('job_id' , axis = 1 ,inplace = True)

In [7]:
df.head(2)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [8]:
# df['red_flags'] = df['title'] + " " + df['location'] + " " + df['department']+ " " + str(df['salary_range']) + " " + df['company_profile'] + " " + df['description'] + " " + df['requirements'] + " " + df['benefits'] + " " + df['employment_type'] + " " + df['required_experience'] + " " + df['required_education'] + " " + df['industry'] + " " + df['function']

In [9]:
df.isnull().sum()

title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [10]:
# Drop rows with missing values in the 'industry' column
df = df.dropna(subset=['industry'])

In [11]:
df['fraudulent'].value_counts()

fraudulent
0    12386
1      591
Name: count, dtype: int64

In [12]:
# df['location'] = df['location'].fillna(value = df['location'].value_counts().index[0] )

In [13]:
# Fill NaN values based on the 'fraudulent' column
df.loc[(df['location'].isna()) & (df['fraudulent'] == 0), 'location'] = df['location'].value_counts().index[0]
df.loc[(df['location'].isna()) & (df['fraudulent'] == 1), 'location'] = 'no available data'


In [14]:
df['department'].unique()

array(['Success', 'Sales', nan, ..., 'Programming', 'Admin - Clerical',
       'Administrative Dept'], dtype=object)

In [15]:
for idx in (df[df['department'].isna()]['title'].index) :
    if 'Marketing' in df.at[idx ,'title' ] :
        df.at[idx , 'department'] = 'Marketing'
    elif 'Sales' in df.at[idx ,'title' ] :
        df.at[idx , 'department'] = 'Sales'
    elif ('Accountant' in df.at[idx ,'title' ])|('Accounting' in df.at[idx ,'title' ] ) :
        df.at[idx , 'department'] = 'Accounting'
    elif ('Engineer' in df.at[idx ,'title' ] )|('Engineering' in df.at[idx ,'title' ] ) :
        df.at[idx , 'department'] = 'Engineering'
    else :
        df.at[idx , 'department'] = df.at[idx , 'title']
    

In [16]:
for idx in (df['salary_range'].dropna()).index :
    Range = df.at[idx , 'salary_range'].split('-')
    try :
        start = int(Range[0])
        if start < 1000 :
            df.at[idx ,'salary_range' ] = 0
        else :
            df.at[idx ,'salary_range' ] = start
            
    except ValueError :
        df.at[idx ,'salary_range' ] = 0
        
df['salary_range'] = df['salary_range'].fillna(0)

In [17]:
df.head(1)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [18]:
df[['company_profile' , 'description','requirements','benefits']] = df[['company_profile' , 'description','requirements','benefits']].fillna('no available data')

In [19]:
df[['industry' , 'function']] = df[['industry' , 'function']].fillna('')
df['job_field'] = df['title'] +' '+ df['department'] +' '+ df['industry'] +' '+ df['function']
df.drop(['title','department','industry','function'] , axis = 1 , inplace = True )

In [22]:
# Filter the dataset based on job titles containing IT-related keywords
# filtered_df = df[df['industry'].str.contains('|'.join(it_keywords), case=False)]

In [23]:
# Print the filtered dataset
# filtered_df

In [20]:
df[['employment_type','required_experience','required_education']] = df[['employment_type','required_experience','required_education']].fillna('Other')

In [21]:
df['Requirements'] = df['requirements'] +' ' + df['required_experience'] +' ' + df['required_education']
df.drop(['requirements','required_experience','required_education'],axis =  1 ,inplace = True )

In [22]:
# Mapping conversion
df['telecommuting'] = df['telecommuting'].map({1: 'has telecommuting', 0: "no telecommuting"})
df['has_company_logo'] = df['has_company_logo'].map({1: 'has company logo', 0: "no company logo"})
df['has_questions'] = df['has_questions'].map({1: 'has questions', 0: "no questions"})


In [23]:
filtered_df = df

In [24]:
#Checking the distribution of target class
filtered_df[filtered_df['fraudulent'] == 1].value_counts()

location               salary_range  company_profile                                                                                                                                                                                                                                                                                                                                                                                                                                                                        description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [29]:
# # Assuming you have a DataFrame named 'filtered_df'

# # Filter the DataFrame
# fraudulent_df = filtered_df[filtered_df['fraudulent'] == 1]

# # Specify the file path where you want to save the CSV file
# output_file = "fraudulent_jobs.csv"

# # Write the filtered DataFrame to a CSV file
# # fraudulent_df.to_csv(output_file, index=False)

# # Replace "fraudulent_jobs.csv" with the desired file name and path.


In [30]:
# df_f = pd.read_csv('fraudulent_jobs.csv')
# df_f['fraudulent']

In [25]:
filtered_df.isnull().sum()

location            0
salary_range        0
company_profile     0
description         0
benefits            0
telecommuting       0
has_company_logo    0
has_questions       0
employment_type     0
fraudulent          0
job_field           0
Requirements        0
dtype: int64

In [26]:
filtered_df.head()

Unnamed: 0,location,salary_range,company_profile,description,benefits,telecommuting,has_company_logo,has_questions,employment_type,fraudulent,job_field,Requirements
1,"NZ, , Auckland",0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What you will get from usThrough being part of...,no telecommuting,has company logo,no questions,Full-time,0,Customer Service - Cloud Video Production Succ...,What we expect from you:Your key responsibilit...
3,"US, DC, Washington",0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,Our culture is anything but corporate—we have ...,no telecommuting,has company logo,no questions,Full-time,0,Account Executive - Washington DC Sales Comput...,"EDUCATION: Bachelor’s or Master’s in GIS, busi..."
4,"US, FL, Fort Worth",0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,Full Benefits Offered,no telecommuting,has company logo,has questions,Full-time,0,Bill Review Manager Bill Review Manager Hospit...,QUALIFICATIONS:RN license in the State of Texa...
6,"DE, BE, Berlin",20000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Benefits: Being part of a fast-growing co...,no telecommuting,has company logo,has questions,Full-time,0,Head of Content (m/f) ANDROIDPIT Online Media ...,Your Know-How: ...
8,"US, FL, Pensacola",0,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,no available data,no telecommuting,has company logo,has questions,Full-time,0,HP BSM SME HP BSM SME Information Technology a...,MUST BE A US CITIZEN.An active TS/SCI clearanc...


In [33]:
# # Handle missing data
# filtered_df = filtered_df.fillna(' ')

In [27]:
columns = filtered_df.columns
column_data_types = filtered_df[columns].dtypes

print(column_data_types)

location            object
salary_range         int64
company_profile     object
description         object
benefits            object
telecommuting       object
has_company_logo    object
has_questions       object
employment_type     object
fraudulent           int64
job_field           object
Requirements        object
dtype: object


In [28]:
# Convert 'salary_range' column to object type
filtered_df['salary_range'] = filtered_df['salary_range'].astype('str')

filtered_df.dtypes


location            object
salary_range        object
company_profile     object
description         object
benefits            object
telecommuting       object
has_company_logo    object
has_questions       object
employment_type     object
fraudulent           int64
job_field           object
Requirements        object
dtype: object

In [29]:
# Combine relevant columns into a single 'text' column
filtered_df['text'] = filtered_df['location'] + " " + filtered_df['salary_range'] + " " + filtered_df['company_profile'] + " " + filtered_df['description'] + " " + filtered_df['benefits'] + " " + filtered_df['telecommuting'] + " " + filtered_df['has_company_logo'] + " " + filtered_df['has_questions'] + " " + filtered_df['employment_type'] + " " + filtered_df['job_field'] + " " + filtered_df['Requirements']

In [30]:
filtered_df[filtered_df['fraudulent'] == 1]['text'].iloc[5]

"US, OH,  0 no available data Apply below using link#URL_00962c0bdac3ecf40b2931cbb6493290c8712818f9b055eb964210d5ecab508a#SkyNet Managed Technology Services, a leading IT consulting firmbased in Columbus, Ohio, is seeking a dynamic sales professionalwho can hunt, identify opportunities, and sell technology solutionsand professional services. This is a consultative sales positiondealing directly with a variety of business contacts from C-Levelto Administrative. Experience in selling technology serviceswill give you the advantage.Responsibilities:Actively pursue new business opportunities through prospecting,cold calling, qualifying, scheduling appointments, andnetworkingMeet or exceed targeted monthly, quarterly, and annual salesactivity and sales revenue goalsMaintain the Customer Relationship Management (CRM) database ofclient and prospect informationManage activities, leads, opportunities, and salespipelineRequirements:Bachelor's degree or equivalent preferredMinimum 3 years consulta

In [31]:
filtered_df.head()

Unnamed: 0,location,salary_range,company_profile,description,benefits,telecommuting,has_company_logo,has_questions,employment_type,fraudulent,job_field,Requirements,text
1,"NZ, , Auckland",0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What you will get from usThrough being part of...,no telecommuting,has company logo,no questions,Full-time,0,Customer Service - Cloud Video Production Succ...,What we expect from you:Your key responsibilit...,"NZ, , Auckland 0 90 Seconds, the worlds Cloud ..."
3,"US, DC, Washington",0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,Our culture is anything but corporate—we have ...,no telecommuting,has company logo,no questions,Full-time,0,Account Executive - Washington DC Sales Comput...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...","US, DC, Washington 0 Our passion for improving..."
4,"US, FL, Fort Worth",0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,Full Benefits Offered,no telecommuting,has company logo,has questions,Full-time,0,Bill Review Manager Bill Review Manager Hospit...,QUALIFICATIONS:RN license in the State of Texa...,"US, FL, Fort Worth 0 SpotSource Solutions LLC ..."
6,"DE, BE, Berlin",20000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Benefits: Being part of a fast-growing co...,no telecommuting,has company logo,has questions,Full-time,0,Head of Content (m/f) ANDROIDPIT Online Media ...,Your Know-How: ...,"DE, BE, Berlin 20000 Founded in 2009, the Fonp..."
8,"US, FL, Pensacola",0,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,no available data,no telecommuting,has company logo,has questions,Full-time,0,HP BSM SME HP BSM SME Information Technology a...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,"US, FL, Pensacola 0 Solutions3 is a woman-owne..."


In [32]:
# Data Cleanup
import re # Import the 're' module for regular expressions
filtered_df['text'] = filtered_df['text'].str.replace('\n', ' ')
filtered_df['text'] = filtered_df['text'].str.replace('\r', ' ')
filtered_df['text'] = filtered_df['text'].str.replace('\t', ' ')
filtered_df['text'] = filtered_df['text'].apply(lambda x: re.sub(r'[0-9]', '', x))
filtered_df['text'] = filtered_df['text'].apply(lambda x: re.sub(r'[/(){}\[\]\|@,;.:-]', ' ', x))
filtered_df['text'] = filtered_df['text'].apply(lambda s: s.lower() if type(s) == str else s)
filtered_df['text'] = filtered_df['text'].str.replace('  ', ' ')

In [33]:
# Remove Stop words
import spacy  # Import the 'spacy' library
nlp = spacy.load("en_core_web_sm")
filtered_df['text'] = filtered_df['text'].apply(lambda x: ' '.join([word for word in x.split() if nlp.vocab[word].is_stop == False]))

In [34]:
# Tokenize the text using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [35]:
filtered_df

Unnamed: 0,location,salary_range,company_profile,description,benefits,telecommuting,has_company_logo,has_questions,employment_type,fraudulent,job_field,Requirements,text
1,"NZ, , Auckland",0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What you will get from usThrough being part of...,no telecommuting,has company logo,no questions,Full-time,0,Customer Service - Cloud Video Production Succ...,What we expect from you:Your key responsibilit...,nz auckland seconds worlds cloud video product...
3,"US, DC, Washington",0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,Our culture is anything but corporate—we have ...,no telecommuting,has company logo,no questions,Full-time,0,Account Executive - Washington DC Sales Comput...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",dc washington passion improving quality life g...
4,"US, FL, Fort Worth",0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,Full Benefits Offered,no telecommuting,has company logo,has questions,Full-time,0,Bill Review Manager Bill Review Manager Hospit...,QUALIFICATIONS:RN license in the State of Texa...,fl fort worth spotsource solutions llc global ...
6,"DE, BE, Berlin",20000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Benefits: Being part of a fast-growing co...,no telecommuting,has company logo,has questions,Full-time,0,Head of Content (m/f) ANDROIDPIT Online Media ...,Your Know-How: ...,de berlin founded fonpit ag rose international...
8,"US, FL, Pensacola",0,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,no available data,no telecommuting,has company logo,has questions,Full-time,0,HP BSM SME HP BSM SME Information Technology a...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,fl pensacola solutions woman owned small busin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17874,"US, ,",80000,no available data,"Sr, JavaScript Developer Experience : 4-10 yea...",no available data,no telecommuting,no company logo,no questions,Full-time,0,JavaScript Developer JavaScript Developer Comp...,no available data Mid-Senior level Bachelor's ...,available data sr javascript developer experie...
17875,"CA, ON, Toronto",0,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,What can you expect from us?We have an open cu...,no telecommuting,has company logo,has questions,Full-time,0,Account Director - Distribution Sales Compute...,To ace this role you:Will eat comprehensive St...,toronto vend looking awesome new talent come j...
17876,"US, PA, Philadelphia",0,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,Health &amp; WellnessMedical planPrescription ...,no telecommuting,has company logo,has questions,Full-time,0,Payroll Accountant Accounting Internet Account...,- B.A. or B.S. in Accounting- Desire to have f...,pa philadelphia weblinc e commerce platform se...
17878,"NG, LA, Lagos",0,no available data,Nemsia Studios is looking for an experienced v...,Competitive salary (compensation will be based...,no telecommuting,no company logo,has questions,Contract,0,Graphic Designer Graphic Designer Graphic Desi...,1. Must be fluent in the latest versions of Co...,ng la lagos available data nemsia studios look...


In [36]:
filtered_df[filtered_df['fraudulent'] == 1]['text'].iloc[10]

'bakersfield jaco oil refined resources partnered effort streamline hiring process provide efficient effective recruitng model focus help develop achieve career goals makeing solid geographical cultural professional fiit leveraging career new exciting professional venture!please direct communications hr department refined resources #url_dedcababfedffafacbcdfeee# #email_bacfdcecbdabdecdbcadfbeacc#darren lawson vp recruiting #email_dfeedfcdddfcabcdbbdcbfadc# #phone_aadcbcadbebadafed# qualified candidates encouraged apply directly job posting direct email phone calls considered thank cooperation recruiters ust testing technician ii bakersfield local petroleum company operates primarily retail wholesale motor fuels related petroleum products seeking talented experienced maintenance technicians perform troubleshooting maintenance retail gasoline equipment responsibilitiesthe ideal candidate experience working retail gasoline dispensing peripheral equipment ust systems weights measures compl

In [37]:
filtered_df

Unnamed: 0,location,salary_range,company_profile,description,benefits,telecommuting,has_company_logo,has_questions,employment_type,fraudulent,job_field,Requirements,text
1,"NZ, , Auckland",0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What you will get from usThrough being part of...,no telecommuting,has company logo,no questions,Full-time,0,Customer Service - Cloud Video Production Succ...,What we expect from you:Your key responsibilit...,nz auckland seconds worlds cloud video product...
3,"US, DC, Washington",0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,Our culture is anything but corporate—we have ...,no telecommuting,has company logo,no questions,Full-time,0,Account Executive - Washington DC Sales Comput...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",dc washington passion improving quality life g...
4,"US, FL, Fort Worth",0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,Full Benefits Offered,no telecommuting,has company logo,has questions,Full-time,0,Bill Review Manager Bill Review Manager Hospit...,QUALIFICATIONS:RN license in the State of Texa...,fl fort worth spotsource solutions llc global ...
6,"DE, BE, Berlin",20000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Benefits: Being part of a fast-growing co...,no telecommuting,has company logo,has questions,Full-time,0,Head of Content (m/f) ANDROIDPIT Online Media ...,Your Know-How: ...,de berlin founded fonpit ag rose international...
8,"US, FL, Pensacola",0,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,no available data,no telecommuting,has company logo,has questions,Full-time,0,HP BSM SME HP BSM SME Information Technology a...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,fl pensacola solutions woman owned small busin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17874,"US, ,",80000,no available data,"Sr, JavaScript Developer Experience : 4-10 yea...",no available data,no telecommuting,no company logo,no questions,Full-time,0,JavaScript Developer JavaScript Developer Comp...,no available data Mid-Senior level Bachelor's ...,available data sr javascript developer experie...
17875,"CA, ON, Toronto",0,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,What can you expect from us?We have an open cu...,no telecommuting,has company logo,has questions,Full-time,0,Account Director - Distribution Sales Compute...,To ace this role you:Will eat comprehensive St...,toronto vend looking awesome new talent come j...
17876,"US, PA, Philadelphia",0,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,Health &amp; WellnessMedical planPrescription ...,no telecommuting,has company logo,has questions,Full-time,0,Payroll Accountant Accounting Internet Account...,- B.A. or B.S. in Accounting- Desire to have f...,pa philadelphia weblinc e commerce platform se...
17878,"NG, LA, Lagos",0,no available data,Nemsia Studios is looking for an experienced v...,Competitive salary (compensation will be based...,no telecommuting,no company logo,has questions,Contract,0,Graphic Designer Graphic Designer Graphic Desi...,1. Must be fluent in the latest versions of Co...,ng la lagos available data nemsia studios look...


In [38]:
# Encode the text data
X_encoded = tokenizer(filtered_df['text'].tolist(), truncation=True, padding=True, return_tensors='tf', max_length=128)

In [39]:
X_encoded

{'input_ids': <tf.Tensor: shape=(12977, 128), dtype=int32, numpy=
array([[  101, 20008,  8666, ...,  1035,  1042,   102],
       [  101,  5887,  2899, ..., 20566,  1047,   102],
       [  101, 13109,  3481, ...,  3791,  2449,   102],
       ...,
       [  101,  6643,  4407, ..., 10457, 21442,   102],
       [  101, 12835,  2474, ...,  3399,  5939,   102],
       [  101, 20008,  1050, ...,  2307,  4157,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(12977, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}

In [40]:
import numpy as np

# Convert the TensorFlow tensor to a NumPy array
X_encoded_np = X_encoded.input_ids.numpy()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded_np, filtered_df['fraudulent'], test_size=0.2, random_state=42)

In [41]:
# Create a DistilBERT model
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [42]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])



In [43]:
# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=3, batch_size=16)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [46]:
# import pickle

# with open('model.pkl', 'wb') as model_file:
#     pickle.dump(model, model_file)




In [52]:
# # Save model to directory
# save_directory = "model"
# model.save_pretrained(save_directory)


In [53]:
# loaded_model = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

Some layers from the model checkpoint at model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at model and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# Evaluate the model
y_pred = model.predict(X_test)[0].argmax(axis=-1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2469
           1       0.97      0.72      0.82       127

    accuracy                           0.98      2596
   macro avg       0.98      0.86      0.91      2596
weighted avg       0.98      0.98      0.98      2596



In [55]:
def construct_encodings(x, tkzr, max_len, trucation=True, padding=True):
    return tkzr(x, max_length=max_len, truncation=trucation, padding=padding)

In [56]:
def construct_tfdataset(encodings, y=None):
    if y:
        return tf.data.Dataset.from_tensor_slices((dict(encodings),y))
    else:
        # this case is used when making predictions on unseen samples after training
        return tf.data.Dataset.from_tensor_slices(dict(encodings))

In [57]:
def create_predictor(model, model_name, max_len):
  tkzr = DistilBertTokenizer.from_pretrained(model_name)
  
  def predict_proba(text):
      x = [text]

      encodings = construct_encodings(x, tkzr, max_len=max_len)
      tfdataset = construct_tfdataset(encodings)
      tfdataset = tfdataset.batch(1)

      preds = model.predict(tfdataset).logits
      preds = activations.softmax(tf.convert_to_tensor(preds)).numpy()
      return preds[0][0]
    
  return predict_proba




In [311]:
clf = create_predictor(model, 'distilbert-base-uncased', 128)

In [312]:
def classify_job(description):
    prediction = clf(description)
    if prediction > 0.5:
        return "real job"
    else:
        return "fake job"


In [60]:
# Sample real job posting text
sample_text_1= """
Title: Software Engineer
Company: ABC Tech
Location: San Francisco, CA
Description: We are looking for a highly skilled software engineer to join our team. You will be responsible for developing and maintaining cutting-edge software applications. The ideal candidate should have a strong background in software development, problem-solving skills, and a passion for technology.
Requirements: 
- Bachelor's degree in Computer Science or related field
- Proficiency in programming languages such as Python and Java
- Experience with web development frameworks
- Strong problem-solving skills
- Excellent teamwork and communication skills
Benefits: 
- Competitive salary and bonuses
- Health insurance
- Retirement plan
- Career growth opportunities
"""

In [61]:
# Sample fake job posting text
sample_text_2 = """
Title: High-Paying Work-From-Home Opportunity
Company: XYZ Solutions
Location: Anywhere
Description: Earn $10,000 per week from the comfort of your home! No experience required. Just send us $100 upfront to get started, and we'll make you rich in no time. Act fast, this offer won't last!
Requirements: None
Benefits: Get rich quick, work only 1 hour per day, unlimited vacation days
"""

In [62]:
# Sample fake job posting text
sample_text_4 = """
Work from home and earn $10,000 per week with our amazing opportunity! No experience required. Apply now at http://example.com.
"""

In [63]:
# Sample real job posting text
sample_text_3 = """
We are looking for a highly motivated and skilled software developer to join our team. The ideal candidate will have a strong background in computer science, experience with Python, and a passion for solving complex problems. This is a full-time position with a competitive salary and benefits package. If you're interested, please send your resume to careers@example.com.
"""

In [395]:
import openai
import pandas as pd

# Initialize OpenAI API
openai.api_key = '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'  # Ensure you never expose your API key in public forums

# Set up the messages for the chat-based model
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f'Analyze the following job-related tweet and extract details: "{sample_text_4}"'},
    {"role": "user", "content": "Extract the following information: Location, Salary Range (float), Company Profile (with minimum words), Job Description (with minimum words), Benefits, Telecommuting (yes/no), Has Company Logo (yes/no), Has Questions (yes/no), Employment Type, Fraudulent, Job Field, Requirements."}
]

# Call the OpenAI API
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=messages
)

# Extract the assistant's reply from the response
assistant_reply = response.choices[0].message['content']

# You can then parse the assistant's reply to extract the required fields and integrate them into a DataFrame

# ... [Your parsing and DataFrame integration code here]

print(assistant_reply)


Information extracted from the tweet:
- Location: Work from home
- Salary Range: $10,000 per week (float)
- Company Profile: Not provided in the tweet
- Job Description: Not provided in the tweet
- Benefits: Not mentioned in the tweet
- Telecommuting: Yes, as mentioned in "Work from home"
- Has Company Logo: Not mentioned in the tweet
- Has Questions: Not mentioned in the tweet
- Employment Type: Not mentioned in the tweet
- Fraudulent: Not determined from the tweet
- Job Field: Not provided in the tweet
- Requirements: No experience required, as mentioned in the tweet


In [396]:
import re
import pandas as pd

# Define regular expressions for each field
regexes = {
    'location': r"Location: (.+?)\n",
    'salary_range': r"Salary Range: (.+?)\n",
    'company_profile': r"Company Profile: (.+?)\n",
    'description': r"Job Description: (.+?)\n",
    'benefits': r"Benefits: (.+?)\n",
    'telecommuting': r"Telecommuting: (.+?)\n",
    'has_company_logo': r"Has Company Logo: (.+?)\n",
    'has_questions': r"Has Questions: (.+?)\n",
    'employment_type': r"Employment Type: (.+?)\n",
    'job_field': r"Job Field: (.+?)\n",
    'Requirements': r"Requirements: (.+)"
}

# Sample data
test_data = [assistant_reply]

# Extract data using regular expressions
data_list = []
for test in test_data:
    test_str = str(test)  # Convert dictionary to string
    data = {}
    for key, regex in regexes.items():
        match = re.search(regex, test_str)
        data[key] = match.group(1) if match else "Not specified"
    data_list.append(data)

# Convert the extracted data into a DataFrame
df = pd.DataFrame(data_list)

print(df)


         location              salary_range            company_profile  \
0  Work from home  $10,000 per week (float)  Not provided in the tweet   

                 description                    benefits  \
0  Not provided in the tweet  Not mentioned in the tweet   

                           telecommuting            has_company_logo  \
0  Yes, as mentioned in "Work from home"  Not mentioned in the tweet   

                has_questions             employment_type  \
0  Not mentioned in the tweet  Not mentioned in the tweet   

                   job_field  \
0  Not provided in the tweet   

                                        Requirements  
0  No experience required, as mentioned in the tweet  


In [397]:
# Extract the first numerical value from each row in the 'salary_range' column
df['salary_range'] = df['salary_range'].str.extract(r"(\d+[\d,\.]*\d*)")

# Convert the extracted values to integers
df['salary_range'] = df['salary_range'].str.replace(',', '').astype(float).fillna(0).astype(int)


In [398]:
# Fill NaN values based on the 'fraudulent' column
df.loc[df['location'].str.contains('Not|None', case=False, na=False),'location'] = 'no available data'

In [399]:
columns_to_check = ['company_profile', 'description', 'Requirements', 'benefits','job_field']

for col in columns_to_check:
    df.loc[df[col].str.contains('Not|None', case=False, na=False), col] = 'no available data'


In [400]:
df.loc[df['telecommuting'].str.contains('Yes', case=False, na=False), 'telecommuting'] = 'has telecommuting'
df.loc[df['telecommuting'].str.contains('Not|None', case=False, na=False), 'telecommuting'] = "hasn't telecommuting"

In [401]:
df.loc[df['has_company_logo'].str.contains('Yes', case=False, na=False), 'has_company_logo'] = 'has company logo'
df.loc[df['has_company_logo'].str.contains('Not|None', case=False, na=False), 'has_company_logo'] = "hasn't company logo"

df.loc[df['has_questions'].str.contains('Yes', case=False, na=False), 'has_questions'] = 'has questions'
df.loc[df['has_questions'].str.contains('Not|None', case=False, na=False), 'has_questions'] = "hasn't questions"

In [402]:
df.loc[df['employment_type'].str.contains('Not|None', case=False, na=False), 'employment_type'] = "other"

In [403]:
df.loc[df['job_field'].str.contains('Not|None|No', case=False, na=False), 'job_field'] = "other"

In [404]:
# Convert 'salary_range' column to object type
df['salary_range'] = df['salary_range'].astype('str')

df.dtypes

location            object
salary_range        object
company_profile     object
description         object
benefits            object
telecommuting       object
has_company_logo    object
has_questions       object
employment_type     object
job_field           object
Requirements        object
dtype: object

In [405]:
# Combine relevant columns into a single 'text' column
df['text'] = df['location'] + " " + df['salary_range'] + " " + df['company_profile'] + " " + df['description'] + " " + df['benefits'] + " " + df['telecommuting'] + " " + df['has_company_logo'] + " " + df['has_questions'] + " " + df['employment_type'] + " " + df['job_field'] + " " + df['Requirements']

In [406]:
result = classify_job(df['text'].iloc[0])
print(result)


real job


## New Approach

In [407]:
import openai

# Initialize the OpenAI API with your API key
openai.api_key = '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'

def detect_red_flags_gpt3_turbo(job_description):
    # Designing the conversation for GPT-3.5-turbo
    conversation = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant that identifies red flags in job descriptions."},
            {"role": "user", "content": f"Given the job description: '{job_description}', does this contain red flags that might related to fake job posting? (yes/no)"}
        ]
    }

    # Getting response from GPT-3.5-turbo
    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=conversation["messages"], max_tokens=150)

    # Parsing the response
    answer = response.choices[0].message['content'].strip()

    return answer.lower()

    # # Based on the response, decide if it's fake or real
    # if "yes" in answer.lower():
    #     return "Potential fake job"
    # else:
    #     return "Likely real job"

# Test the function
# sample_text = "High-Paying Work-From-Home Opportunity. Earn $10,000 per week! No experience required. Send $100 to join."
# result = detect_red_flags_gpt3_turbo(filtered_df[filtered_df['fraudulent'] == 1]['text'].iloc[5])
result = detect_red_flags_gpt3_turbo("Work from home and earn $10,000 per week with our amazing opportunity! No experience required. Apply now at http://example.com.")
print(result)


yes


In [408]:
def job_cls(input_test):
    if detect_red_flags_gpt3_turbo(input_test):
        return("yes")
    else:
        return classify_job(input_test)

In [409]:
print(job_cls(sample_text_1))

yes


In [410]:
# print(clf("stocton staffing &amp recruiting right oil &amp energy industry!represented candidates automatically granted following perks expert negotiations behalf maximizing compensation package implimenting ongoing increases significant signing bonus refined resources addition potential signing bonuses client companies offer year access anyperk significant corporate discounts cell phones event tickets house cleaning inbetween you'll save thousands daily expenditures professional relocation services town candidates* candidates encouraged participate referral bonus program ranging $ $ successfully hired candidates referred directly refined resources teamplease submit referrals online referral formthank look forward working soon! click enlarge image ic&amp e technician bakersfield mt posoprincipal duties responsibilities calibrates tests maintains troubleshoots installs power plant instrumentation control systems electrical equipment performs maintenance motor control centers motor operated valves generators excitation equipment motors performs preventive predictive corrective maintenance equipment coordinating work team members designs installs new equipment system modifications troubleshoots performs maintenance dc backup power equipment process controls programmable logic controls plc emission monitoring equipment uses maintenance reporting system record time material use problem identified corrected action required provides complete history maintenance equipment schedule coordinate work monitor contractors specific tasks required follows safe working practices times identifies safety hazards recommends solutions follows environmental compliance work practices identifies environmental non compliance problems assist implementing solutions assists team members works departments support generating station achieving performance goals trains team members areas instrumentation control electrical systems performs housekeeping assignments directed conduct equipment system tagging according company plant rules regulations perform equipment safety inspections required record results appropriate participate small construction projects read interpret drawings sketches prints specifications required orders parts needed affect maintenance repair performs operations tasks needed basis tasks assigned available reasonable response time emergency ins overtime plus provide acceptable hour contact phone company pager excellent verbal written communications skills ability coordinate work activities team members technical subjects job families ability work weekends holidays rotating shifts required benefitswhat offered competitive compensation package% matched retirement fundannual vacations paid companysignificant bonus structureopportunity advancement benefits packageannual performance reviews base salary increasesannual cost living increases sound clean safe enjoyable working environment &amp company cultureworld renound management executive team promote leverage careers invest employees long term success careers overall company employee goalsqualified candidates contact darren lawson vp recruiting #email_dfeedfcdddfcabcdbbdcbfadc# #phone_aadcbcadbebadafed# telecommuting company logo questions time ic&e technician oil & energy oil & energy qualificationsknowledge skills &amp abilities high school diploma ged required valid driver’s license ability read write communicate effectively english good math skills years experience i&amp c technician electrician power plant environment preferably strong electrical background including voltages kv provide following demonstrated knowledge electrical equipment electronics schematics basics chemistry physics controls instrumentation demonstrated knowledge safe work practices associated power plant environment demonstrated ability calibrate i&amp c systems equipment including analytic equipment demonstrated ability configure operate test instruments equipment necessary troubleshoot repair plant equipment including limited distributed control systems programmable logic controllers motor control centers transformers generators continuous emissions monitor cem systems demonstrated ability work team environment mid senior level high school equivalent"
# # ))

In [411]:
# print(clf("no available data 0 The company is looking for a highly motivated and skilled software developer to join their team. The ideal candidate will have a strong background in computer science, experience with Python, and a passion for solving complex problems. no available data hasn't telecommuting hasn't company logo hasn't questions Full-time. Software development. Strong background in computer science, experience with Python, and a passion for solving complex problems."))

In [412]:
# Save the model in TensorFlow SavedModel format
# model.save('model/distilbert_fake_job_detection', save_format='tf')

In [413]:
# # Sample fake job posting text
# sample_text_4 = """
# Work from home and earn $10,000 per week with our amazing opportunity! No experience required. Apply now at http://example.com.
# """

In [414]:
# # Sample real job posting text
# sample_text_3 = """
# We are looking for a highly motivated and skilled software developer to join our team. The ideal candidate will have a strong background in computer science, experience with Python, and a passion for solving complex problems. This is a full-time position with a competitive salary and benefits package. If you're interested, please send your resume to careers@example.com.
# """

In [415]:
# import openai
# import pandas as pd

# # Initialize OpenAI API
# openai.api_key = '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'  # Ensure you never expose your API key in public forums

# # Set up the messages for the chat-based model
# messages = [
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": f'Analyze the following job-related tweet and extract details: "{sample_text_4}"'},
#     {"role": "user", "content": "Extract the following information: Location, Salary Range (float), Company Profile (with minimum words), Job Description (with minimum words), Benefits, Telecommuting (yes/no), Has Company Logo (yes/no), Has Questions (yes/no), Employment Type, Fraudulent, Job Field, Requirements."}
# ]

# # Call the OpenAI API
# response = openai.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=messages
# )

# # Extract the assistant's reply from the response
# assistant_reply = response.choices[0].message['content']

# # You can then parse the assistant's reply to extract the required fields and integrate them into a DataFrame

# # ... [Your parsing and DataFrame integration code here]

# print(assistant_reply)

In [416]:
# import re
# import pandas as pd

# # Define regular expressions for each field
# regexes = {
#     'location': r"Location: (.+?)\n",
#     'salary_range': r"Salary Range: (.+?)\n",
#     'company_profile': r"Company Profile: (.+?)\n",
#     'description': r"Job Description: (.+?)\n",
#     'benefits': r"Benefits: (.+?)\n",
#     'telecommuting': r"Telecommuting: (.+?)\n",
#     'has_company_logo': r"Has Company Logo: (.+?)\n",
#     'has_questions': r"Has Questions: (.+?)\n",
#     'employment_type': r"Employment Type: (.+?)\n",
#     'job_field': r"Job Field: (.+?)\n",
#     'Requirements': r"Requirements: (.+)"
# }

# # Sample data
# test_data = [assistant_reply]

# # Extract data using regular expressions
# data_list = []
# for test in test_data:
#     test_str = str(test)  # Convert dictionary to string
#     data = {}
#     for key, regex in regexes.items():
#         match = re.search(regex, test_str)
#         data[key] = match.group(1) if match else "Not specified"
#     data_list.append(data)

# # Convert the extracted data into a DataFrame
# df_new = pd.DataFrame(data_list)

# print(df_new)

In [417]:
# # Extract the first numerical value from each row in the 'salary_range' column
# df_new['salary_range'] = df_new['salary_range'].str.extract(r"(\d+[\d,\.]*\d*)")

# # Convert the extracted values to integers
# df_new['salary_range'] = df_new['salary_range'].str.replace(',', '').astype(float).fillna(0).astype(int)

In [418]:
# df_new

In [419]:
# filtered_df.head(1)

In [420]:
# columns_to_check = ['company_profile', 'description', 'Requirements', 'benefits','job_field']

# for col in columns_to_check:
#     df_new.loc[df_new[col].str.contains('Not|None', case=False, na=False), col] = 'no available data'

In [421]:
# df_new

In [422]:
# # First loop: Replace 'Not' and 'No' with 0
# for col in df_new.columns:
#     if df_new[col].dtype == 'object' and all(isinstance(x, (str, type(None))) for x in df_new[col]):
#         df_new.loc[df_new[col].str.contains('Not', case=False, na=False), col] = 0

# # Second loop: Replace 'Yes' with 1
# # for col in df.columns:
# #     if df[col].dtype == 'object' and all(isinstance(x, (str, type(None))) for x in df[col]):
# #         df.loc[df[col].str.contains('Yes', case=False, na=False), col] = 1

# for col in ['telecommuting','has_company_logo','has_questions']:
#     if df_new[col].dtype == 'object' and all(isinstance(x, (str, type(None))) for x in df_new[col]):
#         df_new.loc[df_new[col].str.contains('No', case=False, na=False), col] = 0

In [423]:
# # Second loop: Replace 'Yes' with 1
# for col in ['telecommuting','has_company_logo','has_questions']:
#     if df_new[col].dtype == 'object' and all(isinstance(x, (str, type(None))) for x in df_new[col]):
#         df_new.loc[df_new[col].str.contains('Yes', case=False, na=False), col] = 1

In [424]:
# df_new

In [425]:
# import matplotlib.pyplot as plt
# # Plot accuracy graph
# plt.figure(figsize=(8, 6))
# plt.plot(history.history['accuracy'], label='Accuracy')
# plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
# plt.title('Model Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.show()

In [426]:
# # Load the trained DistilBERT model
# def load_distilbert_model(model_path):
#     model = tf.keras.models.load_model(model_path)
#     return model

In [427]:
# # Load the DistilBERT tokenizer
# def load_distilbert_tokenizer():
#     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
#     return tokenizer

In [428]:
# import numpy as np

# def predict_fake_job(text, loaded_model, tokenizer):
#     encoded_text = tokenizer(text, padding=True, truncation=True, return_tensors="tf")
#     encoded_text_np = {k: v.numpy() for k, v in encoded_text.items()}
#     prediction = loaded_model.predict(encoded_text_np)
#     predicted_label = np.argmax(prediction, axis=-1)
#     return predicted_label


In [429]:
# # Load the trained DistilBERT model
# model_path = 'model/distilbert_fake_job_detection'
# loaded_model = load_distilbert_model(model_path)

# # Load the DistilBERT tokenizer
# tokenizer = load_distilbert_tokenizer()

# # Example usage
# input_text = "Looking for a full-time job with competitive salary. No experience required, work from home, ern $10,000 per week! Software developer position at Virtusa Inc. Get rich quick with our amazing opportunty!"
# predicted_label = predict_fake_job(input_text, loaded_model, tokenizer)
# print('label ', predicted_label)
# if predicted_label == 0:
#     prediction_result = "Real Job"
# else:
#     prediction_result = "Fake Job"

# print("Prediction:", prediction_result)


In [430]:
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
# import re
# from spellchecker import SpellChecker  # For spell checking

# # Load the trained DistilBERT model
# model_path = 'model/distilbert_fake_job_detection'
# model = load_distilbert_model(model_path)

# # Load your testing data or create sample data
# # Replace 'test_data' with your actual test data
# test_data = [
#     "Looking for a full-time job with competitive salary.",
#     "No experience required, work from home, ern $10,000 per week!",
#     "Software developer position at XYZ Inc.",
#     "Get rich quick with our amazing opportunty!"
# ]

# # Define a list of suspicious phrases and words commonly used in fake job postings
# suspicious_keywords = ["work from home", "earn $", "get rich quick", "no experience required", "amazing opportunity"]

# # Tokenize and encode the testing data
# encoded_texts = tokenizer(test_data, truncation=True, padding=True, return_tensors="tf", max_length=128)

# # Convert the TensorFlow tensor to a NumPy array
# encoded_texts_np = {k: v.numpy() for k, v in encoded_texts.items()}

# # Make predictions
# predictions = model.predict(encoded_texts_np)

# # Get predicted labels (0 for real job, 1 for fake job)
# predicted_labels = np.argmax(predictions['logits'], axis=1)

# # Map predicted labels to human-readable categories
# predicted_categories = ["Real Job" if label == 0 else "Fake Job" for label in predicted_labels]

# # Spell checker
# spell_checker = SpellChecker()

# # Identify fake job postings based on suspicious keywords and spelling mistakes
# for i, text in enumerate(test_data):
#     is_fake = any(keyword in text.lower() for keyword in suspicious_keywords)
    
#     # Check for spelling mistakes
#     words = re.findall(r'\b\w+\b', text.lower())  # Tokenize the text into words
#     misspelled_words = spell_checker.unknown(words)
    
#     if misspelled_words:
#         is_fake = True
    
#     if is_fake:
#         predicted_categories[i] = "Fake Job"

# # Display the results
# for text, category in zip(test_data, predicted_categories):
#     print(f"Text: {text}\nPrediction: {category}\n")



In [431]:
# import numpy as np
# import pandas as pd
# import tensorflow as tf
# from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
# import re
# from spellchecker import SpellChecker  # For spell checking

# # Load the pretrained DistilBERT model and tokenizer
# model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# # Define a list of suspicious phrases and words commonly used in fake job postings
# suspicious_keywords = [
#     "work from home", "earn $", "get rich quick", "no experience required", "amazing opportunity",
#     "contact us", "apply now", "click here", "call now", "limited time offer"
# ]

# # Tokenize and encode the testing data
# def preprocess_text(text):
#     # Check for contact details (phone numbers, emails)
#     contact_details = re.findall(r'\b(?:\d{10}|\w+@\w+\.\w+)\b', text)
    
#     # Check for URLs
#     urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    
#     # Check for spelling mistakes
#     words = re.findall(r'\b\w+\b', text.lower())  # Tokenize the text into words
#     spell_checker = SpellChecker()
#     misspelled_words = spell_checker.unknown(words)
    
#     return contact_details, urls, misspelled_words

# # Example input paragraph
# input_text = """
# We are looking for a highly motivated and skilled software developer to join our team. The ideal candidate will have a strong background in computer science, experience with Python, and a passion for solving complex problems. This is a full-time position with a competitive salary and benefits package. If you're interested, please send your resume to careers@example.com.
# Apply now: http://example.com/apply
# """

# # Tokenize and encode the testing data
# encoded_text = tokenizer(input_text, truncation=True, padding=True, return_tensors="tf", max_length=128)

# # Convert the TensorFlow tensor to a NumPy array
# encoded_text_np = {k: v.numpy() for k, v in encoded_text.items()}

# # Make predictions
# predictions = model.predict(encoded_text_np)

# # Get predicted labels (0 for real job, 1 for fake job)
# predicted_label = np.argmax(predictions['logits'], axis=-1)

# # Map predicted label to human-readable category
# predicted_category = "Real Job" if predicted_label == 0 else "Fake Job"

# # Process the text to check for additional suspicious elements
# contact_details, urls, misspelled_words = preprocess_text(input_text)

# # Check for missing contact details
# missing_contact_details = "Missing" if not contact_details else "Found"

# # Check for verified URLs
# verified_urls = ["Verified" if url.startswith("http://") or url.startswith("https://") else "Not Verified" for url in urls]

# # Display the results
# print(f"Prediction: {predicted_category}")
# print(f"Missing Contact Details: {missing_contact_details}")
# print(f"Verified URLs: {', '.join(verified_urls)}")
# print(f"Misspelled Words: {', '.join(misspelled_words)}")


In [432]:
# import numpy as np
# import tensorflow as tf
# from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# # Load the pretrained DistilBERT model and tokenizer
# model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# # Sample real job posting text
# sample_text = """
# We are looking for a highly motivated and skilled software developer to join our team. The ideal candidate will have a strong background in computer science, experience with Python, and a passion for solving complex problems. This is a full-time position with a competitive salary and benefits package. If you're interested, please send your resume to careers@example.com.
# """

# # Tokenize and encode the sample text
# encoded_text = tokenizer(sample_text, truncation=True, padding=True, return_tensors="tf", max_length=128)

# # Convert the TensorFlow tensor to a NumPy array
# encoded_text_np = {k: v.numpy() for k, v in encoded_text.items()}

# # Make predictions
# predictions = model.predict(encoded_text_np)

# # Get predicted labels (0 for real job, 1 for fake job)
# predicted_label = np.argmax(predictions['logits'], axis=-1)

# # Map predicted label to human-readable category
# predicted_category = "Real Job" if predicted_label == 0 else "Fake Job"

# # Display the results
# print(f"Prediction: {predicted_category}")


In [433]:
# import numpy as np
# import tensorflow as tf
# from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# # Load the pretrained DistilBERT model and tokenizer
# model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# # Sample real job posting text
# sample_text = """
# Work from home and earn $10,000 per week with our amazing opportunity! No experience required. Apply now at http://example.com.
# """

# # Tokenize and encode the sample text
# encoded_text = tokenizer(sample_text, truncation=True, padding=True, return_tensors="tf", max_length=128)

# # Convert the TensorFlow tensor to a NumPy array
# encoded_text_np = {k: v.numpy() for k, v in encoded_text.items()}

# # Make predictions
# predictions = model.predict(encoded_text_np)

# # Get predicted labels (0 for real job, 1 for fake job)
# predicted_label = np.argmax(predictions['logits'], axis=-1)

# # Map predicted label to human-readable category
# predicted_category = "Real Job" if predicted_label == 0 else "Fake Job"

# # Display the results
# print(f"Prediction: {predicted_category}")


In [434]:
# import numpy as np
# import tensorflow as tf
# from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# # Load the pretrained DistilBERT model and tokenizer
# model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# # Sample real job posting text
# sample_text = """
# Title: Software Engineer
# Company: ABC Tech
# Location: San Francisco, CA
# Description: We are looking for a highly skilled software engineer to join our team. You will be responsible for developing and maintaining cutting-edge software applications. The ideal candidate should have a strong background in software development, problem-solving skills, and a passion for technology.
# Requirements: 
# - Bachelor's degree in Computer Science or related field
# - Proficiency in programming languages such as Python and Java
# - Experience with web development frameworks
# - Strong problem-solving skills
# - Excellent teamwork and communication skills
# Benefits: 
# - Competitive salary and bonuses
# - Health insurance
# - Retirement plan
# - Career growth opportunities

# """

# # Tokenize and encode the sample text
# encoded_text = tokenizer(sample_text, truncation=True, padding=True, return_tensors="tf", max_length=128)

# # Convert the TensorFlow tensor to a NumPy array
# encoded_text_np = {k: v.numpy() for k, v in encoded_text.items()}

# # Make predictions
# predictions = model.predict(encoded_text_np)

# # Get predicted labels (0 for real job, 1 for fake job)
# predicted_label = np.argmax(predictions['logits'], axis=-1)

# # Map predicted label to human-readable category
# predicted_category = "Real Job" if predicted_label == 0 else "Fake Job"

# # Display the results
# print("Input Text:")
# print(sample_text)
# print(f"Prediction: {predicted_category}")


In [435]:
# import numpy as np
# import tensorflow as tf
# from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# # Load the pretrained DistilBERT model and tokenizer
# model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# # Sample real job posting text
# sample_text = """
# Title: High-Paying Work-From-Home Opportunity
# Company: XYZ Solutions
# Location: Anywhere
# Description: Earn $10,000 per week from the comfort of your home! No experience required. Just send us $100 upfront to get started, and we'll make you rich in no time. Act fast, this offer won't last!
# Requirements: None
# Benefits: Get rich quick, work only 1 hour per day, unlimited vacation days
# """

# # Tokenize and encode the sample text
# encoded_text = tokenizer(sample_text, truncation=True, padding=True, return_tensors="tf", max_length=128)

# # Convert the TensorFlow tensor to a NumPy array
# encoded_text_np = {k: v.numpy() for k, v in encoded_text.items()}

# # Make predictions
# predictions = model.predict(encoded_text_np)

# # Get predicted labels (0 for real job, 1 for fake job)
# predicted_label = np.argmax(predictions['logits'], axis=-1)

# # Map predicted label to human-readable category
# predicted_category = "Real Job" if predicted_label == 0 else "Fake Job"

# # Display the results
# print("Input Text:")
# print(sample_text)
# print(f"Prediction: {predicted_category}")
