<a href="https://www.kaggle.com/code/royremo/fake-job-detection-using-sentiment-analysis?scriptVersionId=160483531" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #007fd4">Fake Job Detection using NLP</p>

##  <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Goal: Detecting Fake Job Postings posted in the different job portals</p>

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Import Libraries </p>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup
import re
import string

from collections import Counter
from nltk.corpus import stopwords

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from nltk.corpus import opinion_lexicon
from concurrent.futures import ProcessPoolExecutor

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, classification_report,confusion_matrix,roc_curve,auc,roc_auc_score, ConfusionMatrixDisplay)

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Load Data from the Source </p>

In [None]:
df_raw = pd.read_csv("/kaggle/input/model-data/fake_job_postings - existing.csv")
print(f"Raw Data Shape: {df_raw.shape}")
df_raw.head()

In [None]:
df = df_raw.copy()

print(f"Shape: {df.shape}")

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Basic Data Exploration </p>

In [None]:
df.info()

In [None]:
df.describe(include = "all")

In [None]:
df.describe(exclude="object")

In [None]:
df.describe(exclude=np.number)

There are many missing feature values; we will perform further checks.

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;">Data Checks to perform </p>
 - Check Missing values
 - Check Duplicates
 - Check data type
 - Check the number of unique values of each column
 - Check statistics of data set
 - Check various categories present in the different categorical column

 #### Missing values

In [None]:
df.isnull().sum()

In [None]:
## Percentage of NaN values per feature, listed only for features that have any
null_share = df.isnull().mean()
feature_na = [name for name in df.columns if null_share[name] > 0]

for feature in feature_na:
    print(feature, np.round(null_share[feature] * 100, 4), " % missing")

In [None]:
# Percentage of null values per column (0-100 scale)
columns_with_null_values_total = (df.isnull().sum() / len(df)) * 100
# Keep only the columns that are more than 10% null, most sparse first
columns_with_null_values_percentage = columns_with_null_values_total[columns_with_null_values_total > 10].sort_values(ascending=False)

columns_with_null_values_percentage

<b>Observation:</b> There are many missing feature values. There are 8 features with ~85% null values and 2 features with ~65% null values.

Features with more than 10% null values (the threshold applied in the cell above) are removed from the dataset.

In [None]:
# Remove every column flagged above as more than 10% null.
# errors="ignore" makes the cell safe to re-run: columns that were already
# dropped are skipped instead of raising KeyError.
df = df.drop(columns=columns_with_null_values_percentage.keys().tolist(), errors="ignore")

df.head()

In [None]:
df.columns

In [None]:
# Remove four columns by name; errors="ignore" keeps the cell re-runnable
# once the columns are already gone.
df = df.drop(columns=['job_id','telecommuting','has_company_logo','has_questions'], errors="ignore")

In [None]:
df.rename(columns={'title':'role','description':'job_description'},inplace=True)

In [None]:
df.head()

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Duplicate Values Check </p>

In [None]:
df.duplicated().sum()

In [None]:
print("***** Duplicate check Feature wise *****\n")

# Check Job description duplications
print("job_description: ", df["job_description"].duplicated().sum())

There are duplicated records; this has to be taken care of during the preprocessing stage.

In [None]:
# Order rows by description text (ascending, NaN last) so repeated descriptions sit together
sorted_data = df.sort_values(by='job_description')

# Show every row whose description repeats an earlier one in that order
is_repeat = sorted_data["job_description"].duplicated()
sorted_data[is_repeat]

In [None]:
df.shape

#### Check the Feature Uniqueness

In [None]:
# Print, per column, how many distinct values it holds followed by the values themselves
for feature in df.columns:
    distinct_values = df[feature].unique()
    print('{}: Unique Count: {}\n {}\n'.format(feature, len(distinct_values), distinct_values))

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Data preprocessing </p>

<b>Handle Missing Values</b>

In [None]:
df.isnull().sum()

In [None]:
# Fill NaN values with the placeholder string "missing".
# The result is assigned back to the column: calling fillna(inplace=True) on the
# df[feature] selection is chained assignment, which pandas deprecates and which
# leaves df unchanged when Copy-on-Write is active.
for feature in df.columns:
    if df[feature].isnull().any():
        df[feature] = df[feature].fillna("missing")

df.isnull().sum()

<b>Deduplication of data</b>

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(keep="first", inplace= True)
print("Shape: ", df.shape)

df.duplicated().sum()

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Exploratory Data Analysis </p>

#### Description: Analyze the count of Characters, Words, and Sentences in each segment

In [None]:
 # Checking for distribution of class label(percentages belonging to real class and percentages belonging to fraud class)
 # in the data 1 indicates fraud post
 # 0 indicating real post
 # Plotting pie chart for the data
 # function of Explode function: how the portion will appear (to understand change explode=(0,0.5))

# Wedge order matches the labels: fraudulent (1) first, then genuine (0)
labels = ('Fake', 'Real')
sizes = [
    (df['fraudulent'] == 1).sum(),
    (df['fraudulent'] == 0).sum(),
]
explode = (0, 0.1)  # pulls the second ('Real') wedge away from the centre
fig1, ax1 = plt.subplots(figsize=(8, 6))
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.2f%%',  # percentages with two decimals
    shadow=True,
    startangle=120,
)
ax1.axis('equal')
plt.title("Proportion of Fraudulent", size = 7)
plt.show()

In [None]:
import nltk
nltk.download('punkt')

In [None]:
# Length-based features derived from the raw job description text
job_text = df['job_description']

# Number of characters
df['desc_num_char'] = job_text.map(len)

# Number of tokens produced by NLTK's word tokenizer
df['desc_num_words'] = job_text.map(lambda text: len(nltk.word_tokenize(text)))

# Number of sentences produced by NLTK's sentence tokenizer
df['desc_num_sent'] = job_text.map(lambda text: len(nltk.sent_tokenize(text)))

df.head()

In [None]:
df.describe(include=np.number)

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Text preprocessing </p>

Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was observed to be better than Porter Stemming)<br>

After which we collect the words used to describe positive and negative type

In [None]:
# Hand-maintained English stop-word set, based on https://gist.github.com/sebleier/554280
# - 'no', 'nor' and 'not' are deliberately absent from this set.
# - 'br' is included: "<br /><br />" turns into the tokens "br br" after
#   punctuation is stripped, whereas "<br/>" is already removed with the HTML tags.
# NOTE(review): in the visible notebook the only reference to this set is a
# commented-out line in preprocess(); the active code there filters with
# NLTK's stopwords.words('english') instead. Confirm which list is intended.

stopWords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    The two irregular forms ("won't", "can't") are handled first so that the
    generic "n't" rule which follows does not split them incorrectly. Rules
    are applied one after another, in the order listed, with ``re.sub``.
    """
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, replacement in rules:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

def preprocess(corpus, stop_words=None):
    """Clean every document in ``corpus`` and return the cleaned strings as a list.

    Each document goes through these steps, in order: remove URLs, strip HTML
    markup, expand contractions, drop whitespace-delimited tokens containing a
    digit, replace every run of non-letters with a space, then tokenize,
    lowercase and discard stop words. Tokens are re-joined with single spaces.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (the ``stopwords`` corpus must have been downloaded
        before this function is called).

    Returns
    -------
    list of str
        One cleaned string per input document, in input order.
    """
    # Build the lookup once. The earlier version called stopwords.words('english')
    # for every single token, producing a new list and scanning it linearly each time.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)

    preprocessed = []
    for sentance in tqdm(corpus):
        sentance = re.sub(r"http\S+", "", sentance)
        sentance = BeautifulSoup(sentance, 'lxml').get_text()
        sentance = decontracted(sentance)
        # Raw string: "\S" and "\d" are regex escapes, not Python string escapes
        sentance = re.sub(r"\S*\d\S*", "", sentance).strip()
        sentance = re.sub('[^A-Za-z]+', ' ', sentance)
        lowered = (token.lower() for token in nltk.word_tokenize(sentance))
        preprocessed.append(' '.join(token for token in lowered if token not in stop_words))

    return preprocessed

In [None]:
df.sample(5)

In [None]:
df.dtypes

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
%%time

df["desc_transformed"] = preprocess(df['job_description'].values)
#df["site_transformed"] = preprocess(df['site'].values)
#df["role_transformed"] = preprocess(df['role'].values)
#df["comp_transformed"] = preprocess(df['company_name'].values)
#df["loc_transformed"] = preprocess(df['location'].values)


df["desc_transformed"][:5]

In [None]:
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
# we will plot 3 kind of word cloud
# 1st we will visualize all the words our data using the wordcloud plot
# 2nd we will visualize common words in real job posting
# 3rd we will visualize common words in fraud job posting
# join function is a core python function

In [None]:
# Extract the text from DataFrame
desc_text_data = df['desc_transformed'].str.cat(sep=" ")

# Generate the word cloud. The display settings are passed explicitly here:
# a bare WordCloud() would use the library defaults instead of the
# 500x500, white-background configuration chosen for this notebook.
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
desc_post = wc.generate(desc_text_data)

# Plot the word cloud
plt.figure(figsize=(13, 8))
plt.imshow(desc_post)
plt.axis("off")  # pixel ticks carry no meaning for a word cloud
plt.title("Word cloud: all job descriptions")
plt.show()

In [None]:
# Extract the real post text from DataFrame
real_post_text_data = df[df['fraudulent']==0]['desc_transformed'].str.cat(sep=" ")

# Generate the word cloud. The display settings are passed explicitly here:
# a bare WordCloud() would use the library defaults instead of the
# 500x500, white-background configuration chosen for this notebook.
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
real_post = wc.generate(real_post_text_data)

# Plot the word cloud
plt.figure(figsize=(13, 8))
plt.imshow(real_post)
plt.axis("off")  # pixel ticks carry no meaning for a word cloud
plt.title("Word cloud: real job postings (fraudulent == 0)")
plt.show()

In [None]:
# Extract the fraud post text from DataFrame
fraud_post_text_data = df[df['fraudulent']==1]['desc_transformed'].str.cat(sep=" ")

# Generate the word cloud. The display settings are passed explicitly here:
# a bare WordCloud() would use the library defaults instead of the
# 500x500, white-background configuration chosen for this notebook.
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
fraud_post = wc.generate(fraud_post_text_data)

# Plot the word cloud
plt.figure(figsize=(13, 8))
plt.imshow(fraud_post)
plt.axis("off")  # pixel ticks carry no meaning for a word cloud
plt.title("Word cloud: fraudulent job postings (fraudulent == 1)")
plt.show()

In [None]:
df.fraudulent.value_counts()

<b>Extract the Opinion Words from the Lexicon Library</b>

In [None]:
# Download the lexicon
nltk.download('opinion_lexicon')

In [None]:
# Get positive and negative words from the lexicon
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

In [None]:
# Corpus-level lexicon hits across all descriptions
tokens = word_tokenize(desc_text_data)

# One hit per token found in the respective opinion-lexicon word set
positive_score = sum(1 for word in tokens if word in positive_words)
negative_score = sum(1 for word in tokens if word in negative_words)

# Report both totals
print("Positive Score:", positive_score)
print("Negative Score:", negative_score)

In [None]:
# Corpus-level lexicon hits across the real postings only
tokens = word_tokenize(real_post_text_data)

# One hit per token found in the respective opinion-lexicon word set
positive_score = sum(1 for word in tokens if word in positive_words)
negative_score = sum(1 for word in tokens if word in negative_words)

# Report both totals
print("Positive Score:", positive_score)
print("Negative Score:", negative_score)

In [None]:
# Corpus-level lexicon hits across the fraudulent postings only
tokens = word_tokenize(fraud_post_text_data)

# One hit per token found in the respective opinion-lexicon word set
positive_score = sum(1 for word in tokens if word in positive_words)
negative_score = sum(1 for word in tokens if word in negative_words)

# Report both totals
print("Positive Score:", positive_score)
print("Negative Score:", negative_score)

In [None]:
%%time
# Per-document opinion-lexicon counts, computed in parallel worker processes (CPU only).
# Requires df to have a 'desc_transformed' column of cleaned text.
data_for_parallel = df[['desc_transformed']].copy()


def calculate_sentiment_scores(row):
    """Count opinion-lexicon hits in one cleaned description.

    Parameters
    ----------
    row : mapping
        Must provide the cleaned text under the key ``'desc_transformed'``.

    Returns
    -------
    tuple of (int, int)
        Number of tokens found in ``positive_words`` and in ``negative_words``.
    """
    tokens = word_tokenize(row['desc_transformed'])
    # positive_words / negative_words are sets, so each membership test is a hash
    # lookup. The earlier np.isin(tokens, list(...)) rebuilt a list from each set
    # (and converted it to an array) for every row.
    positive_score = sum(token in positive_words for token in tokens)
    negative_score = sum(token in negative_words for token in tokens)
    return positive_score, negative_score

# Convert Pandas DataFrame to a list of dictionaries
data_for_parallel = data_for_parallel.to_dict('records')

# Score rows in a process pool; chunksize sends rows to the workers in batches
# instead of one inter-process round trip per row.
with ProcessPoolExecutor() as executor:
    sentiment_scores = list(executor.map(calculate_sentiment_scores, data_for_parallel, chunksize=256))

# Assign sentiment scores to the DataFrame (reshape keeps the array 2-D even when there are no rows)
df[['positive_score', 'negative_score']] = np.array(sentiment_scores, dtype=int).reshape(-1, 2)

# Display the DataFrame with new sentiment score columns
print(df[['desc_transformed', 'positive_score', 'negative_score']])

In [None]:
df.head()

In [None]:
df_score=df.copy()

In [None]:
df_score.to_csv("Existing_data_score.csv", index=False)

In [None]:
# NOTE(review): this loads the scored data from the Kaggle input dataset, not the
# "Existing_data_score.csv" written to the working directory in the cell above —
# presumably a previously exported copy uploaded as a dataset; confirm it is current.
df = pd.read_csv("/kaggle/input/model-data/Existing_data_score.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fraction of rows kept for training; the remainder becomes the hold-out set
train_size = 0.8

# A fixed seed keeps the partition reproducible between runs
df_train, df_test = train_test_split(
    df,
    test_size=1 - train_size,
    random_state=42,
)

# Persist both partitions without the index column
for partition, filename in ((df_train, 'train_data.csv'), (df_test, 'test_data.csv')):
    partition.to_csv(filename, index=False)


In [None]:
df = df_train.copy()

print(f"Shape: {df.shape}")

In [None]:
df['fraudulent'].value_counts()

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;">Model Building </p>

In [None]:
# Fill missing values in 'desc_transformed' with an empty string.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which leaves df
# unchanged when Copy-on-Write is active.
df['desc_transformed'] = df['desc_transformed'].fillna('')

In [None]:
#MODEL BUILDING USING EXISTING DATA - XGBOOST
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from xgboost import XGBClassifier
import numpy as np

# Expects df to have 'desc_transformed', 'positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent', 'fraudulent' columns
# Fill any missing values in df
df = df.fillna('')

# Numeric feature columns (they end up as the first five columns of the combined matrices)
numeric_columns = ['positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent']

# Extract features and labels
X_text = df['desc_transformed']
X_numeric = df[numeric_columns]
y = df['fraudulent']

# Split the data into training and testing sets
X_text_train, X_text_test, X_numeric_train, X_numeric_test, y_train, y_test = train_test_split(
    X_text, X_numeric, y, test_size=0.2, random_state=42
)

# TF-IDF vectorization for text data (vocabulary learned from the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_text_train_tfidf = tfidf_vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = tfidf_vectorizer.transform(X_text_test)

# Combine text TF-IDF vectors with numeric features
X_train_combined = pd.concat([
    X_numeric_train.reset_index(drop=True),
    pd.DataFrame(X_text_train_tfidf.toarray())
], axis=1)

X_test_combined = pd.concat([
    X_numeric_test.reset_index(drop=True),
    pd.DataFrame(X_text_test_tfidf.toarray())
], axis=1)

# Standardize numeric features. Columns are replaced by label rather than written
# through .iloc, so float results are never forced into integer-typed columns.
scaler = StandardScaler()
X_train_combined[numeric_columns] = scaler.fit_transform(X_train_combined[numeric_columns])
X_test_combined[numeric_columns] = scaler.transform(X_test_combined[numeric_columns])

# Apply Min-Max scaling on top of the standardized numeric features
min_max_scaler = MinMaxScaler()
X_train_combined[numeric_columns] = min_max_scaler.fit_transform(X_train_combined[numeric_columns])
X_test_combined[numeric_columns] = min_max_scaler.transform(X_test_combined[numeric_columns])

# Convert feature names to strings
X_train_combined.columns = X_train_combined.columns.astype(str)
X_test_combined.columns = X_test_combined.columns.astype(str)

# Define XGBoost model
xgb = XGBClassifier(n_estimators=50, random_state=42)

# Stratified folds keep the fraud ratio comparable across the validation parts
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold labels and predictions, pooled over all folds
all_y_true = []
all_y_pred = []

# Define custom threshold
custom_threshold = 0.35  # Adjust this threshold based on your requirements

# Cross-validate on the ORIGINAL training rows. SMOTE is applied inside each fold,
# to the training part only, so no synthetic sample (interpolated from rows that may
# sit in the validation part) is ever scored during validation.
for fold, (train_index, val_index) in enumerate(kf.split(X_train_combined, y_train), 1):
    X_train, X_val = X_train_combined.iloc[train_index], X_train_combined.iloc[val_index]
    y_train_batch, y_val_batch = y_train.iloc[train_index], y_train.iloc[val_index]

    # Balance only the training part of this fold
    X_train, y_train_batch = SMOTE(random_state=42).fit_resample(X_train, y_train_batch)

    # Train the XGBoost model
    xgb.fit(X_train, y_train_batch)

    # Predict probabilities
    y_pred_prob_val = xgb.predict_proba(X_val)[:, 1]

    # Apply custom threshold
    y_pred_val = (y_pred_prob_val > custom_threshold).astype(int)

    # Store true and predicted labels for each fold
    all_y_true.extend(y_val_batch)
    all_y_pred.extend(y_pred_val)

    # Print metrics for each fold
    print(f"Fold {fold} - Accuracy: {accuracy_score(y_val_batch, y_pred_val)}, Precision: {precision_score(y_val_batch, y_pred_val)}, Recall: {recall_score(y_val_batch, y_pred_val)}, F1: {f1_score(y_val_batch, y_pred_val)}")

# Metrics over the pooled out-of-fold predictions
print(f"\nPooled CV - Accuracy: {accuracy_score(all_y_true, all_y_pred)}, Precision: {precision_score(all_y_true, all_y_pred)}, Recall: {recall_score(all_y_true, all_y_pred)}, F1: {f1_score(all_y_true, all_y_pred)}")

# Final model: balance the whole training split with SMOTE and refit on all of it,
# instead of keeping whichever model the last fold happened to produce.
smote = SMOTE(random_state=42)
X_train_combined, y_train = smote.fit_resample(X_train_combined, y_train)
xgb.fit(X_train_combined, y_train)

# Evaluate on the test set
# Predict probabilities
y_pred_prob_test = xgb.predict_proba(X_test_combined)[:, 1]

# Apply custom threshold
y_pred_test = (y_pred_prob_test > custom_threshold).astype(int)

# Print metrics for the test set
print("\nTest Set Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test)}")
print(f"Precision: {precision_score(y_test, y_pred_test)}")
print(f"Recall: {recall_score(y_test, y_pred_test)}")
print(f"F1: {f1_score(y_test, y_pred_test)}")

# Confusion matrix for the test set
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
print("\nTest Set Confusion Matrix:")
print(conf_matrix_test)

# Classification report for the test set
classification_report_test = classification_report(y_test, y_pred_test, target_names=["Real", "Fake"])
print("\nTest Set Classification Report:")
print(classification_report_test)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
# Predict probabilities for positive class
y_proba = xgb.predict_proba(X_test_combined)[:, 1]

# Compute the ROC curve once and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# AUC computed directly from the scores
auc_score = roc_auc_score(y_test, y_proba)
print("AUC Score:", auc_score)

# Plot ROC curve
plt.figure(figsize=(8, 8))
# label must be one string; a tuple would be shown verbatim (parentheses and quotes) in the legend
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# MODEL BUILDING USING EXISTING DATA - RANDOM FOREST
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import numpy as np

# Expects df to have 'desc_transformed', 'positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent', 'fraudulent' columns
# Fill any missing values in df
df = df.fillna('')

# Numeric feature columns (they end up as the first five columns of the combined matrices)
numeric_columns = ['positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent']

# Extract features and labels
X_text = df['desc_transformed']
X_numeric = df[numeric_columns]
y = df['fraudulent']

# Split the data into training and testing sets
X_text_train, X_text_test, X_numeric_train, X_numeric_test, y_train, y_test = train_test_split(
    X_text, X_numeric, y, test_size=0.2, random_state=42
)

# TF-IDF vectorization for text data (vocabulary learned from the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_text_train_tfidf = tfidf_vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = tfidf_vectorizer.transform(X_text_test)

# Combine text TF-IDF vectors with numeric features
X_train_combined = pd.concat([
    X_numeric_train.reset_index(drop=True),
    pd.DataFrame(X_text_train_tfidf.toarray())
], axis=1)

X_test_combined = pd.concat([
    X_numeric_test.reset_index(drop=True),
    pd.DataFrame(X_text_test_tfidf.toarray())
], axis=1)

# Standardize numeric features. Columns are replaced by label rather than written
# through .iloc, so float results are never forced into integer-typed columns.
scaler = StandardScaler()
X_train_combined[numeric_columns] = scaler.fit_transform(X_train_combined[numeric_columns])
X_test_combined[numeric_columns] = scaler.transform(X_test_combined[numeric_columns])

# Apply Min-Max scaling on top of the standardized numeric features
min_max_scaler = MinMaxScaler()
X_train_combined[numeric_columns] = min_max_scaler.fit_transform(X_train_combined[numeric_columns])
X_test_combined[numeric_columns] = min_max_scaler.transform(X_test_combined[numeric_columns])

# Convert feature names to strings
X_train_combined.columns = X_train_combined.columns.astype(str)
X_test_combined.columns = X_test_combined.columns.astype(str)

# Define Random Forest model
rf_model = RandomForestClassifier(n_estimators=50, random_state=42)

# Stratified folds keep the fraud ratio comparable across the validation parts
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold labels and predictions, pooled over all folds
all_y_true = []
all_y_pred = []

# Define custom threshold
custom_threshold = 0.35  # Adjust this threshold based on your requirements

# Cross-validate on the ORIGINAL training rows. SMOTE is applied inside each fold,
# to the training part only, so no synthetic sample (interpolated from rows that may
# sit in the validation part) is ever scored during validation.
for fold, (train_index, val_index) in enumerate(kf.split(X_train_combined, y_train), 1):
    X_train, X_val = X_train_combined.iloc[train_index], X_train_combined.iloc[val_index]
    y_train_batch, y_val_batch = y_train.iloc[train_index], y_train.iloc[val_index]

    # Balance only the training part of this fold
    X_train, y_train_batch = SMOTE(random_state=42).fit_resample(X_train, y_train_batch)

    # Train the Random Forest model
    rf_model.fit(X_train, y_train_batch)

    # Predict probabilities
    y_pred_prob_val = rf_model.predict_proba(X_val)[:, 1]

    # Apply custom threshold
    y_pred_val = (y_pred_prob_val > custom_threshold).astype(int)

    # Store true and predicted labels for each fold
    all_y_true.extend(y_val_batch)
    all_y_pred.extend(y_pred_val)

    # Print metrics for each fold
    print(f"Fold {fold} - Accuracy: {accuracy_score(y_val_batch, y_pred_val)}, Precision: {precision_score(y_val_batch, y_pred_val)}, Recall: {recall_score(y_val_batch, y_pred_val)}, F1: {f1_score(y_val_batch, y_pred_val)}")

# Metrics over the pooled out-of-fold predictions
print(f"\nPooled CV - Accuracy: {accuracy_score(all_y_true, all_y_pred)}, Precision: {precision_score(all_y_true, all_y_pred)}, Recall: {recall_score(all_y_true, all_y_pred)}, F1: {f1_score(all_y_true, all_y_pred)}")

# Final model: balance the whole training split with SMOTE and refit on all of it,
# instead of keeping whichever model the last fold happened to produce.
smote = SMOTE(random_state=42)
X_train_combined, y_train = smote.fit_resample(X_train_combined, y_train)
rf_model.fit(X_train_combined, y_train)

# Evaluate on the test set
# Predict probabilities
y_pred_prob_test = rf_model.predict_proba(X_test_combined)[:, 1]

# Apply custom threshold
y_pred_test = (y_pred_prob_test > custom_threshold).astype(int)

# Print metrics for the test set
print("\nTest Set Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test)}")
print(f"Precision: {precision_score(y_test, y_pred_test)}")
print(f"Recall: {recall_score(y_test, y_pred_test)}")
print(f"F1: {f1_score(y_test, y_pred_test)}")

# Confusion matrix for the test set
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
print("\nTest Set Confusion Matrix:")
print(conf_matrix_test)

# Classification report for the test set
classification_report_test = classification_report(y_test, y_pred_test, target_names=["Real", "Fake"])
print("\nTest Set Classification Report:")
print(classification_report_test)


In [None]:
# Predict probabilities for positive class
y_proba = rf_model.predict_proba(X_test_combined)[:, 1]

# Compute the ROC curve once and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# AUC computed directly from the scores
auc_score = roc_auc_score(y_test, y_proba)
print("AUC Score:", auc_score)

# Plot ROC curve
plt.figure(figsize=(8, 8))
# label must be one string; a tuple would be shown verbatim (parentheses and quotes) in the legend
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Check class distribution in training set.
# y_train is the target returned by smote.fit_resample in the most recently run
# model-building cell, so these counts describe the oversampled training data,
# not the raw split.
print("Training Set Class Distribution:")
print(y_train.value_counts())

# Check class distribution in testing set (the test split is never resampled)
print("\nTesting Set Class Distribution:")
print(y_test.value_counts())


In [None]:
def report_test_metrics(model_name, y_true, y_prob, threshold):
    """Threshold ``y_prob``, print the test-set report for one model and return its parts.

    Parameters
    ----------
    model_name : str
        Prefix used in the printed section headings.
    y_true : array-like
        True labels.
    y_prob : numpy.ndarray
        Predicted probability of the positive class.
    threshold : float
        A sample is labelled positive when its probability is strictly greater.

    Returns
    -------
    tuple
        ``(y_pred, confusion_matrix, classification_report_text)``.
    """
    y_pred = (y_prob > threshold).astype(int)

    print(f"\n{model_name} Test Set Metrics:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(f"Precision: {precision_score(y_true, y_pred)}")
    print(f"Recall: {recall_score(y_true, y_pred)}")
    print(f"F1: {f1_score(y_true, y_pred)}")

    matrix = confusion_matrix(y_true, y_pred)
    print(f"\n{model_name} Test Set Confusion Matrix:")
    print(matrix)

    report = classification_report(y_true, y_pred, target_names=["Real", "Fake"])
    print(f"\n{model_name} Test Set Classification Report:")
    print(report)

    return y_pred, matrix, report


# Both models are scored on the same test matrix with a shared, lower threshold
custom_threshold = 0.3

# XGBoost
y_pred_prob_test_xgb = xgb.predict_proba(X_test_combined)[:, 1]
y_pred_test_xgb, conf_matrix_test_xgb, classification_report_test_xgb = report_test_metrics(
    "XGBoost", y_test, y_pred_prob_test_xgb, custom_threshold
)

# Random Forest
y_pred_prob_test_rf = rf_model.predict_proba(X_test_combined)[:, 1]
y_pred_test_rf, conf_matrix_test_rf, classification_report_test_rf = report_test_metrics(
    "Random Forest", y_test, y_pred_prob_test_rf, custom_threshold
)


In [None]:
# Ensemble model using a soft-voting VotingClassifier over the XGBoost and Random Forest estimators
from sklearn.ensemble import VotingClassifier

# Create a VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('xgb', xgb),
    ('rf', rf_model)
], voting='soft')  # Use 'soft' for probability-based voting

# Train the ensemble model on the training data.
# X_train_combined / y_train are the SMOTE-resampled training data left behind by the
# most recently run model-building cell. VotingClassifier.fit trains clones of the
# listed estimators, so the xgb and rf_model objects themselves are not refitted here.
ensemble_model.fit(X_train_combined, y_train)

# Predict probabilities for the test set
y_pred_prob_ensemble = ensemble_model.predict_proba(X_test_combined)[:, 1]

# Apply custom threshold (whatever custom_threshold was last set to in an earlier cell)
y_pred_ensemble = (y_pred_prob_ensemble > custom_threshold).astype(int)

# Print metrics for the ensemble model on the test set
print("\nEnsemble Model Test Set Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_ensemble)}")
print(f"Precision: {precision_score(y_test, y_pred_ensemble)}")
print(f"Recall: {recall_score(y_test, y_pred_ensemble)}")
print(f"F1: {f1_score(y_test, y_pred_ensemble)}")

# Confusion matrix for the ensemble model
conf_matrix_ensemble = confusion_matrix(y_test, y_pred_ensemble)
print("\nEnsemble Model Test Set Confusion Matrix:")
print(conf_matrix_ensemble)

# Classification report for the ensemble model
classification_report_ensemble = classification_report(y_test, y_pred_ensemble, target_names=["Real", "Fake"])
print("\nEnsemble Model Test Set Classification Report:")
print(classification_report_ensemble)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Uses the ensemble_model fitted in the previous cell.
# Class-1 scores for the test split (column 1 of predict_proba).
y_pred_prob_ensemble = ensemble_model.predict_proba(X_test_combined)[:, 1]

# Area under the ROC curve
roc_auc = roc_auc_score(y_test, y_pred_prob_ensemble)
print(f"\nEnsemble Model Test Set ROC AUC: {roc_auc}")

# ROC curve with the chance diagonal for reference
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_ensemble)
roc_label = f'ROC curve (area = {roc_auc:.2f})'

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


# Labeling the Unlabeled Data

In [None]:
df_unlabeled = pd.read_csv("/kaggle/input/model-data/preprocessed_dataset_13-12-2023.csv")
print(f"Raw Data Shape: {df_unlabeled.shape}")
df_unlabeled.head()

In [None]:
df_unlabeled.info()

In [None]:
df_unlabeled.describe(include = "all")

In [None]:
df_unlabeled.describe(exclude="object")

In [None]:
df_unlabeled.describe(exclude=np.number)

There are many missing feature values, so we will perform further checks.

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;">Data Checks to perform </p>
 - Check Missing values
 - Check Duplicates
 - Check data type
 - Check the number of unique values of each column
 - Check statistics of data set
 - Check various categories present in the different categorical column

 #### Missing values

In [None]:
df_unlabeled.isnull().sum()

In [None]:
## Percentage of NaN values for every feature that has at least one NaN
null_mask = df_unlabeled.isnull()
feature_na = [column for column in df_unlabeled.columns if null_mask[column].any()]

for feature in feature_na:
    missing_pct = np.round(null_mask[feature].mean() * 100, 4)
    print(feature, missing_pct, " % missing")

In [None]:
# Columns whose share of null values exceeds 10%, highest share first
columns_with_null_values_total = df_unlabeled.isnull().sum().div(len(df_unlabeled)).mul(100)
columns_with_null_values_percentage = (
    columns_with_null_values_total
    .loc[columns_with_null_values_total > 10]
    .sort_values(ascending=False)
)

columns_with_null_values_percentage

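<b>Observation:</b> There are many missing feature values. There are 8 features with ~85% null values and 2 features with ~65% null values.

Features with more than 50% null values can be removed from the dataset. (Note: the cell below drops every column selected above, i.e. every column with more than 10% null values.)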

In [None]:
# Remove every column selected above as having too many null values
high_null_columns = columns_with_null_values_percentage.index.tolist()
df_unlabeled = df_unlabeled.drop(columns=high_null_columns)

df_unlabeled.head()

In [None]:
df_unlabeled.columns

In [None]:
df_unlabeled.drop(columns=['site','company_name'],inplace = True)

In [None]:
df_unlabeled.head()

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Duplicate Values Check </p>

In [None]:
df_unlabeled.duplicated().sum()

In [None]:
print("***** Duplicate check Feature wise *****\n")

# Check Job description duplications
print("job_description: ", df_unlabeled["job_description"].duplicated().sum())

There are duplicated records; this has to be taken care of during the preprocessing stage.

In [None]:
# Sort by description text (ascending) so repeated descriptions end up adjacent
sorted_data_unlabeled = df_unlabeled.sort_values(by='job_description')

# Show the rows whose description repeats an earlier row in the sorted order
repeated_description = sorted_data_unlabeled["job_description"].duplicated()
sorted_data_unlabeled[repeated_description]

In [None]:
df_unlabeled.shape

#### Check the Feature Uniqueness

In [None]:
# For each column: how many distinct values it holds, followed by the values
for feature in df_unlabeled.columns:
    unique_values = df_unlabeled[feature].unique()
    print(f'{feature}: Unique Count: {len(unique_values)}\n {unique_values}\n')

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Data preprocessing </p>

<b>Handle Missing Values</b>

In [None]:
df_unlabeled.isnull().sum()

In [None]:
# Fill NaN values with the placeholder string "missing".
# The result is assigned back instead of calling
# `df_unlabeled[feature].fillna(..., inplace=True)`: that form mutates a
# temporary column object, is deprecated in pandas 2.x and has no effect on
# the frame once Copy-on-Write is enabled. Columns without NaN are unchanged.
df_unlabeled = df_unlabeled.fillna("missing")

# Every count should now be zero
df_unlabeled.isnull().sum()

<b>Deduplication of data</b>

In [None]:
df_unlabeled.duplicated().sum()

In [None]:
# Keep the first occurrence of every fully duplicated row
df_unlabeled = df_unlabeled.drop_duplicates(keep="first")
print("Shape: ", df_unlabeled.shape)

# Should now report zero duplicated rows
df_unlabeled.duplicated().sum()

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Exploratory Data Analysis </p>

#### Description: Analyze the count of Characters, Words, and Sentences in each segment

In [None]:
# Length features derived from the raw job description text
description_text = df_unlabeled['job_description']

# Number of characters
df_unlabeled['desc_num_char'] = description_text.map(len)

# Number of tokens produced by NLTK's word tokenizer
df_unlabeled['desc_num_words'] = description_text.map(lambda text: len(nltk.word_tokenize(text)))

# Number of sentences produced by NLTK's sentence tokenizer
df_unlabeled['desc_num_sent'] = description_text.map(lambda text: len(nltk.sent_tokenize(text)))

df_unlabeled.head()

In [None]:
df_unlabeled.describe(include=np.number)

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#007fd4; font-size:120%; text-align:left;padding: 0px;"> Text preprocessing </p>

Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally, apply Snowball stemming to the word (it was observed to perform better than Porter stemming)<br>

After which we collect the words used to describe positive and negative type

In [None]:
# Custom stop-word set, based on https://gist.github.com/sebleier/554280
#  - 'no', 'nor' and 'not' are deliberately left out of the set.
#  - 'br' is added: after the earlier cleaning steps "<br /><br />" is reduced
#    to "br br" (the "<br/>" form would already have been removed in step 1).
# Note: the preprocess() function in this section filters with NLTK's English
# stop-word list; its line that used this set is commented out.
stopWords = set((
    "br the i me my myself we our ours ourselves you you're you've "
    "you'll you'd your yours yourself yourselves he him his himself "
    "she she's her hers herself it it's its itself they them their "
    "theirs themselves what which who whom this that that'll these those "
    "am is are was were be been being have has had having do does "
    "did doing a an the and but if or because as until while of "
    "at by for with about against between into through during before after "
    "above below to from up down in out on off over under again further "
    "then once here there when where why how all any both each few more "
    "most other some such only own same so than too very "
    "s t can will just don don't should should've now d ll m o re "
    "ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn "
    "hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn "
    "mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't "
    "won won't wouldn wouldn't"
).split())

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The two irregular forms ("won't" -> "will not", "can't" -> "can not") are
    handled first and matched case-insensitively, so a sentence-initial
    "Won't" / "Can't" becomes "Will not" / "Can not" instead of falling through
    to the generic ``n't`` rule and producing "Wo not" / "Ca not". The generic
    suffix rules are then applied in a fixed order.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions written with a straight apostrophe.

    Returns
    -------
    str
        The text with the recognised contractions expanded.
    """
    def _keep_initial_case(expansion):
        # Replacement callback: capitalise the expansion when the matched
        # contraction itself started with an upper-case letter.
        def _replace(match):
            return expansion.capitalize() if match.group(0)[0].isupper() else expansion
        return _replace

    # specific (irregular) forms -- must run before the generic "n't" rule
    phrase = re.sub(r"won't", _keep_initial_case("will not"), phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can't", _keep_initial_case("can not"), phrase, flags=re.IGNORECASE)

    # general suffix rules; "n't" has to precede the bare "'t" rule
    suffix_rules = (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in suffix_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

def preprocess(corpus):
    """Clean every document in ``corpus`` and return the cleaned strings.

    Steps applied to each document, in order:
      1. remove URLs (anything starting with ``http`` up to the next whitespace);
      2. strip HTML markup with BeautifulSoup (``lxml`` parser);
      3. expand contractions via ``decontracted``;
      4. drop every whitespace-delimited chunk that contains a digit;
      5. replace each run of non-letter characters with a single space;
      6. tokenize with NLTK, lower-case, and drop NLTK English stop words.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents. Each element must be a string.

    Returns
    -------
    list of str
        One space-joined string of kept tokens per input document, in order.
    """
    # Build the stop-word lookup once. Evaluating stopwords.words('english')
    # inside the token filter repeated the call for every single token and
    # tested membership against a list.
    english_stopwords = set(stopwords.words('english'))

    preprocessed = []
    # tqdm wraps the iterable; it must be imported elsewhere in the notebook.
    for sentance in tqdm(corpus):
        sentance = re.sub(r"http\S+", "", sentance)
        sentance = BeautifulSoup(sentance, 'lxml').get_text()
        sentance = decontracted(sentance)
        # Raw strings keep \S and \d as regex escapes rather than (invalid)
        # Python string escapes.
        sentance = re.sub(r"\S*\d\S*", "", sentance).strip()
        sentance = re.sub(r'[^A-Za-z]+', ' ', sentance)
        lowered_tokens = (token.lower() for token in nltk.word_tokenize(sentance))
        preprocessed.append(' '.join(token for token in lowered_tokens if token not in english_stopwords))

    return preprocessed

In [None]:
df_unlabeled.sample(5)

In [None]:
df_unlabeled.dtypes

In [None]:
%%time

# Clean every job description and store the result next to the raw text
raw_descriptions = df_unlabeled['job_description'].values
df_unlabeled["desc_transformed"] = preprocess(raw_descriptions)

# Preview the first five cleaned descriptions
df_unlabeled["desc_transformed"].iloc[:5]

In [None]:
df_unlabeled.head()

In [None]:
%%time
#OPTIMIZED
def calculate_sentiment_scores(row):
    """Count lexicon hits in one record's cleaned description.

    ``row`` is a mapping with a 'desc_transformed' string. The text is
    tokenized with ``word_tokenize`` and each token is checked against the
    module-level ``positive_words`` and ``negative_words`` collections.

    Returns a ``(positive_score, negative_score)`` tuple holding the number of
    tokens found in each collection (repeated tokens are counted every time).
    """
    description_tokens = word_tokenize(row['desc_transformed'])

    def _lexicon_hits(lexicon):
        # np.isin flags each token present in the lexicon; summing the boolean
        # flags yields the number of matching tokens.
        return np.isin(description_tokens, list(lexicon)).sum()

    return _lexicon_hits(positive_words), _lexicon_hits(negative_words)

# One {'desc_transformed': ...} dict per row, so each record can be handed to a worker
data_for_parallel_unlabeled = df_unlabeled[['desc_transformed']].to_dict(orient='records')

# Score the records in a process pool; executor.map returns results in input order
with ProcessPoolExecutor() as executor:
    score_iterator = executor.map(calculate_sentiment_scores, data_for_parallel_unlabeled)
    sentiment_scores_unlabeled = list(score_iterator)

# The (n_rows, 2) array fills the positive / negative score columns
score_columns = ['positive_score', 'negative_score']
df_unlabeled[score_columns] = np.array(sentiment_scores_unlabeled)

# Show the cleaned text together with the new score columns
print(df_unlabeled[['desc_transformed'] + score_columns])

In [None]:
df_unlabeled.head()

In [None]:
df_unlabeled_score = df_unlabeled.copy()

In [None]:
df_unlabeled_score.to_csv("Processed_data_score.csv", index=False)

In [None]:
df_unlabeled = pd.read_csv("/kaggle/input/model-data/Processed_data_score.csv")

In [None]:
df_unlabeled.info()

In [None]:
# Fill missing values in 'desc_transformed' with an empty string.
# Assign the result back rather than using `inplace=True` on the selected
# column: the chained in-place form is deprecated in pandas 2.x and leaves the
# frame untouched when Copy-on-Write is enabled.
df_unlabeled['desc_transformed'] = df_unlabeled['desc_transformed'].fillna('')

In [None]:
# Pseudo-label the scraped postings in df_unlabeled with the ensemble model.
# Requires the columns 'desc_transformed', 'positive_score', 'negative_score',
# 'desc_num_char', 'desc_num_words' and 'desc_num_sent', plus these objects
# created outside this cell: tfidf_vectorizer, scaler, min_max_scaler,
# numeric_columns and ensemble_model (all only applied here, none is refitted).
# NOTE(review): fillna('') is applied to every column, so a NaN in one of the
# numeric feature columns would become '' -- confirm those columns are complete.
df_unlabeled = df_unlabeled.fillna('')

# Extract features from unlabeled data
X_text_unlabeled = df_unlabeled['desc_transformed']
X_numeric_unlabeled = df_unlabeled[['positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent']]

# TF-IDF vectorization for text data (transform only, using the existing vocabulary)
X_text_unlabeled_tfidf = tfidf_vectorizer.transform(X_text_unlabeled)

# Combine features: the five numeric columns come first (positions 0-4),
# followed by the TF-IDF matrix converted to a dense array with integer column labels
X_unlabeled_combined = pd.concat([
    X_numeric_unlabeled.reset_index(drop=True),
    pd.DataFrame(X_text_unlabeled_tfidf.toarray())
], axis=1)

# Apply `scaler` to the first five columns, i.e. the numeric features placed first above
X_unlabeled_combined.iloc[:, :5] = scaler.transform(X_unlabeled_combined.iloc[:, :5])

# Apply `min_max_scaler` on top of the previous step
# (presumably numeric_columns names the same five numeric features -- TODO confirm)
X_unlabeled_combined[numeric_columns] = min_max_scaler.transform(X_unlabeled_combined[numeric_columns])

# Convert feature names to strings (the TF-IDF columns carry integer labels)
X_unlabeled_combined.columns = X_unlabeled_combined.columns.astype(str)

# Class-1 probability (column 1 of predict_proba) for every unlabeled row
y_pred_prob_unlabeled = ensemble_model.predict_proba(X_unlabeled_combined)[:, 1]

# Apply a threshold of 0.5 here; note this is a separate variable from the
# `custom_threshold` used when the ensemble was evaluated on the test split
custom_threshold_unlabeled = 0.5  # Adjust this threshold based on your requirements
y_pred_unlabeled = (y_pred_prob_unlabeled > custom_threshold_unlabeled).astype(int)

# Store the predicted (not ground-truth) labels on the frame
df_unlabeled['predicted_fraudulent'] = y_pred_unlabeled

# Display the features together with the predicted label
print(df_unlabeled[['desc_transformed', 'positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent', 'predicted_fraudulent']])


In [None]:
df_unlabeled.head()

In [None]:
df_unlabeled["predicted_fraudulent"].value_counts()

In [None]:
df_unlabeled.to_csv("Processed_data_labelled.csv", index=False)

In [None]:
df_unlabeled.info()

In [None]:
df_unlabeled=df_unlabeled.rename(columns = {'predicted_fraudulent':'fraudulent'})

In [None]:
df_unlabeled.info()

In [None]:
df.info()

# Combining both Existing and newly scraped labeled data

In [None]:
df_combined = pd.concat([df, df_unlabeled], ignore_index=True)

In [None]:
df_combined.head()

In [None]:
df_combined.info()

In [None]:
df_combined.to_csv("combined_data_labelled.csv", index=False)

In [None]:
df_combined = pd.read_csv("/kaggle/working/combined_data_labelled.csv")

In [None]:
df_combined.head()

# Retraining the model on combined data

In [None]:
custom_threshold

In [None]:
df_combined.info()

In [None]:
df_combined.fillna('', inplace=True)

In [None]:
# Retrain the ensemble on the original labeled data plus the pseudo-labeled data.
# Requires 'desc_transformed', 'positive_score', 'negative_score', 'desc_num_char',
# 'desc_num_words', 'desc_num_sent' and 'fraudulent' in both frames.
# df_combined is rebuilt here from `df` and `df_unlabeled`, replacing the frame
# that an earlier cell read back from combined_data_labelled.csv.
# NOTE(review): fillna('') touches every column, including the numeric features
# and 'fraudulent' -- confirm none of them contains NaN after the concat.
df_combined = pd.concat([df, df_unlabeled], ignore_index=True).fillna('')

# Extract features and labels from combined data.
# For the rows that came from df_unlabeled, 'fraudulent' holds the ensemble's own
# earlier predictions (renamed from 'predicted_fraudulent'), not ground truth.
X_text_combined_all = df_combined['desc_transformed']
X_numeric_combined_all = df_combined[['positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent']]
y_combined_all = df_combined['fraudulent']

# TF-IDF vectorization for text data (transform only; the vectorizer is not refitted)
X_text_combined_all_tfidf = tfidf_vectorizer.transform(X_text_combined_all)

# Combine features: the five numeric columns first (positions 0-4), then the dense TF-IDF matrix
X_combined_all = pd.concat([
    X_numeric_combined_all.reset_index(drop=True),
    pd.DataFrame(X_text_combined_all_tfidf.toarray())
], axis=1)

# Apply the existing `scaler` to the first five columns (the numeric features)
X_combined_all.iloc[:, :5] = scaler.transform(X_combined_all.iloc[:, :5])

# Apply the existing `min_max_scaler` on top
# (presumably numeric_columns names the same five numeric features -- TODO confirm)
X_combined_all[numeric_columns] = min_max_scaler.transform(X_combined_all[numeric_columns])

# Convert feature names to strings
X_combined_all.columns = X_combined_all.columns.astype(str)

# Apply SMOTE to balance the combined data (SMOTE is imported elsewhere in the notebook)
smote_combined = SMOTE(random_state=42)
X_combined_resampled, y_combined_resampled = smote_combined.fit_resample(X_combined_all, y_combined_all)

# Refit the same ensemble_model object on the resampled data; this replaces the earlier fit
ensemble_model.fit(X_combined_resampled, y_combined_resampled)

# Metrics below are computed on X_combined_all, the data the resampled training
# set was derived from -- they are in-sample figures, not a held-out estimate.
y_pred_prob_combined_all = ensemble_model.predict_proba(X_combined_all)[:, 1]
y_pred_combined_all = (y_pred_prob_combined_all > custom_threshold).astype(int)

print("\nRetrained Ensemble Model Metrics on Combined Data:")
print(f"Accuracy: {accuracy_score(y_combined_all, y_pred_combined_all)}")
print(f"Precision: {precision_score(y_combined_all, y_pred_combined_all)}")
print(f"Recall: {recall_score(y_combined_all, y_pred_combined_all)}")
print(f"F1: {f1_score(y_combined_all, y_pred_combined_all)}")


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Confusion matrix for the thresholded predictions on the combined data
conf_matrix = confusion_matrix(y_combined_all, y_pred_combined_all)
print("\nConfusion Matrix:", conf_matrix, sep="\n")

# Per-class precision / recall / F1
combined_report = classification_report(y_combined_all, y_pred_combined_all)
print("\nClassification Report:", combined_report, sep="\n")

# Area under the ROC curve, computed from the class-1 probabilities
roc_auc = roc_auc_score(y_combined_all, y_pred_prob_combined_all)
print(f"\nROC AUC: {roc_auc}")

# ROC curve with the chance diagonal labelled 'Random'
fpr, tpr, _ = roc_curve(y_combined_all, y_pred_prob_combined_all)
curve_label = f'ROC Curve (AUC = {roc_auc:.2f})'

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=curve_label)
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()


In [None]:
df_combined.info()

In [None]:
df_combined['fraudulent'].value_counts()

# Testing on unseen test data

In [None]:
df_test_data = pd.read_csv('/kaggle/working/test_data.csv')

In [None]:
df_test_data.info()

In [None]:
# Class balance of the hold-out file loaded into df_test_data above
# (the previous `df_test` name is not created anywhere in this section).
df_test_data['fraudulent'].value_counts()

In [None]:
from sklearn.metrics import precision_recall_curve

# Evaluate the retrained ensemble on the hold-out file loaded into df_test_data.
# Requires 'desc_transformed', 'positive_score', 'negative_score', 'desc_num_char',
# 'desc_num_words', 'desc_num_sent' and 'fraudulent' columns, plus tfidf_vectorizer,
# scaler, min_max_scaler, numeric_columns and ensemble_model from earlier cells
# (they are only applied here, never refitted).
# NOTE(review): fillna('') touches every column; confirm the numeric features and
# 'fraudulent' contain no NaN, otherwise they would receive ''.
df_test_data = df_test_data.fillna('')

# Extract features and labels from the test data
X_text_test_data = df_test_data['desc_transformed']
X_numeric_test_data = df_test_data[['positive_score', 'negative_score', 'desc_num_char', 'desc_num_words', 'desc_num_sent']]
y_test_data = df_test_data['fraudulent']

# TF-IDF vectorization for text data
X_text_test_data_tfidf = tfidf_vectorizer.transform(X_text_test_data)

# Combine features: numeric columns first (positions 0-4), then the dense TF-IDF matrix
X_test_data = pd.concat([
    X_numeric_test_data.reset_index(drop=True),
    pd.DataFrame(X_text_test_data_tfidf.toarray())
], axis=1)

# Apply `scaler` to the first five columns (the numeric features)
X_test_data.iloc[:, :5] = scaler.transform(X_test_data.iloc[:, :5])

# Apply `min_max_scaler` on top
X_test_data[numeric_columns] = min_max_scaler.transform(X_test_data[numeric_columns])

# Convert feature names to strings
X_test_data.columns = X_test_data.columns.astype(str)

# Class-1 probabilities from the retrained ensemble
y_pred_prob_test_data = ensemble_model.predict_proba(X_test_data)[:, 1]

# Precision-recall curve over all candidate thresholds
precision, recall, thresholds = precision_recall_curve(y_test_data, y_pred_prob_test_data)

# precision / recall carry one more entry than thresholds (the final point has no
# threshold attached), so drop it to keep the F1 array aligned with `thresholds`
# and to make an out-of-range index impossible.
precision_at_threshold = precision[:-1]
recall_at_threshold = recall[:-1]
precision_plus_recall = precision_at_threshold + recall_at_threshold

# F1 is defined as 0 where precision + recall == 0; a plain division would give
# NaN there, and np.argmax would then select the NaN position.
f1_scores = np.divide(
    2 * precision_at_threshold * recall_at_threshold,
    precision_plus_recall,
    out=np.zeros_like(precision_plus_recall, dtype=float),
    where=precision_plus_recall > 0,
)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# precision_recall_curve evaluates predictions as `score >= threshold`, so use the
# same comparison; with `>` the rows scoring exactly at the threshold would be
# left out and the selected F1 would not be reproduced.
# NOTE(review): the threshold is chosen on the same data it is evaluated on below,
# so these metrics are optimistic.
y_pred_optimal_threshold = (y_pred_prob_test_data >= optimal_threshold).astype(int)

# Print metrics for the test data using the optimal threshold
print("\nTest Data Metrics with Optimal Threshold:")
print(f"Accuracy: {accuracy_score(y_test_data, y_pred_optimal_threshold)}")
print(f"Precision: {precision_score(y_test_data, y_pred_optimal_threshold)}")
print(f"Recall: {recall_score(y_test_data, y_pred_optimal_threshold)}")
print(f"F1: {f1_score(y_test_data, y_pred_optimal_threshold)}")

# Confusion matrix for the test data with optimal threshold
conf_matrix_optimal_threshold = confusion_matrix(y_test_data, y_pred_optimal_threshold)
print("\nTest Data Confusion Matrix with Optimal Threshold:")
print(conf_matrix_optimal_threshold)

# Classification report for the test data with optimal threshold
classification_report_optimal_threshold = classification_report(y_test_data, y_pred_optimal_threshold, target_names=["Real", "Fake"])
print("\nTest Data Classification Report with Optimal Threshold:")
print(classification_report_optimal_threshold)



# Saving the Model in Pickle format

In [None]:
# Import pickle Package

import pickle

In [None]:
import pickle

# Bundle everything needed to reproduce inference into a single dictionary:
# the fitted ensemble, the TF-IDF vectorizer, both scalers, the numeric column
# names, and the threshold selected on the test data (the `optimal_threshold`
# variable, stored under the key 'optimal_threshold_new_data').
model_data = dict(
    model=ensemble_model,
    tfidf_vectorizer=tfidf_vectorizer,
    scaler=scaler,
    min_max_scaler=min_max_scaler,
    numeric_columns=numeric_columns,
    optimal_threshold_new_data=optimal_threshold,
)

# Output file, written to the current working directory
Pkl_Filename = "Fake_Job_Postings_Detection_with_preprocessing.pkl"

# Serialise the bundle in binary mode
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(model_data, file)
