In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import load_model
 

warnings.filterwarnings("ignore")

try:
    # Read the CSV file
    df = pd.read_csv("/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv")

    # Drop duplicate values
    df.drop_duplicates(inplace=True)

    # Drop columns with null values more than 60%
    missing_values = df.columns[((df.isnull().sum() / len(df.index)) * 100) > 60]
    df.drop(missing_values, axis=1, inplace=True)
    df.drop("job_id", axis=1, inplace=True)

    # Separate input and output variables
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Fill null values in categorical columns with mode
    x_cat = x.select_dtypes(object)
    x_cat = x_cat.fillna(x_cat.mode().iloc[0])

    # Apply one-hot encoding to categorical features
    x_cat = pd.get_dummies(x_cat)

    # Apply standard scaler on numeric data
    x_num = x.select_dtypes(['int64', 'float64'])
    ss = StandardScaler()
    x_num = pd.DataFrame(ss.fit_transform(x_num))

    # Concatenate categorical and numeric data
    x = pd.concat([x_num, x_cat], axis=1)

    # Encode output variable
    le = LabelEncoder()
    y = pd.DataFrame(le.fit_transform(y))

    # Remove rows with missing values
    xy = pd.concat([x, y], axis=1).dropna()
    x = xy.iloc[:, :-1]
    y = xy.iloc[:, -1]

    # Reindex x and y
    x.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)

    # Split train and test data
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

    # Create and compile the model
    ann = Sequential()
    ann.add(Dense(16, activation="relu", input_shape=(x.shape[1],)))
    ann.add(Dropout(rate=0.2))
    ann.add(Dense(8, activation="relu"))
    ann.add(Dropout(rate=0.2))
    ann.add(Dense(units=1, activation="sigmoid"))
    ann.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # Train the model
    ann.fit(x_train, y_train, epochs=30, batch_size=20)

    # Generate predictions
    y_pred = ann.predict(x_test)
    y_pred = np.round(y_pred).astype(int)

    # Print classification report
    print(classification_report(y_test, y_pred))

    # Calculate accuracy
    accuracy = (y_test.values == y_pred).mean() * 100
    print(f"Accuracy: {accuracy}")

    # Save the model
    ann.save("fakejobposting_model")
    storemodel = load_model("fakejobposting_model")

except KeyboardInterrupt:
    print("Program terminated by user.")

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5143
           1       0.91      0.70      0.79       221

    accuracy                           0.98      5364
   macro avg       0.95      0.85      0.89      5364
weighted avg       0.98      0.98      0.98      5364

Accuracy: 92.97181586192592
