<a href="https://www.kaggle.com/code/rubinr12/baselinelstm-ipynb?scriptVersionId=192058999" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, SimpleRNN, GRU, BatchNormalization, Conv1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop, Adam, Nadam
from tensorflow.keras.metrics import Precision, Recall
from tensorflow import keras

In [3]:
def data_pipeline(file_path, window_size=20):
    
    df = pd.read_excel(file_path)
    # Drop the 'job_id' column
    df = df.drop(columns=['job_id'])
    
    # Create sliding windows
    def create_sliding_windows(data, window_size):
        X = []
        y = []
        num_rows = len(data)
        
        if num_rows <= window_size:
            raise ValueError("Data length must be greater than the window size.")
        
        for i in range(len(data) - window_size):
            X.append(data[i:i + window_size, :-1])  
            y.append(data[i + window_size, -1])     
    
        return np.array(X), np.array(y)

    data = df.values
    X, y = create_sliding_windows(data, 20)
    
    # Split sliding windows into training, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

In [4]:
file_path = '/kaggle/input/dataset/train.xlsx'
X_train, X_val, X_test, y_train, y_val, y_test = data_pipeline(file_path, window_size=20)

In [9]:
X_train.shape[2]

11

In [12]:
# metrics_new = [
#     keras.metrics.Precision(name="precision"),
#     keras.metrics.Recall(name="recall"),
# ]

# model1 = Sequential()
# model1.add(Input(shape=(window_size, X_train.shape[2])))
# model1.add(LSTM(units = 128, activation='tanh',return_sequences= True))
# model1.add(Dropout(0.2))
# model1.add(LSTM(units = 64, activation='tanh',return_sequences= True))
# model1.add(Dropout(0.2))
# model1.add(LSTM(units = 64, activation='tanh'))
# model1.add(Dense(1, activation='sigmoid'))
# model1.compile(optimizer=optimizer, loss='BinaryCrossentropy', metrics=metrics_new)
# model1.summary()

window_size = 20
learning_rate = 0.001
optimizer = Adam(learning_rate=learning_rate)

model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(window_size, X_train.shape[2])))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=learning_rate), 
              loss='binary_crossentropy', 
              metrics=[Precision(), Recall()])

In [13]:
model.summary()

In [15]:
# # Train the model
# history = model.fit(X_train, y_train, 
#                     epochs=50, 
#                     batch_size=20, 
#                     validation_data=(X_val, y_val), 
#                     verbose=1)

# # Evaluate the model on the test set
# test_loss, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=1)
# print(f"Test Loss: {test_loss}")
# print(f"Test Precision: {test_precision}")
# print(f"Test Recall: {test_recall}")

Epoch 1/50
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.6928 - precision: 0.4846 - recall: 0.1754 - val_loss: 0.6925 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/50
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.6929 - precision: 0.4775 - recall: 0.0475 - val_loss: 0.6924 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/50
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.6929 - precision: 0.5082 - recall: 0.0437 - val_loss: 0.6924 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/50
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - loss: 0.6926 - precision: 0.4530 - recall: 0.0160 - val_loss: 0.6924 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 5/50
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - loss: 0.6928 - precision: 0.4733 - recall: 0.0409 - val_loss: 0.6924 

KeyboardInterrupt: 

In [21]:
# Evaluate the model
test_loss, test_accuracy = model1.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.4350 - precision: 0.7777 - recall: 0.7716


ValueError: too many values to unpack (expected 2)

In [None]:
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
#preparing data for the model
#Importing the new data to test the model
df_new = pd.read_csv('job2.csv')

df_new = df_new.drop(columns=['job_id','Latitude', 'Longitude'])


def create_sliding_windows_1(data, window_size):
    X = []
    y = []
    data1=data.values
    for i in range(len(data) - window_size):
        X.append(data1[i:i + window_size, :-1])  
        y.append(data1[i + window_size, -1])     
    
    return np.array(X), np.array(y)


X_new, y_new = create_sliding_windows_1(df_new, 20)


In [17]:
predictions_new = model1.predict(X_new)

check_value = 0.5
predictions_01 = (predictions_new > check_value).astype(int)
y_new_binary = (y_new > check_value).astype(int)


from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

precision = precision_score(y_new, predictions_01, average='macro')
recall = recall_score(y_new, predictions_01, average='macro')
f1 = f1_score(y_new, predictions_01, average='macro')
conf_matrix = confusion_matrix(y_new, predictions_01)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step

Precision: 0.5014507586498795

Recall: 0.5182987516303771

F1 Score: 0.3733357025135267

Confusion Matrix:

[[54466 44483]

 [ 1039   983]]


In [None]:
test_loss1, test_accuracy1 = model1.evaluate(X_new, y_new, verbose=1)
print(f"Test Loss: {test_loss1}")
print(f"Test Accuracy: {test_accuracy1}")