<a href="https://www.kaggle.com/code/rubinr12/underfittedlstm-ipynb?scriptVersionId=192164380" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, SimpleRNN, GRU, BatchNormalization, Conv1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall

In [None]:
def data_pipeline(file_path, window_size=20):
    """Load the Excel dataset and return windowed train/val/test splits.

    The ``job_id`` column is dropped.  Of the remaining columns, all but the
    last are used as features and the last one as the target.  Each sample is
    ``window_size`` consecutive feature rows; its label is the target value of
    the row immediately following the window.

    Args:
        file_path: Path of the workbook handed to ``pd.read_excel``.
        window_size: Number of consecutive rows in every input window.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``; 80% of the
        windows go to training and 10% each to validation and test.

    Raises:
        ValueError: If the data does not have more rows than ``window_size``.
    """
    df = pd.read_excel(file_path)

    # Drop the 'job_id' column
    df = df.drop(columns=['job_id'])

    def create_sliding_windows(data, window_size):
        """Return (windows, next-row targets) for a 2-D array."""
        num_rows = len(data)
        if num_rows <= window_size:
            raise ValueError("Data length must be greater than the window size.")

        starts = range(num_rows - window_size)
        X = [data[i:i + window_size, :-1] for i in starts]
        y = [data[i + window_size, -1] for i in starts]
        return np.array(X), np.array(y)

    data = df.values
    # Use the caller's window_size; it used to be hard-coded to 20 here, so
    # the parameter had no effect.
    X, y = create_sliding_windows(data, window_size)

    # Split sliding windows into training, validation, and test sets
    # NOTE(review): train_test_split shuffles by default and neighbouring
    # windows overlap heavily, so near-identical windows can end up in
    # different splits - confirm this is acceptable for the evaluation.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Build the train/validation/test window sets from the training workbook.
file_path = '/kaggle/input/dataset/train.xlsx'
(X_train, X_val, X_test,
 y_train, y_val, y_test) = data_pipeline(file_path, window_size=20)

In [None]:
# metrics_new = [
#     keras.metrics.Precision(name="precision"),
#     keras.metrics.Recall(name="recall"),
# ]

# window_size = 20
# learning_rate = 0.0001
# optimizer = Adam(learning_rate=learning_rate)

# model1 = Sequential()
# model1.add(Input(shape=(window_size, X_train.shape[2])))
# model1.add(LSTM(units = 128, activation='tanh',return_sequences= True))
# model1.add(BatchNormalization())
# model1.add(Dropout(0.2))
# model1.add(LSTM(units = 64, activation='tanh',return_sequences= True))
# model1.add(BatchNormalization())
# model1.add(Dropout(0.2))
# model1.add(LSTM(units = 64, activation='tanh'))
# model1.add(BatchNormalization())
# model1.add(Dropout(0.2))
# model1.add(Dense(1, activation='sigmoid'))
# model1.compile(optimizer=optimizer, loss='BinaryCrossentropy', metrics=metrics_new)
# model1.summary()

In [None]:
# Halve the learning rate after 5 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001)

# Set up early stopping: give up after 10 stalled epochs and keep the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
# NOTE(review): `model1` must already exist when this cell runs. The model
# definition directly above is commented out, so on a fresh top-to-bottom run
# this raises NameError unless another cell that builds `model1` ran first.
history = model1.fit(X_train, y_train,
                     epochs=125,
                     batch_size=128,
                     validation_data=(X_val, y_val),
                     # reduce_lr was created above but never handed to fit().
                     callbacks=[early_stopping, reduce_lr],
                     verbose=1)

In [None]:
# Version 2
from keras.models import Sequential
from keras.layers import LSTM, BatchNormalization, Dropout, Dense, Input
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from keras.regularizers import l2

# Hyperparameters
window_size = 20
learning_rate = 1e-5
dropout_rate = 0.3  # dropout applied after each recurrent layer
l2_reg = 0.01  # L2 penalty on the output layer's kernel
optimizer = Adam(learning_rate=learning_rate)

# Metrics tracked in addition to the loss
metrics_new = [
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

# Model architecture: three stacked LSTM layers, each followed by batch
# normalisation and dropout, feeding one sigmoid output unit.
model1 = Sequential([
    Input(shape=(window_size, X_train.shape[2])),
    LSTM(units=64, activation='tanh', return_sequences=True),
    BatchNormalization(),
    Dropout(dropout_rate),
    LSTM(units=32, activation='tanh', return_sequences=True),
    BatchNormalization(),
    Dropout(dropout_rate),
    LSTM(units=32, activation='tanh'),
    BatchNormalization(),
    Dropout(dropout_rate),
    Dense(1, activation='sigmoid', kernel_regularizer=l2(l2_reg)),
])
model1.compile(optimizer=optimizer, loss='BinaryCrossentropy', metrics=metrics_new)
model1.summary()

# Learning rate scheduler
def lr_schedule(epoch, lr, min_lr=1e-5):
    """Return the learning rate to use for ``epoch``.

    The rate is left untouched through epoch 10.  From epoch 11 on it is
    halved on every call, but never pushed below ``min_lr``.  Without that
    floor the repeated halving drives the rate to practically zero within a
    few dozen epochs, which stalls training and undercuts the ``min_lr`` the
    ReduceLROnPlateau callback in this notebook is configured with.

    Args:
        epoch: Zero-based epoch index supplied by the scheduler callback.
        lr: Learning rate currently in use.
        min_lr: Lower bound for the halved rate.

    Returns:
        The learning rate for this epoch.
    """
    if epoch <= 10:
        return lr
    # min(lr, min_lr) keeps a rate that is already below the floor unchanged
    # instead of raising it back up to min_lr.
    return max(lr * 0.5, min(lr, min_lr))

# Callbacks: stop early when val_loss stalls, halve the LR on plateaus,
# and apply the epoch-based schedule.
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)
lr_scheduler = LearningRateScheduler(lr_schedule)

# Fit on the training windows and validate on the validation windows.
history = model1.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=150,
    batch_size=64,  # Reduced batch size
    callbacks=[early_stopping, reduce_lr, lr_scheduler],
    verbose=1,
)


In [None]:
# Score model1 on the held-out test windows; the first three returned values
# are read as loss, precision and recall.
evaluation_results = model1.evaluate(X_test, y_test, verbose=1)
test_loss, test_precision, test_recall = evaluation_results[:3]

print(f"Test Loss: {test_loss}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")

In [None]:
# Draw the training and validation loss recorded for each epoch.
for history_key, curve_label in (('loss', 'Train Loss'),
                                 ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Predict on the new-job windows.
# NOTE(review): X_new / y_new are only created in a later cell of this
# notebook, so that cell has to be executed before this one.
predictions_new = model1.predict(X_new)

# Binarise both predictions and targets at the same cut-off
check_value = 0.5
predictions_01 = (predictions_new > check_value).astype(int)
y_new_binary = (y_new > check_value).astype(int)

# Macro-averaged precision / recall / F1, plus the confusion matrix
precision, recall, f1 = (
    score_fn(y_new_binary, predictions_01, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_new_binary, predictions_01)

# Report
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

In [None]:
# Version3
from keras.models import Sequential
from keras.layers import LSTM, BatchNormalization, Dropout, Dense, Input
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from keras.regularizers import l2
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Training configuration for Version 3
window_size = 20        # time steps per input window
batch_size = 64
epochs = 150
learning_rate = 1e-3    # initial learning rate passed to Adam
dropout_rate = 0.3
l2_reg = 1e-2           # L2 penalty on the LSTM kernels

In [None]:
# Version 3 network: three L2-regularised LSTM layers, each followed by batch
# normalisation and dropout, ending in a single sigmoid unit.
model = Sequential([
    Input(shape=(window_size, X_train.shape[2])),
    LSTM(units=64, activation='tanh', return_sequences=True, kernel_regularizer=l2(l2_reg)),
    BatchNormalization(),
    Dropout(dropout_rate),
    LSTM(units=32, activation='tanh', return_sequences=True, kernel_regularizer=l2(l2_reg)),
    BatchNormalization(),
    Dropout(dropout_rate),
    LSTM(units=32, activation='tanh', kernel_regularizer=l2(l2_reg)),
    BatchNormalization(),
    Dropout(dropout_rate),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Define learning rate schedule
def lr_schedule(epoch, lr, min_lr=1e-5):
    """Return the learning rate to use for ``epoch``.

    The rate is left untouched through epoch 10.  From epoch 11 on it is
    halved on every call, but never pushed below ``min_lr``.  Without that
    floor the repeated halving drives the rate to practically zero within a
    few dozen epochs, which stalls training and undercuts the ``min_lr`` the
    ReduceLROnPlateau callback in this notebook is configured with.

    Args:
        epoch: Zero-based epoch index supplied by the scheduler callback.
        lr: Learning rate currently in use.
        min_lr: Lower bound for the halved rate.

    Returns:
        The learning rate for this epoch.
    """
    if epoch <= 10:
        return lr
    # min(lr, min_lr) keeps a rate that is already below the floor unchanged
    # instead of raising it back up to min_lr.
    return max(lr * 0.5, min(lr, min_lr))

In [None]:
# Define callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001)
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
lr_scheduler = LearningRateScheduler(lr_schedule)

# Compile model
# Precision/Recall are passed as metric objects (as in the Version 2 cell)
# rather than the strings 'precision'/'recall': those lower-case aliases are
# not accepted by every Keras release, while the objects are. The explicit
# names keep the history keys 'precision' and 'recall'.
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='BinaryCrossentropy',
    metrics=[
        'accuracy',
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
    ],
)

# Train model
history = model.fit(X_train, y_train,
                     epochs=epochs,
                     batch_size=batch_size,
                     validation_data=(X_val, y_val),
                     callbacks=[early_stopping, reduce_lr, lr_scheduler],
                     verbose=1)

In [None]:
# Load the job2 data and drop the identifier and coordinate columns
df_new = pd.read_csv('job2.csv').drop(columns=['job_id', 'Latitude', 'Longitude'])
def create_sliding_windows_1(data, window_size):
    """Turn a DataFrame into sliding feature windows and next-row targets.

    Every column except the last is a feature; the last column is the target.
    Window ``k`` holds rows ``k .. k + window_size - 1`` and is paired with the
    target value found in row ``k + window_size``.

    Args:
        data: DataFrame whose ``.values`` array is windowed.
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays; both are empty when ``data`` has no
        more rows than ``window_size``.
    """
    values = data.values
    n_windows = len(data) - window_size
    features = [values[start:start + window_size, :-1] for start in range(n_windows)]
    targets = [values[start + window_size, -1] for start in range(n_windows)]
    return np.array(features), np.array(targets)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Build 20-step windows from the job2 data
X_new, y_new = create_sliding_windows_1(df_new, 20)

# NOTE(review): this scores `model1`, not the `model` trained in the
# Version 3 cells - confirm which model is meant to be evaluated here.
predictions_new = model1.predict(X_new)

# Binarise predictions and targets at the same threshold
check_value = 0.5
predictions_01 = (predictions_new > check_value).astype(int)
y_new_binary = (y_new > check_value).astype(int)

# Score against the binarised targets. The raw y_new was passed here before,
# leaving y_new_binary unused and handing sklearn un-thresholded labels,
# unlike the other evaluation cells in this notebook.
precision = precision_score(y_new_binary, predictions_01, average='macro')
recall = recall_score(y_new_binary, predictions_01, average='macro')
f1 = f1_score(y_new_binary, predictions_01, average='macro')
conf_matrix = confusion_matrix(y_new_binary, predictions_01)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

In [None]:
###################################################################
# Keras-side evaluation of the Version 3 model on the test windows
evaluation_results = model.evaluate(X_test, y_test, verbose=1)

# Predict on the same windows for the scikit-learn metrics below
predictions_new = model.predict(X_test)

# Binarise both predictions and targets at the same cut-off
check_value = 0.5
predictions_01 = (predictions_new > check_value).astype(int)
y_new_binary = (y_test > check_value).astype(int)

# Macro-averaged precision / recall / F1, plus the confusion matrix
precision, recall, f1 = (
    score_fn(y_new_binary, predictions_01, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_new_binary, predictions_01)

# Report
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')