# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

# Load the dataset
data = pd.read_csv('data/dataset.csv')

# Display the first few rows of the dataset. A bare `data.head()` expression
# is only rendered when it is the last statement of a notebook cell, so it is
# printed explicitly to produce output here as well.
print(data.head())

# Preprocess the dataset (modify according to your dataset structure)
# For example, handle missing values, encode categorical variables, etc.
# data = data.ffill()
# data = pd.get_dummies(data)

# Define features and target variable
X = data.drop(columns='target')  # Assuming 'target' is the name of the target column
y = data['target']

# Split the data into training and testing sets: 20% is held out for testing,
# and the fixed random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Assemble the feed-forward classifier: two ReLU hidden layers followed by a
# single sigmoid output unit (change to 'softmax' for multi-class classification)
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Write the model to disk only when the validation loss reaches a new minimum
checkpoint = ModelCheckpoint(
    filepath='models/saved_model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Fit for 50 epochs, holding out 20% of the training data for validation
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
    callbacks=[checkpoint],
)

# Score the held-out test set with the trained network
y_pred = model.predict(X_test)
# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels
y_pred_classes = np.greater(y_pred, 0.5).astype(int)

# Overall accuracy on the test split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_classes)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Per-class metrics summary
print(classification_report(y_true=y_test, y_pred=y_pred_classes))

# Function to detect data leakage (example logic)
def detect_data_leak(X_train, X_test):
    """Report whether any test sample also appears in the training set.

    Leakage is defined here as an identical feature row being present in
    both splits. Sharing individual values within a single column is normal
    (any binary or categorical feature does so), so complete rows are
    compared across the columns the two frames have in common rather than
    checking each column on its own.

    Args:
        X_train: DataFrame of training features.
        X_test: DataFrame of test features.

    Returns:
        True if at least one row of ``X_test`` is identical to a row of
        ``X_train``; False otherwise, including when either frame is empty
        or the frames share no columns.
    """
    shared_cols = [col for col in X_train.columns if col in X_test.columns]
    if not shared_cols or X_train.empty or X_test.empty:
        return False

    # De-duplicate the training rows so each leaked test row is counted once.
    train_rows = X_train[shared_cols].drop_duplicates()
    leaked_rows = X_test[shared_cols].merge(train_rows, on=shared_cols, how='inner')

    leakage_detected = len(leaked_rows) > 0
    if leakage_detected:
        print(
            f'Potential leakage detected: {len(leaked_rows)} test row(s) '
            'also appear in the training set'
        )
    return leakage_detected

# Check the train/test split for leakage. Only the clean outcome needs a
# message here because detect_data_leak prints its own findings.
leakage = detect_data_leak(X_train, X_test)
if not leakage:
    print('No data leakage detected.')

# Plot training vs. validation accuracy per epoch (optional)
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.plot(history.history['accuracy'], label='train_accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.legend()
plt.show()
