# Classify as Top or Flop

This pipeline demonstrates the implementation and evaluation of an **Artificial Neural Network (ANN)** model for binary classification. The workflow includes:

1.  Loading and preprocessing the dataset.
2.  Building and training the ANN model with early stopping to prevent overfitting.
3.  Performing hyperparameter tuning using RandomizedSearchCV with stratified K-fold cross-validation.
4.  Evaluating model performance in terms of accuracy, precision, recall, and F1 score.

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tqdm import tqdm  # Import tqdm for progress bar
import warnings

# Silence everything issued through the `warnings` module from this point on.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings raised by the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

class ANNClassifier(BaseEstimator, ClassifierMixin):
    """Binary-classification pipeline built around a small Keras network.

    The object bundles data loading, preprocessing (label encoding, scaling,
    train/test split), model training, evaluation, and prediction on a new
    CSV file. It also exposes ``fit`` / ``predict`` / ``score`` so it can be
    used where a scikit-learn style estimator is expected.
    """

    def __init__(self, input_dim=None, batch_size=32, epochs=20):
        """Store the hyperparameters and initialise empty pipeline state.

        Args:
            input_dim: Number of input features handed to the first Dense
                layer; ``None`` is passed through unchanged.
            batch_size: Mini-batch size used by ``fit``.
            epochs: Number of training epochs used by ``fit``.
        """
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.epochs = epochs
        self.model = None
        self.scaler = None
        self.x_train, self.x_test, self.y_train, self.y_test = None, None, None, None
        self.data = None
        self.label_encoders = {}
        self.feature_columns = None  # Store feature column names

    def load(self, file_path):
        """Read the CSV at ``file_path`` into ``self.data``."""
        self.data = pd.read_csv(file_path)
        print(f"Data loaded from {file_path}. Shape: {self.data.shape}")

    def preprocess(self, target_column, drop_columns):
        """Encode, split and scale ``self.data``.

        Args:
            target_column: Name of the label column.
            drop_columns: Columns to remove from the feature matrix.

        Returns:
            The dict mapping each categorical column name to its fitted
            ``LabelEncoder`` (the same object as ``self.label_encoders``).
        """
        print("Starting preprocessing...")

        # Separate features and target
        X = self.data.drop(columns=drop_columns)
        if target_column in X.columns:
            # The label must never be a feature, even if the caller did not
            # list it in drop_columns.
            X = X.drop(columns=[target_column])
        y = self.data[target_column]

        # Remember the feature layout so predict_on_input can reproduce it
        self.feature_columns = X.columns

        # Encode categorical features; start from a clean dict so encoders
        # from an earlier preprocess() call cannot linger.
        self.label_encoders = {}
        categorical_cols = X.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col])
            self.label_encoders[col] = le  # Store the encoder for future use

        # Split first (80% train, 20% test) ...
        x_train, x_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # ... then fit the scaler on the training rows only, so the held-out
        # rows do not influence the scaling statistics.
        self.scaler = StandardScaler()
        self.x_train = self.scaler.fit_transform(x_train)
        self.x_test = self.scaler.transform(x_test)

        print("Preprocessing completed.")
        return self.label_encoders

    def build_model(self):
        """Create and compile the network.

        Returns:
            A compiled ``Sequential`` model: Dense(64, relu) -> Dropout(0.3)
            -> Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid), using
            the Adam optimizer and binary cross-entropy loss.
        """
        model = Sequential([
            Dense(64, activation='relu', input_dim=self.input_dim),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def fit(self, X, y):
        """Build a fresh model and train it on ``X`` / ``y``.

        20% of the supplied data is held out by Keras as validation data.

        Returns:
            ``self``, following the scikit-learn estimator convention.
        """
        print("Starting training...")

        # scikit-learn expects fitted classifiers to expose the seen labels
        self.classes_ = np.unique(y)

        # Build the model
        self.model = self.build_model()

        # Train the model
        self.model.fit(X, y, batch_size=self.batch_size, epochs=self.epochs, validation_split=0.2, verbose=1)

        print("Training completed.")
        return self

    def predict(self, X):
        """Return hard 0/1 predictions for ``X`` as a 1-D integer array.

        The sigmoid output is thresholded at 0.5 and flattened from the
        (n_samples, 1) shape produced by the network.
        """
        y_pred = (self.model.predict(X) > 0.5).astype(int).ravel()
        return y_pred

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        y_pred = self.predict(X)
        accuracy = accuracy_score(y, y_pred)
        return accuracy

    def evaluate(self):
        """Print accuracy, precision, recall, F1 and a classification report
        for the held-out test split created by ``preprocess``."""
        print("Starting evaluation...")

        # Predict on test set
        y_pred = self.predict(self.x_test)

        # Generate evaluation metrics
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)

        print("Evaluation Summary:")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred))

    def final_evaluation(self):
        """Run ``evaluate`` again; kept as a separate entry point for callers."""
        self.evaluate()

    def predict_on_input(self, input_file, drop_columns):
        """Predict labels for the rows of a new CSV file.

        Args:
            input_file: Path of the CSV file to score.
            drop_columns: Columns of that file that are not features.

        Returns:
            The loaded DataFrame with an added ``'success'`` column holding
            ``'Top'`` for a predicted 1 and ``'Flop'`` for a predicted 0.
        """
        print("Loading and preprocessing prediction input file...")
        input_data = pd.read_csv(input_file)

        # Preprocess input data (drop columns that are not needed)
        input_data_processed = input_data.drop(columns=drop_columns)

        # Encode categorical columns using the saved label encoders
        for col, le in self.label_encoders.items():
            if col in input_data_processed.columns:
                input_data_processed[col] = le.transform(input_data_processed[col])

        # Align columns with the training layout; missing features become 0
        input_data_processed = input_data_processed.reindex(columns=self.feature_columns, fill_value=0)

        # Scale input data using the same scaler
        input_data_scaled = self.scaler.transform(input_data_processed)

        # Predict
        predictions = self.predict(input_data_scaled)
        input_data['success'] = predictions
        input_data['success'] = input_data['success'].map({1: 'Top', 0: 'Flop'})

        print("Predictions completed.")
        return input_data


# ==============================================
# SECTION 4: EXAMPLE USAGE
# ==============================================

# Instantiate the classifier
pipeline = ANNClassifier(input_dim=4)  # Assuming 4 features

# Load and preprocess training data
pipeline.load('train_df.csv')
pipeline.preprocess(target_column='target', drop_columns=['target'])

# Read the fitted encoders from the pipeline attribute instead of relying on
# the return value of preprocess().
label_encoders = pipeline.label_encoders

# Train the model
pipeline.fit(pipeline.x_train, pipeline.y_train)

# Evaluate the model
pipeline.evaluate()

# Final evaluation after training (final_evaluation() simply calls evaluate()
# again, so the same report is printed a second time)
pipeline.final_evaluation()

# Now, for making predictions on new input data (prediction_input.csv)
prediction_output = pipeline.predict_on_input('prediction_input.csv', drop_columns=['item_no'])

# Display the predictions
print(prediction_output)


Data loaded from train_df.csv. Shape: (10370, 5)
Starting preprocessing...
Preprocessing completed.
Starting training...
Epoch 1/20
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7131 - loss: 0.5790 - val_accuracy: 0.7807 - val_loss: 0.5007
Epoch 2/20
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7568 - loss: 0.5247 - val_accuracy: 0.7807 - val_loss: 0.4971
Epoch 3/20
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7681 - loss: 0.5156 - val_accuracy: 0.7777 - val_loss: 0.4968
Epoch 4/20
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7669 - loss: 0.5165 - val_accuracy: 0.7825 - val_loss: 0.4968
Epoch 5/20
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7759 - loss: 0.5102 - val_accuracy: 0.7795 - val_loss: 0.4957
Epoch 6/20
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [10]:
import pandas as pd
from sklearn.metrics import classification_report

def save_evaluation_metrics_to_csv(pipeline, file_name='ann_evaluation_metrics.csv'):
    """
    Save evaluation metrics and the classification report of a fitted
    pipeline to two CSV files.

    Args:
        pipeline: Object exposing ``predict``, ``x_test`` and ``y_test``.
        file_name: Path of the summary CSV. The detailed report is written
            next to it with ``_detailed`` inserted before a trailing
            ``.csv``; if ``file_name`` has no ``.csv`` ending,
            ``_detailed.csv`` is appended, so the two outputs never share
            a path.
    """
    # Imported here so the function does not depend on names imported by
    # another notebook cell.
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        f1_score,
        precision_score,
        recall_score,
    )

    print("Saving evaluation metrics and classification report to CSV...")

    # Predict on test data
    y_pred = pipeline.predict(pipeline.x_test)

    # Generate summary metrics
    accuracy = accuracy_score(pipeline.y_test, y_pred)
    precision = precision_score(pipeline.y_test, y_pred)
    recall = recall_score(pipeline.y_test, y_pred)
    f1 = f1_score(pipeline.y_test, y_pred)

    # Create a summary metrics DataFrame
    summary_data = {
        "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
        "Value": [accuracy, precision, recall, f1]
    }
    summary_df = pd.DataFrame(summary_data)

    # Generate a detailed classification report
    report_dict = classification_report(pipeline.y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()

    # Derive the detailed file name from the extension only. A plain
    # str.replace would touch every '.csv' occurrence and would leave a name
    # without '.csv' unchanged, overwriting the summary file.
    extension = '.csv'
    if file_name.endswith(extension):
        detailed_file_name = file_name[:-len(extension)] + '_detailed.csv'
    else:
        detailed_file_name = file_name + '_detailed.csv'

    # Save both to CSV files
    summary_df.to_csv(file_name, index=False)
    report_df.to_csv(detailed_file_name)

    print(f"Summary metrics saved to {file_name}.")
    print(f"Detailed classification report saved to {detailed_file_name}.")

# Write the metrics of the `pipeline` object to 'ann_evaluation_metrics.csv'.
# `pipeline` must already exist as a global and be trained, because the
# function calls pipeline.predict(pipeline.x_test).
save_evaluation_metrics_to_csv(pipeline, file_name='ann_evaluation_metrics.csv')


Saving evaluation metrics and classification report to CSV...
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Summary metrics saved to ann_evaluation_metrics.csv.
Detailed classification report saved to ann_evaluation_metrics_detailed.csv.


In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, make_scorer
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.stats import uniform, randint
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
import warnings

# Suppress every warning issued through the `warnings` module.
# NOTE(review): this blanket filter also hides library deprecation notices.
warnings.filterwarnings('ignore')

# Report how many GPU devices TensorFlow can see (0 means training runs on CPU)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Define the ANN Model with improvements
class ANNModel(tf.keras.Model):
    """Two-hidden-layer feed-forward network for binary classification.

    Each hidden block is Dense(relu) -> BatchNormalization -> Dropout; the
    second block uses half as many units as the first. The output layer is a
    single sigmoid unit.

    Args:
        neurons: Units in the first hidden layer.
        dropout_rate: Dropout rate applied after both hidden blocks.
        input_dim: Accepted for backward compatibility only. In a subclassed
            model the layers create their weights from the first batch they
            receive, so the value is not forwarded to any layer.
    """

    def __init__(self, neurons=64, dropout_rate=0.3, input_dim=4):
        super().__init__()
        self.dense1 = layers.Dense(neurons, activation='relu')
        self.batch_norm1 = layers.BatchNormalization()
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dense2 = layers.Dense(int(neurons / 2), activation='relu')
        self.batch_norm2 = layers.BatchNormalization()
        self.dropout2 = layers.Dropout(dropout_rate)
        self.output_layer = layers.Dense(1, activation='sigmoid')

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of feature vectors.
            training: Forwarded to the BatchNormalization and Dropout layers
                so they switch between training and inference behaviour;
                ``None`` lets Keras decide from the calling context.
        """
        x = self.dense1(inputs)
        x = self.batch_norm1(x, training=training)
        x = self.dropout1(x, training=training)
        x = self.dense2(x)
        x = self.batch_norm2(x, training=training)
        x = self.dropout2(x, training=training)
        return self.output_layer(x)

    def compile_model(self, initial_learning_rate=1e-3, decay_steps=10000, decay_rate=0.9):
        """Compile with Adam on an exponentially decaying learning rate.

        The defaults reproduce the previously hard-coded schedule. Loss is
        binary cross-entropy and accuracy is tracked as a metric.
        """
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_learning_rate,
            decay_steps=decay_steps,
            decay_rate=decay_rate)
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
        self.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Custom Scikit-learn Wrapper for the ANN
class CustomANNWrapper(BaseEstimator, ClassifierMixin):
    """scikit-learn compatible wrapper around ``ANNModel``.

    Args:
        neurons: Units in the first hidden layer of the network.
        dropout_rate: Dropout rate of the network.
        input_dim: Number of input features, forwarded to ``ANNModel``.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size used during training.
    """

    def __init__(self, neurons=64, dropout_rate=0.3, input_dim=4, epochs=10, batch_size=32):
        self.neurons = neurons
        self.dropout_rate = dropout_rate
        self.input_dim = input_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None

    def fit(self, X, y):
        """Train a fresh ``ANNModel`` on ``X`` / ``y`` and return ``self``.

        Keras holds out 20% of the data for validation; training stops early
        once ``val_loss`` has not improved for 3 epochs, and the best weights
        are restored.
        """
        # scikit-learn expects fitted classifiers to expose the seen labels;
        # its scoring utilities may read this attribute.
        self.classes_ = np.unique(y)

        self.model = ANNModel(neurons=self.neurons, dropout_rate=self.dropout_rate, input_dim=self.input_dim)
        self.model.compile_model()
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, validation_split=0.2, callbacks=[early_stopping], verbose=1)
        return self

    def predict(self, X):
        """Return hard 0/1 predictions as a 1-D integer array.

        The sigmoid output is thresholded at 0.5 and flattened from the
        (n_samples, 1) shape produced by the network.
        """
        return (self.model.predict(X) > 0.5).astype(int).ravel()

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

# Load and preprocess the data
def load_data(file_path, target_column, drop_columns):
    """Load a CSV file and return scaled features plus the target.

    Args:
        file_path: Path of the CSV file to read.
        target_column: Name of the label column.
        drop_columns: Columns to remove from the feature matrix.

    Returns:
        A tuple ``(X_scaled, y, label_encoders, scaler)``: the standardised
        feature array, the target Series, a dict mapping each categorical
        column name to its fitted ``LabelEncoder``, and the fitted
        ``StandardScaler``.
    """
    data = pd.read_csv(file_path)
    X = data.drop(columns=drop_columns)
    if target_column in X.columns:
        # The label must never be a feature, even if the caller did not list
        # it in drop_columns.
        X = X.drop(columns=[target_column])
    y = data[target_column]

    # Encode categorical columns
    label_encoders = {}
    categorical_cols = X.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

    # Scale features
    # NOTE(review): the scaler is fitted on every row of the file, so any
    # train/test split made afterwards shares scaling statistics with the
    # held-out rows.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y, label_encoders, scaler

# Hyperparameter Tuning using RandomizedSearchCV with K-Fold Cross-Validation
def tune_hyperparameters(X, y, n_iter=20, n_splits=5, random_state=42):
    """Randomised hyperparameter search for ``CustomANNWrapper``.

    Args:
        X: Feature matrix (2-D, samples x features).
        y: Binary target values.
        n_iter: Number of hyperparameter combinations to sample.
        n_splits: Number of stratified cross-validation folds.
        random_state: Seed shared by the fold shuffling and the parameter
            sampling so repeated runs try the same candidates.

    Returns:
        The best estimator found by the search (``best_estimator_``).
    """
    # Size the network from the data instead of relying on the wrapper default
    model = CustomANNWrapper(input_dim=np.shape(X)[1])

    param_dist = {
        'neurons': randint(32, 128),              # Integers in [32, 127] (upper bound exclusive)
        # scipy's uniform(loc, scale) spans [loc, loc + scale], i.e. [0.2, 0.6]
        'dropout_rate': uniform(0.2, 0.4),
        'epochs': randint(10, 50),               # Integers in [10, 49] (upper bound exclusive)
        'batch_size': [16, 32, 64]               # Fixed batch sizes
    }

    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,                           # Number of hyperparameter combinations to try
        cv=kfold,                                # K-Fold Cross-Validation
        verbose=2,
        n_jobs=-1,
        scoring=make_scorer(accuracy_score),     # Use accuracy as scoring metric
        random_state=random_state                # Make the sampled candidates reproducible
    )
    random_search.fit(X, y)

    print("Best Parameters:", random_search.best_params_)
    print("Best Cross-Validation Score:", random_search.best_score_)

    # Return the best model
    return random_search.best_estimator_

# Evaluate the Model
def evaluate_model(model, X_test, y_test):
    """Print the accuracy and the classification report of ``model``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True labels matching ``X_test``.
    """
    predictions = model.predict(X_test)

    test_accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", test_accuracy)

    print("Classification Report:")
    report_text = classification_report(y_test, predictions)
    print(report_text)

# Example Usage
if __name__ == "__main__":
    # Load the training CSV; returns scaled features, the target, the fitted
    # label encoders and the fitted scaler.
    X, y, label_encoders, scaler = load_data('train_df.csv', target_column='target', drop_columns=['target'])

    # Hold out 20% of the rows for the final test (fixed seed for repeatability).
    # NOTE(review): load_data scales the features before this split, so the
    # test rows have already contributed to the scaling statistics.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Run the randomised search with stratified K-fold CV on the training part
    best_model = tune_hyperparameters(X_train, y_train)

    # Report accuracy and the classification report on the held-out rows
    evaluate_model(best_model, X_test, y_test)


Num GPUs Available:  0
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Epoch 1/24
[1m415/415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7491 - loss: 0.5391 - val_accuracy: 0.7825 - val_loss: 0.4977
Epoch 2/24
[1m415/415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7798 - loss: 0.4945 - val_accuracy: 0.7807 - val_loss: 0.5000
Epoch 3/24
[1m415/415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7839 - loss: 0.4850 - val_accuracy: 0.7861 - val_loss: 0.4884
Epoch 4/24
[1m415/415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.7806 - loss: 0.4889 - val_accuracy: 0.7886 - val_loss: 0.4829
Epoch 5/24
[1m415/415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.7884 - loss: 0.4829 - val_accuracy: 0.7831 - val_loss: 0.4817
Epoch 6/24
[1m415/415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7929 

After tuning, the confusion matrix highlights:

**True Positives:** Correctly predicted 'Top' classifications.

**True Negatives:** Correctly predicted 'Flop' classifications.

The model achieves a good balance in handling both classes.


**Conclusion**

*  The ANN model demonstrates competitive performance after tuning, with significant improvements in recall and F1 score.

*  It is robust for binary classification tasks, especially when non-linear relationships exist in the dataset.

*  Further improvements can be explored by integrating more advanced architectures like convolutional or recurrent layers for complex datasets.

*  While the ANN is computationally intensive, its scalability and performance make it a strong candidate for classification tasks.