In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Function to calculate and return performance metrics
def evaluate_model(y_test, y_pred, model_name, y_score=None):
    """Compute binary-classification metrics for one model.

    Args:
        y_test: True binary labels.
        y_pred: Predicted hard labels (0/1) for the same samples.
        model_name: Label stored under the ``'Model'`` key of the result.
        y_score: Optional continuous scores for the positive class
            (e.g. ``predict_proba(X)[:, 1]``). When given, AUC-ROC is
            computed from these scores; when omitted, it falls back to the
            hard labels in ``y_pred``, matching the previous behaviour.

    Returns:
        dict with keys ``'Model'``, ``'Accuracy'``, ``'Precision'``,
        ``'Recall'``, ``'F1-Score'`` and ``'AUC-ROC'``.
    """
    # AUC measures ranking quality, so it should be fed continuous scores.
    # With hard 0/1 labels the ROC curve collapses to a single operating
    # point, which usually understates the model's real AUC.
    auc_input = y_pred if y_score is None else y_score

    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'AUC-ROC': roc_auc_score(y_test, auc_input),
    }

# Train and evaluate each classifier
def train_and_evaluate_classifiers(df):
    """Train a fixed set of classifiers and tabulate their test metrics.

    The features are the columns ``embed_0`` .. ``embed_383`` and the target
    is ``output`` binarised as ``1 if output > 4 else 0``. The data is split
    80/20 with ``random_state=42``.

    Args:
        df: DataFrame holding the embedding columns and an ``output`` column.
            It is NOT modified by this function.

    Returns:
        pandas.DataFrame with one row per model and the columns produced by
        ``evaluate_model``.
    """
    # Separate features (embeddings) and target (output)
    feature_columns = [f'embed_{i}' for i in range(384)]
    customers = df[feature_columns]

    # Convert output to binary classes (0 or 1) in a *local* Series.
    # Writing the result back into df would corrupt any later call that
    # thresholds the same frame again: every value would already be 0/1,
    # so "> 4" would be False everywhere and all labels would become 0.
    targets = (df['output'] > 4).astype(int)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        customers, targets, test_size=0.2, random_state=42
    )

    # Initialize models
    models = {
        "MLP": MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, solver='adam', learning_rate_init=0.01),
        "SVM": SVC(kernel='linear', probability=True),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        # use_label_encoder was dropped here: the installed XGBoost reports
        # it as an unused parameter.
        "XGBoost": XGBClassifier(eval_metric='logloss'),
        "CatBoost": CatBoostClassifier(verbose=0),
        "Naïve Bayes": GaussianNB()
    }

    results = []

    # Train each model and evaluate
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        result = evaluate_model(y_test, y_pred, model_name)

        # AUC should be computed from continuous scores rather than hard
        # labels; replace the label-based value whenever the model can
        # supply positive-class probabilities.
        scores = _positive_class_scores(model, X_test)
        if scores is not None:
            result['AUC-ROC'] = roc_auc_score(y_test, scores)

        results.append(result)

    # Create DataFrame to tabulate results
    return pd.DataFrame(results)


def _positive_class_scores(model, features):
    """Return P(class == 1) for each row, or None if it is unavailable."""
    if not hasattr(model, 'predict_proba'):
        return None
    classes = list(model.classes_)
    if 1 not in classes:
        # The model never saw the positive class, so it has no such column.
        return None
    return model.predict_proba(features)[:, classes.index(1)]

# MLPClassifier with RandomizedSearchCV tuning
def train_and_predict_mlp(df):
    """Tune an MLPClassifier with RandomizedSearchCV and predict the test split.

    The features are the columns ``embed_0`` .. ``embed_383`` and the target
    is ``output`` binarised as ``1 if output > 4 else 0``. The data is split
    80/20 with ``random_state=42``; the search samples 10 candidates with
    5-fold cross-validation on the training part. The best parameters and
    the test accuracy are printed.

    Args:
        df: DataFrame holding the embedding columns and an ``output`` column.
            It is NOT modified by this function.

    Returns:
        Array of predicted labels for the held-out test split.
    """
    # Separate features and target
    feature_columns = [f'embed_{i}' for i in range(384)]
    customers = df[feature_columns]

    # Binarise into a local Series instead of overwriting df['output'];
    # mutating the caller's frame makes a second thresholding pass (here or
    # in any other function) see only 0/1 values and yield all-zero labels.
    targets = (df['output'] > 4).astype(int)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        customers, targets, test_size=0.2, random_state=42
    )

    # Define hyperparameter grid for RandomizedSearchCV
    param_grid_mlp = {
        'hidden_layer_sizes': [(5,), (10,), (50, 50), (100, 100)],
        'activation': ['logistic', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [1e-4, 1e-3, 1e-2],
        'learning_rate_init': [0.001, 0.01, 0.05],
        'max_iter': [500, 1000, 2000],
    }

    # Initialize RandomizedSearchCV for MLP
    random_search_mlp = RandomizedSearchCV(
        MLPClassifier(),
        param_grid_mlp,
        n_iter=10,
        cv=5,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )

    # Fit RandomizedSearchCV with training data
    random_search_mlp.fit(X_train, y_train)

    # Get the best estimator (refit on the whole training split)
    best_mlp = random_search_mlp.best_estimator_

    # Predict on the test set
    predictions = best_mlp.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, predictions)
    print("Best Parameters found: ", random_search_mlp.best_params_)
    print(f"Accuracy on test set: {accuracy * 100:.2f}%")

    # Return predictions
    return predictions

def main():
    """Load the dataset, compare all classifiers, then run the tuned MLP.

    Reads the Excel file at the hard-coded ``dataset_path``, prints the
    per-model metrics table, and prints the predictions of the MLP selected
    by randomized search.
    """
    dataset_path = r"C:\Users\Admin\Downloads\training_mathbert 4.xlsx"
    df = pd.read_excel(dataset_path)

    # Each stage gets its own copy of the raw data. Both stages binarise the
    # 'output' column; if one of them does so in place on a shared frame, the
    # next stage thresholds 0/1 values with "> 4" and trains on all-zero
    # labels.
    results_df = train_and_evaluate_classifiers(df.copy())

    # Print comparison table
    print("Model Performance Comparison:\n")
    print(results_df)

    # Train MLP with hyperparameter tuning and print results
    print("\nMLPClassifier with RandomizedSearchCV Tuning:")
    predictions = train_and_predict_mlp(df.copy())
    print("Transaction Classification (MLP):", predictions)


if __name__ == "__main__":
    main()


Parameters: { "use_label_encoder" } are not used.



Model Performance Comparison:

           Model  Accuracy  Precision    Recall  F1-Score   AUC-ROC
0            MLP  0.805310   0.509804  0.577778  0.541667  0.719828
1            SVM  0.800885   0.500000  0.400000  0.444444  0.650276
2  Decision Tree  0.734513   0.363636  0.444444  0.400000  0.625537
3  Random Forest  0.836283   0.666667  0.355556  0.463768  0.655678
4       AdaBoost  0.787611   0.461538  0.400000  0.428571  0.641989
5        XGBoost  0.836283   0.611111  0.488889  0.543210  0.705770
6       CatBoost  0.853982   0.730769  0.422222  0.535211  0.691774
7    Naïve Bayes  0.663717   0.308642  0.555556  0.396825  0.623082

MLPClassifier with RandomizedSearchCV Tuning:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
