<a href="https://colab.research.google.com/github/shahriariit/phishingTL/blob/main/New_Code_Phishing_Data_Analysis_Without_TL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import subprocess
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation

# Scikit-learn classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier
)
from sklearn.linear_model import (
    LogisticRegression, RidgeClassifier, Perceptron, SGDClassifier, PassiveAggressiveClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier

# Regression models (incorrectly used as classifiers)
from sklearn.linear_model import Lasso, ElasticNet  # These are not classifiers, they are regressors

# External libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier

# Define base estimators for ensemble models
# Both estimators are seeded with random_state=42.
# NOTE(review): no StackingClassifier/VotingClassifier is built from this list
# in the visible cells — confirm it is still needed.
base_estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
]

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE, SequentialFeatureSelector
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, auc
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score, recall_score, f1_score,
    classification_report, roc_curve, roc_auc_score, log_loss, jaccard_score,
    hamming_loss, matthews_corrcoef, cohen_kappa_score, hinge_loss
)
from sklearn.model_selection import cross_val_predict, cross_val_score
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [2]:
# Remote CSV holding the complete dataset; `data` is the frame that the
# full-dataset split is later derived from.
url = 'https://media.githubusercontent.com/media/shahriariit/opendataset/master/PhiUSIIL_phishing.csv'
data = pd.read_csv(url)

In [3]:
def datapartition(data):
    """
    Split a labelled frame into train and test partitions.

    The 'label' column is treated as the target and every other column as a
    feature. 20% of the rows go to the test partition, and the split is seeded
    (random_state=42) so repeated calls on the same frame agree.

    Parameters:
        data (pd.DataFrame): Frame containing a 'label' column.

    Returns:
        tuple: (x_train, x_test, y_train, y_test)
    """
    features = data.drop('label', axis=1)
    target = data['label']
    return tuple(train_test_split(features, target, test_size=0.2, random_state=42))

def subdataset_by_correlation_analysis(data, threshold=0.9):
    """
    Removes redundant, highly correlated features from a dataset.

    For every pair of features whose absolute pairwise correlation is at least
    `threshold`, the feature that appears first in the column order is kept and
    the later one is dropped. A pair is skipped when one of its members has
    already been dropped, so a feature is never removed only because it
    correlates with a column that is itself gone.

    Parameters:
        data (pd.DataFrame): The input dataset; must contain a 'label' column.
        threshold (float): Correlation threshold for removing features (default is 0.9).

    Returns:
        pd.DataFrame: Dataset with the redundant features removed and the
        'label' column appended as the last column.

    Raises:
        ValueError: If `data` has no 'label' column.
    """
    if 'label' not in data.columns:
        raise ValueError("Dataset must contain a 'label' column as the target variable.")

    # Split features and target
    features = data.drop(columns=['label'])
    target = data['label']

    correlation_matrix = features.corr()
    names = correlation_matrix.columns

    # NaN correlations (e.g. constant columns) compare False and are ignored.
    # np.where yields positions in row-major order, so pairs are visited in
    # column order, which makes the greedy selection below deterministic.
    rows, cols = np.where(correlation_matrix.abs().to_numpy() >= threshold)

    columns_to_drop = set()
    for row, col in zip(rows, cols):
        if row >= col:
            continue  # skip the diagonal and the mirrored lower triangle
        first, second = names[row], names[col]
        if first in columns_to_drop or second in columns_to_drop:
            continue  # this pair is already broken up
        columns_to_drop.add(second)  # keep `first`, remove its redundant partner

    reduced_data = features.drop(columns=list(columns_to_drop))
    reduced_data['label'] = target.values  # Add label back

    return reduced_data

def subdataset_by_kbest(data, k=10):
    """
    Keep the K features with the highest ANOVA F-score against the label.

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.
        k (int): How many features to retain (default is 10).

    Returns:
        pd.DataFrame: The retained feature columns followed by 'label'.

    Raises:
        ValueError: If `data` has no 'label' column.
    """
    if 'label' not in data.columns:
        raise ValueError("Dataset must contain a 'label' column as the target variable.")

    target = data['label']
    features = data.drop(columns=['label'])

    # Fit the univariate selector and read back its boolean support mask.
    support_mask = SelectKBest(score_func=f_classif, k=k).fit(features, target).get_support()
    kept_columns = features.columns[support_mask]

    reduced_data = features[kept_columns].copy()
    reduced_data['label'] = target.values  # positional re-attachment of the target
    return reduced_data

def subdataset_by_rfe(data):
    """
    Select 15 features by recursive feature elimination with a random forest.

    The eliminator removes 5 features per iteration (step=5) and ranks them
    with a RandomForestClassifier seeded with random_state=42.

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.

    Returns:
        pd.DataFrame: The selected feature columns followed by 'label'.
    """
    features = data.drop('label', axis=1)
    target = data['label']

    eliminator = RFE(
        estimator=RandomForestClassifier(random_state=42),
        n_features_to_select=15,
        step=5,
    )
    eliminator.fit(features, target)

    kept_columns = features.columns[eliminator.support_]
    new_data = features[kept_columns].copy()
    new_data['label'] = data['label']  # index-aligned re-attachment of the target
    return new_data

def subdataset_by_mi(data, k=15):
    """
    Keep the k features ranked highest by top_features_from_mi.

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.
        k (int): How many top-ranked features to retain (default is 15).

    Returns:
        pd.DataFrame: The k selected columns (best first) followed by 'label'.
    """
    ranking = top_features_from_mi(data)
    # Best-first feature names, with the target appended as the last column.
    chosen_columns = ranking['feature_name'].head(k).tolist() + ['label']
    return data[chosen_columns]


def top_features_from_mi(data):
    """
    Rank every feature by its mutual-information score against the label.

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.

    Returns:
        pd.DataFrame: Columns 'feature_name' and 'score_value', one row per
        feature, sorted by 'score_value' in descending order.
    """
    X = data.drop('label', axis=1)
    y = data['label']

    # k='all' keeps every feature; the selector is used only for its scores.
    scorer = SelectKBest(score_func=mutual_info_classif, k='all')
    scorer.fit(X, y)

    ranking = pd.DataFrame({'feature_name': X.columns, 'score_value': scorer.scores_})
    return ranking.sort_values('score_value', ascending=False)

def top_features_from_sfs_LR(data):
    """
    Pick 15 features by forward sequential selection around logistic regression.

    Candidate subsets are scored with 5-fold cross-validation; the wrapped
    LogisticRegression runs with max_iter=1000.

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.

    Returns:
        tuple: (selected_features, top_feature) where `selected_features` is a
        NumPy array of the chosen column names and `top_feature` is the feature
        frame restricted to those columns (no 'label' column).
    """
    features = data.drop('label', axis=1)
    target = data['label']

    selector = SequentialFeatureSelector(
        LogisticRegression(max_iter=1000),
        n_features_to_select=15,
        direction="forward",
        cv=5,
    )
    selector.fit(features, target)

    selected_features = np.array(features.columns)[selector.get_support()]
    return selected_features, features[selected_features]

def subdataset_by_sfs_GNB(data):
    """
    Pick 15 features by forward sequential selection around Gaussian naive Bayes.

    Candidate subsets are scored with 3-fold cross-validation, evaluated in
    parallel (n_jobs=-1).

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.

    Returns:
        pd.DataFrame: The selected feature columns followed by 'label'.
    """
    features = data.drop('label', axis=1)
    target = data['label']

    selector = SequentialFeatureSelector(
        GaussianNB(),
        n_features_to_select=15,
        direction="forward",
        cv=3,
        n_jobs=-1,
    )
    selector.fit(features, target)

    chosen_columns = np.array(features.columns)[selector.get_support()].tolist()
    return data[chosen_columns + ['label']]

def subdataset_by_rf(data, k=15, random_state=42):
    """
    Keep the k features with the highest random-forest importance.

    The forest is seeded so that the selected subset is reproducible between
    runs, matching the other random-forest based helpers in this notebook.

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.
        k (int): How many top-ranked features to retain (default is 15).
        random_state (int | None): Seed for the RandomForestClassifier
            (default is 42). Pass None for an unseeded forest.

    Returns:
        pd.DataFrame: The k selected columns (most important first) followed
        by 'label'.
    """
    features = data.drop('label', axis=1)
    target = data['label']

    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(features, target)

    feature_scores = pd.DataFrame({
        'feature_name': features.columns,
        'importance_score': forest.feature_importances_,
    })
    top_features = feature_scores.sort_values(by='importance_score', ascending=False).head(k)

    return data[top_features['feature_name'].tolist() + ['label']]

def subdataset_by_lr(data):
    """
    Keep the 15 features with the largest absolute logistic-regression coefficient.

    Only the first row of the fitted coefficient matrix is used for ranking.

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.

    Returns:
        pd.DataFrame: The 15 selected columns (largest coefficient first)
        followed by 'label'.
    """
    features = data.drop('label', axis=1)
    target = data['label']

    # max_iter raised from the library default to help the solver converge.
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(features, target)

    ranking = pd.DataFrame({
        'feature_name': features.columns,
        'importance_score': np.abs(classifier.coef_)[0],
    }).sort_values(by='importance_score', ascending=False)

    top_names = ranking.head(15)['feature_name'].tolist()
    return data[top_names + ['label']]


def subdataset_by_pca(data):
    """
    Project the standardized features onto principal components.

    The number of components is chosen by PCA(n_components=0.85), i.e. enough
    components to retain 85% of the variance of the standardized features.

    Parameters:
        data (pd.DataFrame): Frame holding the features plus a 'label' column.

    Returns:
        pd.DataFrame: Columns 'PC1'..'PCn' plus 'label', on a fresh 0-based index.
    """
    features = data.drop('label', axis=1)
    target = data['label']

    # PCA is scale-sensitive, so standardize first.
    standardized = StandardScaler().fit_transform(features)

    pca = PCA(n_components=0.85)
    components = pca.fit_transform(standardized)

    component_names = [f'PC{number}' for number in range(1, pca.n_components_ + 1)]
    pca_df = pd.DataFrame(components, columns=component_names)

    # The component frame has a default index, so the target is re-indexed to match.
    pca_df['label'] = target.reset_index(drop=True)
    return pca_df

def remove_outliers_iqr(df, k=1.5, exclude=('label',)):
    """
    Drop every row that has an IQR outlier in at least one inspected column.

    A value is an outlier when it lies below Q1 - k*IQR or above Q3 + k*IQR of
    its column. Columns named in `exclude` are never inspected, so the target
    column cannot cause rows to be discarded: for an imbalanced binary label
    the IQR is 0 and every minority-class row would otherwise be removed.

    Parameters:
        df (pd.DataFrame): Input frame.
        k (float): Whisker multiplier applied to the IQR (default is 1.5).
        exclude (str | iterable of str | None): Column name(s) left out of the
            outlier test (default is the 'label' column). Names that are not
            present in `df` are ignored. Pass () or None to test every column.

    Returns:
        pd.DataFrame: The rows of `df` without outliers, all columns retained,
        original index preserved.
    """
    if exclude is None:
        skipped = set()
    elif isinstance(exclude, str):
        skipped = {exclude}
    else:
        skipped = set(exclude)

    inspected_columns = [column for column in df.columns if column not in skipped]
    if not inspected_columns:
        return df.copy()  # nothing to test, so nothing to remove

    inspected = df[inspected_columns]
    q1 = inspected.quantile(0.25)
    q3 = inspected.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    has_outlier = ((inspected < lower_bound) | (inspected > upper_bound)).any(axis=1)
    return df[~has_outlier]

def z_score_transformation(df_train, df_test):
    """
    Standardize train and test features with statistics learned from train only.

    Parameters:
        df_train: Training features used to fit the StandardScaler.
        df_test: Test features transformed with the fitted scaler.

    Returns:
        tuple: (scaled_train, scaled_test) as returned by the scaler's transform.
    """
    fitted_scaler = StandardScaler().fit(df_train)
    return fitted_scaler.transform(df_train), fitted_scaler.transform(df_test)

def min_max_transformation(df_train, df_test):
    """
    Min-max scale train and test features with ranges learned from train only.

    Parameters:
        df_train: Training features used to fit the MinMaxScaler.
        df_test: Test features transformed with the fitted scaler.

    Returns:
        tuple: (scaled_train, scaled_test) as returned by the scaler's transform.
    """
    fitted_scaler = MinMaxScaler().fit(df_train)
    return fitted_scaler.transform(df_train), fitted_scaler.transform(df_test)

def log_transformation(df_train, df_test):
    """
    Apply the element-wise log(1 + x) transform to both partitions.

    The transform is stateless, so nothing is fitted on the training data.

    Parameters:
        df_train: Training features.
        df_test: Test features.

    Returns:
        tuple: (transformed_train, transformed_test)
    """
    return np.log1p(df_train), np.log1p(df_test)

def gmean_score(y_true, y_pred, average='macro'):
    """
    Geometric mean of recall and specificity, computed one-vs-rest per class.

    Args:
        y_true: True labels (array-like).
        y_pred: Predicted labels (array-like).
        average: 'macro' for the plain mean over classes, 'weighted' for a mean
            weighted by each class's number of true samples, or None for the
            per-class values.

    Returns:
        G-mean score (scalar, or a list with one value per class if average=None).

    Raises:
        ValueError: If `average` is not 'macro', 'weighted' or None.
    """
    def _safe_ratio(numerator, denominator):
        # A class with an empty denominator contributes 0.0 instead of dividing by zero.
        return numerator / denominator if denominator != 0 else 0.0

    cm = confusion_matrix(y_true, y_pred)
    grand_total = cm.sum()

    gmeans, supports = [], []
    for class_idx in range(cm.shape[0]):
        tp = cm[class_idx, class_idx]
        fn = cm[class_idx, :].sum() - tp   # rest of the true-class row
        fp = cm[:, class_idx].sum() - tp   # rest of the predicted-class column
        tn = grand_total - tp - fn - fp

        product = _safe_ratio(tp, tp + fn) * _safe_ratio(tn, tn + fp)
        gmeans.append(np.sqrt(product) if product >= 0 else 0.0)
        supports.append(tp + fn)  # number of true samples of this class

    if average is None:
        return gmeans
    if average == 'macro':
        return np.mean(gmeans)
    if average == 'weighted':
        return np.average(gmeans, weights=supports)
    raise ValueError("Invalid averaging method. Use 'macro', 'weighted', or None.")

def specificity_score(y_true, y_pred, average='macro'):
    """
    Specificity (true negative rate), computed one-vs-rest per class.

    Args:
        y_true: Array of true labels.
        y_pred: Array of predicted labels.
        average: 'macro' for the plain mean over classes, 'weighted' for a mean
            weighted by each class's number of actual negatives, or None for
            the per-class values.

    Returns:
        Specificity score (scalar, or a list with one value per class if average=None).

    Raises:
        ValueError: If `average` is not 'macro', 'weighted' or None.
    """
    cm = confusion_matrix(y_true, y_pred)
    grand_total = cm.sum()
    row_totals = cm.sum(axis=1)   # true samples per class
    col_totals = cm.sum(axis=0)   # predicted samples per class

    specificities = []
    for class_idx in range(cm.shape[0]):
        diagonal = cm[class_idx, class_idx]
        fp = col_totals[class_idx] - diagonal
        # Everything outside row and column class_idx; the diagonal cell was
        # subtracted twice, so it is added back once.
        tn = grand_total - row_totals[class_idx] - col_totals[class_idx] + diagonal

        denominator = tn + fp
        specificities.append(tn / denominator if denominator != 0 else 0.0)

    if average is None:
        return specificities
    if average == 'macro':
        return np.mean(specificities)
    if average == 'weighted':
        negatives_per_class = grand_total - row_totals
        return np.average(specificities, weights=negatives_per_class)
    raise ValueError("Invalid averaging method. Use 'macro', 'weighted', or None.")

def classification_metrics(X_labeled, X_unlabeled, y_labeled, y_unlabeled, output_file="classification_metrics.csv"):
    """
    Evaluate a fixed panel of classifiers and write one CSV row of metrics per model.

    Each model is fitted on (X_labeled, y_labeled). The hard-label metrics
    (accuracy, kappa, precision, recall, specificity, F1, G-mean, Jaccard,
    Hamming, MCC) are computed from 5-fold cross-validated predictions on
    (X_unlabeled, y_unlabeled). AUROC and log loss are computed from the
    scores that the model fitted on the labeled data produces for X_unlabeled.

    NOTE(review): cross_val_predict refits clones of the estimator on folds of
    X_unlabeled, so the hard-label columns do not evaluate the model fitted on
    X_labeled, while AUROC/Logloss do. The two groups of columns therefore
    follow different protocols — confirm which one is intended.

    Parameters:
        X_labeled: Features used to fit each model.
        X_unlabeled: Features the metrics are computed on.
        y_labeled: Targets matching X_labeled.
        y_unlabeled: Targets matching X_unlabeled. The probability column used
            for AUROC/Logloss is column 1 of predict_proba, which presumes a
            binary target — TODO confirm for other label sets.
        output_file (str): Path of the CSV file to (over)write.

    Returns:
        None. The metrics are written to `output_file` and a confirmation line
        is printed.
    """
    seed = 42  # every stochastic estimator is seeded so reruns give the same table

    models = {
        "Linear SVC": LinearSVC(random_state=seed, max_iter=5000),
        "Random Forest": RandomForestClassifier(random_state=seed),
        "Gradient Boosting": GradientBoostingClassifier(random_state=seed),
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(random_state=seed),
        "Naive Bayes": GaussianNB(),
        "AdaBoost": AdaBoostClassifier(random_state=seed),
        "Ridge Classifier": RidgeClassifier(),
        "Perceptron": Perceptron(random_state=seed),
        "SGDClassifier": SGDClassifier(random_state=seed),
        "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=seed),
        "ExtraTreesClassifier": ExtraTreesClassifier(random_state=seed),
        "BaggingClassifier": BaggingClassifier(random_state=seed),
        "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        "LGBMClassifier": LGBMClassifier(verbosity=-1)
    }

    header = ["Model", "Accuracy", "Cohen-Kappa", "Precision", "Recall", "Specificity", "F1-Score", "G-Mean", "AUROC", "Logloss", "Jaccard", "Hamming","MCC"]

    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)

        for name, model in models.items():
            model.fit(X_labeled, y_labeled)
            y_pred = cross_val_predict(model, X_unlabeled, y_unlabeled, cv=5)

            # Continuous scores from the fitted model: probabilities when the
            # estimator offers them, otherwise 1-D decision margins, which are
            # enough for AUROC but cannot be used for log loss.
            y_proba = None
            y_score = None
            if hasattr(model, 'predict_proba'):
                y_proba = model.predict_proba(X_unlabeled)[:, 1]
                y_score = y_proba
            elif hasattr(model, 'decision_function'):
                margins = np.asarray(model.decision_function(X_unlabeled))
                if margins.ndim == 1:
                    y_score = margins

            accuracy = accuracy_score(y_unlabeled, y_pred)
            cohen_kappa = cohen_kappa_score(y_unlabeled, y_pred)
            precision = precision_score(y_unlabeled, y_pred, average='weighted')
            recall = recall_score(y_unlabeled, y_pred, average='weighted')
            specificity = specificity_score(y_unlabeled, y_pred, average='weighted')
            f_measure = f1_score(y_unlabeled, y_pred, average='weighted')
            gmean = gmean_score(y_unlabeled, y_pred,  average='weighted')

            if y_score is not None:
                roc_auc = roc_auc_score(y_unlabeled, y_score, multi_class='ovr', average='weighted')
            else:
                roc_auc = np.nan  # no continuous score available for this model

            if y_proba is not None:
                logloss = log_loss(y_unlabeled, y_proba)
            else:
                logloss = np.nan  # log loss needs calibrated probabilities

            jaccard = jaccard_score(y_unlabeled, y_pred, average='weighted')
            hamming = hamming_loss(y_unlabeled, y_pred)
            mcc = matthews_corrcoef(y_unlabeled, y_pred)

            writer.writerow([name, accuracy, cohen_kappa, precision, recall, specificity, f_measure, gmean, roc_auc, logloss, jaccard, hamming, mcc])

    print(f"Classification metrics saved to {output_file}")

In [4]:
# Load the eight datasets data_fs_1 .. data_fs_8 from remote CSV files.
# Files 1 and 8 are read from media.githubusercontent.com, files 2-7 from
# raw.githubusercontent.com.
data_fs_1 = pd.read_csv("https://media.githubusercontent.com/media/muajnstu/phishingDataAfteFS/refs/heads/main/data_fs_1.csv")
data_fs_2 = pd.read_csv("https://raw.githubusercontent.com/muajnstu/phishingDataAfteFS/refs/heads/main/data_fs_2.csv")
data_fs_3 = pd.read_csv("https://raw.githubusercontent.com/muajnstu/phishingDataAfteFS/refs/heads/main/data_fs_3.csv")
data_fs_4 = pd.read_csv("https://raw.githubusercontent.com/muajnstu/phishingDataAfteFS/refs/heads/main/data_fs_4.csv")
data_fs_5 = pd.read_csv("https://raw.githubusercontent.com/muajnstu/phishingDataAfteFS/refs/heads/main/data_fs_5.csv")
data_fs_6 = pd.read_csv("https://raw.githubusercontent.com/muajnstu/phishingDataAfteFS/refs/heads/main/data_fs_6.csv")
data_fs_7 = pd.read_csv("https://raw.githubusercontent.com/muajnstu/phishingDataAfteFS/refs/heads/main/data_fs_7.csv")
data_fs_8 = pd.read_csv("https://media.githubusercontent.com/media/muajnstu/phishingDataAfteFS/refs/heads/main/data_fs_8.csv")

In [5]:
# Build an outlier-filtered copy of each of the eight datasets and report how
# many rows survive the filter.
for i in range(1, 9):
    original_data = globals()[f"data_fs_{i}"]
    cleaned_data = remove_outliers_iqr(original_data)

    # Published as the module-level name data_fs_<i>_no_outliers for later cells.
    globals()[f"data_fs_{i}_no_outliers"] = cleaned_data

    print(f"Dataset{i} shape:", original_data.shape)
    print(f"Dataset{i} shape after removing outliers using IQR method:", cleaned_data.shape)
    print()

Dataset1 shape: (235795, 43)
Dataset1 shape after removing outliers using IQR method: (26872, 43)

Dataset2 shape: (235795, 11)
Dataset2 shape after removing outliers using IQR method: (182259, 11)

Dataset3 shape: (235795, 16)
Dataset3 shape after removing outliers using IQR method: (89135, 16)

Dataset4 shape: (235795, 16)
Dataset4 shape after removing outliers using IQR method: (114027, 16)

Dataset5 shape: (235795, 16)
Dataset5 shape after removing outliers using IQR method: (90421, 16)

Dataset6 shape: (235795, 16)
Dataset6 shape after removing outliers using IQR method: (110147, 16)

Dataset7 shape: (235795, 16)
Dataset7 shape after removing outliers using IQR method: (164007, 16)

Dataset8 shape: (235795, 27)
Dataset8 shape after removing outliers using IQR method: (135302, 27)



In [6]:
# Train/test split of the complete dataset (all columns of `data` except 'label' as features).
X_train, X_test, y_train, y_test = datapartition(data)

In [None]:
# No output path is given, so the metrics go to the default "classification_metrics.csv".
classification_metrics(X_train, X_test, y_train, y_test)

Classification metrics saved to classification_metrics.csv


In [7]:
# Split each dataset data_fs_<i> and publish the four parts as module-level
# names X_train_data_fs_<i>, X_test_data_fs_<i>, y_train_data_fs_<i>, y_test_data_fs_<i>.
for i in range(1, 9):
    # Data partitioning
    (globals()[f"X_train_data_fs_{i}"],
     globals()[f"X_test_data_fs_{i}"],
     globals()[f"y_train_data_fs_{i}"],
     globals()[f"y_test_data_fs_{i}"]) = datapartition(globals()[f"data_fs_{i}"])

In [None]:
# Evaluate the model panel on each data_fs_<i> split; one CSV file per dataset.
for i in range(1, 9):
    # Metric calculation
    classification_metrics(
        globals()[f"X_train_data_fs_{i}"],
        globals()[f"X_test_data_fs_{i}"],
        globals()[f"y_train_data_fs_{i}"],
        globals()[f"y_test_data_fs_{i}"],
        f"classification_metrics_fs{i}.csv"
    )

Classification metrics saved to classification_metrics_fs1.csv
Classification metrics saved to classification_metrics_fs2.csv
Classification metrics saved to classification_metrics_fs3.csv
Classification metrics saved to classification_metrics_fs4.csv
Classification metrics saved to classification_metrics_fs5.csv
Classification metrics saved to classification_metrics_fs6.csv
Classification metrics saved to classification_metrics_fs7.csv
Classification metrics saved to classification_metrics_fs8.csv


In [8]:
# Split each outlier-filtered dataset data_fs_<i>_no_outliers and publish the
# four parts as module-level names X_train_data_fs_no_<i>, X_test_data_fs_no_<i>,
# y_train_data_fs_no_<i>, y_test_data_fs_no_<i>.
for i in range(1, 9):
    # Data partitioning
    (globals()[f"X_train_data_fs_no_{i}"],
     globals()[f"X_test_data_fs_no_{i}"],
     globals()[f"y_train_data_fs_no_{i}"],
     globals()[f"y_test_data_fs_no_{i}"]) = datapartition(globals()[f"data_fs_{i}_no_outliers"])

In [None]:
# Evaluate the model panel on each outlier-filtered split; one CSV file per dataset.
for i in range(1, 9):
    # Classification metrics
    classification_metrics(
        globals()[f"X_train_data_fs_no_{i}"],
        globals()[f"X_test_data_fs_no_{i}"],
        globals()[f"y_train_data_fs_no_{i}"],
        globals()[f"y_test_data_fs_no_{i}"],
        f"classification_metrics_fs{i}_no_outliers.csv"
    )

Classification metrics saved to classification_metrics_fs1_no_outliers.csv
Classification metrics saved to classification_metrics_fs2_no_outliers.csv
Classification metrics saved to classification_metrics_fs3_no_outliers.csv
Classification metrics saved to classification_metrics_fs4_no_outliers.csv
Classification metrics saved to classification_metrics_fs5_no_outliers.csv
Classification metrics saved to classification_metrics_fs6_no_outliers.csv
Classification metrics saved to classification_metrics_fs7_no_outliers.csv
Classification metrics saved to classification_metrics_fs8_no_outliers.csv


In [None]:
# Z-score scale the current X_train / X_test (scaler fitted on X_train only) and evaluate.
# NOTE(review): relies on X_train/X_test/y_train/y_test still holding the
# full-dataset split when this cell runs — confirm execution order.
X_train_scaled, X_test_scaled =z_score_transformation(X_train, X_test)
classification_metrics(X_train_scaled, X_test_scaled, y_train, y_test,"classification_metrics_scaled.csv")



Classification metrics saved to classification_metrics_scaled.csv


In [None]:
# Z-score scale each data_fs_<i> split and evaluate the model panel on it.
# Loop-local names (suffix `_fs`) are used on purpose: binding X_train, X_test,
# y_train or y_test here would overwrite the full-dataset split that other
# cells read from those names.
for i in range(1, 9):
    X_train_fs = globals()[f"X_train_data_fs_{i}"]
    X_test_fs = globals()[f"X_test_data_fs_{i}"]
    y_train_fs = globals()[f"y_train_data_fs_{i}"]
    y_test_fs = globals()[f"y_test_data_fs_{i}"]

    # Apply z-scaling (scaler fitted on the training part only)
    X_train_fs_scaled, X_test_fs_scaled = z_score_transformation(X_train_fs, X_test_fs)

    # Store scaled data back to variables (optional if not needed later)
    globals()[f"X_train_scaled_fs_{i}"] = X_train_fs_scaled
    globals()[f"X_test_scaled_fs_{i}"] = X_test_fs_scaled

    # Generate metrics filename
    metrics_file = f"classification_metrics_scaled_fs{i}.csv"

    # Calculate and save metrics
    classification_metrics(X_train_fs_scaled, X_test_fs_scaled, y_train_fs, y_test_fs, metrics_file)



Classification metrics saved to classification_metrics_scaled_fs1.csv




Classification metrics saved to classification_metrics_scaled_fs2.csv




Classification metrics saved to classification_metrics_scaled_fs3.csv




Classification metrics saved to classification_metrics_scaled_fs4.csv




Classification metrics saved to classification_metrics_scaled_fs5.csv




Classification metrics saved to classification_metrics_scaled_fs6.csv




Classification metrics saved to classification_metrics_scaled_fs7.csv




Classification metrics saved to classification_metrics_scaled_fs8.csv


In [None]:
# Z-score scale each outlier-filtered split and evaluate the model panel on it.
# Loop-local names (suffix `_fs`) are used on purpose: binding X_train, X_test,
# y_train or y_test here would overwrite the full-dataset split that other
# cells read from those names.
for i in range(1, 9):
    X_train_fs = globals()[f"X_train_data_fs_no_{i}"]
    X_test_fs = globals()[f"X_test_data_fs_no_{i}"]
    y_train_fs = globals()[f"y_train_data_fs_no_{i}"]
    y_test_fs = globals()[f"y_test_data_fs_no_{i}"]

    # Apply z-scaling (scaler fitted on the training part only)
    X_train_fs_scaled, X_test_fs_scaled = z_score_transformation(X_train_fs, X_test_fs)

    # Store scaled data back to variables (optional if not needed later)
    globals()[f"X_train_scaled_fs_no_{i}"] = X_train_fs_scaled
    globals()[f"X_test_scaled_fs_no_{i}"] = X_test_fs_scaled

    # Generate metrics filename
    metrics_file = f"classification_metrics_scaled_fs{i}_no_outliers.csv"

    # Calculate and save metrics
    classification_metrics(X_train_fs_scaled, X_test_fs_scaled, y_train_fs, y_test_fs, metrics_file)



Classification metrics saved to classification_metrics_scaled_fs1_no_outliers.csv




Classification metrics saved to classification_metrics_scaled_fs2_no_outliers.csv




Classification metrics saved to classification_metrics_scaled_fs3_no_outliers.csv




Classification metrics saved to classification_metrics_scaled_fs4_no_outliers.csv




Classification metrics saved to classification_metrics_scaled_fs5_no_outliers.csv




Classification metrics saved to classification_metrics_scaled_fs6_no_outliers.csv




Classification metrics saved to classification_metrics_scaled_fs7_no_outliers.csv




Classification metrics saved to classification_metrics_scaled_fs8_no_outliers.csv


In [None]:
# Min-max scale the complete dataset and evaluate the model panel on it.
# The full-dataset split is rebuilt here (datapartition is seeded, so this is
# the same split every time) so the cell does not depend on whatever
# X_train / X_test / y_train / y_test were last bound to by earlier cells.
X_train, X_test, y_train, y_test = datapartition(data)
X_train_mxscaled, X_test_mxscaled = min_max_transformation(X_train, X_test)
classification_metrics(X_train_mxscaled, X_test_mxscaled, y_train, y_test,"classification_metrics_mxscaled.csv")



Classification metrics saved to classification_metrics_mxscaled.csv


In [None]:
# Min-max scale each data_fs_<i> split and evaluate the model panel on it.
# Loop-local names (suffix `_fs`) are used on purpose: binding X_train, X_test,
# y_train or y_test here would overwrite the full-dataset split that other
# cells read from those names.
for i in range(1, 9):
    X_train_fs = globals()[f"X_train_data_fs_{i}"]
    X_test_fs = globals()[f"X_test_data_fs_{i}"]
    y_train_fs = globals()[f"y_train_data_fs_{i}"]
    y_test_fs = globals()[f"y_test_data_fs_{i}"]

    # Apply min-max scaling (scaler fitted on the training part only)
    X_train_fs_scaled, X_test_fs_scaled = min_max_transformation(X_train_fs, X_test_fs)

    # Store scaled data back to variables (optional if not needed later)
    globals()[f"X_train_mxscaled_fs_{i}"] = X_train_fs_scaled
    globals()[f"X_test_mxscaled_fs_{i}"] = X_test_fs_scaled

    # Generate metrics filename
    metrics_file = f"classification_metrics_mxscaled_fs{i}.csv"

    # Calculate and save metrics
    classification_metrics(X_train_fs_scaled, X_test_fs_scaled, y_train_fs, y_test_fs, metrics_file)



Classification metrics saved to classification_metrics_mxscaled_fs1.csv




Classification metrics saved to classification_metrics_mxscaled_fs2.csv




Classification metrics saved to classification_metrics_mxscaled_fs3.csv




Classification metrics saved to classification_metrics_mxscaled_fs4.csv




Classification metrics saved to classification_metrics_mxscaled_fs5.csv




Classification metrics saved to classification_metrics_mxscaled_fs6.csv




Classification metrics saved to classification_metrics_mxscaled_fs7.csv




Classification metrics saved to classification_metrics_mxscaled_fs8.csv


In [None]:
# Min-max scale each outlier-filtered split and evaluate the model panel on it.
# Loop-local names (suffix `_fs`) are used on purpose: binding X_train, X_test,
# y_train or y_test here would overwrite the full-dataset split that other
# cells read from those names.
for i in range(1, 9):
    X_train_fs = globals()[f"X_train_data_fs_no_{i}"]
    X_test_fs = globals()[f"X_test_data_fs_no_{i}"]
    y_train_fs = globals()[f"y_train_data_fs_no_{i}"]
    y_test_fs = globals()[f"y_test_data_fs_no_{i}"]

    # Apply min-max scaling (scaler fitted on the training part only)
    X_train_fs_scaled, X_test_fs_scaled = min_max_transformation(X_train_fs, X_test_fs)

    # Store scaled data back to variables (optional if not needed later)
    globals()[f"X_train_mxscaled_fs_no_{i}"] = X_train_fs_scaled
    globals()[f"X_test_mxscaled_fs_no_{i}"] = X_test_fs_scaled

    # Generate metrics filename
    metrics_file = f"classification_metrics_mxscaled_fs{i}_no_outliers.csv"

    # Calculate and save metrics
    classification_metrics(X_train_fs_scaled, X_test_fs_scaled, y_train_fs, y_test_fs, metrics_file)



Classification metrics saved to classification_metrics_mxscaled_fs1_no_outliers.csv




Classification metrics saved to classification_metrics_mxscaled_fs2_no_outliers.csv




Classification metrics saved to classification_metrics_mxscaled_fs3_no_outliers.csv




Classification metrics saved to classification_metrics_mxscaled_fs4_no_outliers.csv




Classification metrics saved to classification_metrics_mxscaled_fs5_no_outliers.csv




Classification metrics saved to classification_metrics_mxscaled_fs6_no_outliers.csv




Classification metrics saved to classification_metrics_mxscaled_fs7_no_outliers.csv




Classification metrics saved to classification_metrics_mxscaled_fs8_no_outliers.csv




In [9]:
# Log-transform the complete dataset.
# The full-dataset split is rebuilt here (datapartition is seeded, so this is
# the same split every time) so that X_train / X_test / y_train / y_test are
# guaranteed to describe the complete dataset, whatever earlier cells bound
# those names to.
X_train, X_test, y_train, y_test = datapartition(data)
X_train_logscaled, X_test_logscaled = log_transformation(X_train, X_test)

In [10]:
# Evaluate the model panel on the log-transformed features; y_train / y_test are
# read from the module-level names and must match the split that was transformed.
classification_metrics(X_train_logscaled, X_test_logscaled, y_train, y_test,"classification_metrics_logscaled.csv")

Classification metrics saved to classification_metrics_logscaled.csv


In [11]:
# Log-transform each data_fs_<i> split and evaluate the model panel on it.
# Loop-local names (suffix `_fs`) are used on purpose: binding X_train, X_test,
# y_train or y_test here would overwrite the full-dataset split that other
# cells read from those names.
# NOTE(review): range(1, 8) covers datasets 1-7 only, while the z-score and
# min-max loops cover 1-8 — confirm that skipping dataset 8 is intentional.
for i in range(1, 8):
    X_train_fs = globals()[f"X_train_data_fs_{i}"]
    X_test_fs = globals()[f"X_test_data_fs_{i}"]
    y_train_fs = globals()[f"y_train_data_fs_{i}"]
    y_test_fs = globals()[f"y_test_data_fs_{i}"]

    X_train_fs_log, X_test_fs_log = log_transformation(X_train_fs, X_test_fs)

    # Keep the transformed parts available under per-dataset names.
    globals()[f"X_train_logscaled_fs_{i}"] = X_train_fs_log
    globals()[f"X_test_logscaled_fs_{i}"] = X_test_fs_log

    metrics_file = f"classification_metrics_logscaled_fs{i}.csv"
    classification_metrics(X_train_fs_log, X_test_fs_log, y_train_fs, y_test_fs, metrics_file)

Classification metrics saved to classification_metrics_logscaled_fs1.csv
Classification metrics saved to classification_metrics_logscaled_fs2.csv
Classification metrics saved to classification_metrics_logscaled_fs3.csv
Classification metrics saved to classification_metrics_logscaled_fs4.csv
Classification metrics saved to classification_metrics_logscaled_fs5.csv
Classification metrics saved to classification_metrics_logscaled_fs6.csv
Classification metrics saved to classification_metrics_logscaled_fs7.csv


In [12]:
# Log-transform each outlier-filtered split and evaluate the model panel on it.
# Loop-local names (suffix `_fs`) are used on purpose: binding X_train, X_test,
# y_train or y_test here would overwrite the full-dataset split that other
# cells read from those names.
# NOTE(review): range(1, 8) covers datasets 1-7 only, while the z-score and
# min-max loops cover 1-8 — confirm that skipping dataset 8 is intentional.
for i in range(1, 8):
    X_train_fs = globals()[f"X_train_data_fs_no_{i}"]
    X_test_fs = globals()[f"X_test_data_fs_no_{i}"]
    y_train_fs = globals()[f"y_train_data_fs_no_{i}"]
    y_test_fs = globals()[f"y_test_data_fs_no_{i}"]

    X_train_fs_log, X_test_fs_log = log_transformation(X_train_fs, X_test_fs)

    # Keep the transformed parts available under per-dataset names.
    globals()[f"X_train_logscaled_fs_no_{i}"] = X_train_fs_log
    globals()[f"X_test_logscaled_fs_no_{i}"] = X_test_fs_log

    metrics_file = f"classification_metrics_logscaled_fs{i}_no_outliers.csv"
    classification_metrics(X_train_fs_log, X_test_fs_log, y_train_fs, y_test_fs, metrics_file)

Classification metrics saved to classification_metrics_logscaled_fs1_no_outliers.csv
Classification metrics saved to classification_metrics_logscaled_fs2_no_outliers.csv
Classification metrics saved to classification_metrics_logscaled_fs3_no_outliers.csv
Classification metrics saved to classification_metrics_logscaled_fs4_no_outliers.csv
Classification metrics saved to classification_metrics_logscaled_fs5_no_outliers.csv
Classification metrics saved to classification_metrics_logscaled_fs6_no_outliers.csv
Classification metrics saved to classification_metrics_logscaled_fs7_no_outliers.csv
