<a href="https://colab.research.google.com/github/shahriariit/phishingTL/blob/main/New_Code_Creating_Feature_Selection_SubDatasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import subprocess
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation

# Scikit-learn classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier
)
from sklearn.linear_model import (
    LogisticRegression, RidgeClassifier, Perceptron, SGDClassifier, PassiveAggressiveClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier

# Regression models (incorrectly used as classifiers)
from sklearn.linear_model import Lasso, ElasticNet  # These are not classifiers, they are regressors

# External libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier

# Named (label, estimator) pairs: a random forest and a gradient-boosting
# model, both seeded with random_state=42 so their fits are repeatable.
# NOTE(review): presumably meant as the `estimators` argument of the
# StackingClassifier / VotingClassifier imported above; nothing in the
# visible cells consumes this list - confirm it is still needed.
base_estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
]

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE, SequentialFeatureSelector
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, auc
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score, recall_score, f1_score,
    classification_report, roc_curve, roc_auc_score, log_loss, jaccard_score,
    hamming_loss, matthews_corrcoef, cohen_kappa_score, hinge_loss
)
from sklearn.model_selection import cross_val_predict, cross_val_score
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [None]:
# Download the PhiUSIIL phishing CSV from GitHub and load it into a DataFrame.
# Every selector defined below expects this frame to contain a 'label' column.
# NOTE(review): the selectors feed all other columns straight into
# scikit-learn, so this file is presumably already fully numeric - confirm.
url = 'https://media.githubusercontent.com/media/shahriariit/opendataset/master/PhiUSIIL_phishing.csv'
data = pd.read_csv(url)

In [None]:
def subdataset_by_correlation_analysis(data, threshold=0.9):
    """
    Drop redundant features that are strongly correlated with an earlier one.

    The pairwise correlation matrix of the feature columns is computed with
    ``DataFrame.corr()``. A column is removed when its absolute correlation
    with any column that precedes it in the dataset is at least ``threshold``;
    the earlier column of each such pair is kept, so a group of mutually
    correlated features is represented by its first member instead of being
    deleted entirely.

    Parameters:
        data (pd.DataFrame): The input dataset; must contain a 'label' column.
        threshold (float): Absolute correlation at or above which the later
            feature of a pair is removed (default is 0.9).

    Returns:
        pd.DataFrame: The surviving feature columns in their original order,
        followed by the 'label' column.

    Raises:
        ValueError: If ``data`` has no 'label' column.
    """
    if 'label' not in data.columns:
        raise ValueError("Dataset must contain a 'label' column as the target variable.")

    features = data.drop(columns=['label'])
    correlation_matrix = features.corr()

    # Boolean matrix of strongly (positively or negatively) correlated pairs.
    # NaN correlations (e.g. constant columns) compare as False and are ignored.
    strong = correlation_matrix.abs().to_numpy() >= threshold

    # Keep only entries above the diagonal so each pair is seen once, as
    # (earlier row, later column); the diagonal self-correlation is excluded.
    later_is_redundant = np.triu(strong, k=1).any(axis=0)
    columns_to_drop = correlation_matrix.columns[later_is_redundant]

    reduced_data = features.drop(columns=columns_to_drop)
    reduced_data['label'] = data['label'].values  # Add label back

    return reduced_data

def subdataset_by_kbest(data, k=10):
    """
    Build a sub-dataset from the K features with the best ANOVA F-scores.

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.
        k (int): How many top-scoring features to keep (default is 10).

    Returns:
        pd.DataFrame: The K selected feature columns plus the 'label' column.

    Raises:
        ValueError: If ``data`` has no 'label' column.
    """
    if 'label' not in data.columns:
        raise ValueError("Dataset must contain a 'label' column as the target variable.")

    target = data['label']
    features = data.drop(columns=['label'])

    # Score every feature against the target and flag the k best ones.
    support_mask = SelectKBest(score_func=f_classif, k=k).fit(features, target).get_support()
    kept_names = features.columns[support_mask]

    # Copy so that attaching the label does not write into a slice of `data`.
    reduced_data = features[kept_names].copy()
    reduced_data['label'] = target.values

    return reduced_data

def subdataset_by_rfe(data):
    """
    Build a 15-feature sub-dataset with Recursive Feature Elimination (RFE).

    A random forest (random_state=42) ranks the features; RFE is configured
    with step=5 and stops once 15 features remain.

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.

    Returns:
        pd.DataFrame: The 15 selected feature columns plus the 'label' column.
    """
    features = data.drop('label', axis=1)
    target = data['label']

    eliminator = RFE(
        estimator=RandomForestClassifier(random_state=42),
        n_features_to_select=15,
        step=5,
    )
    eliminator.fit(features, target)

    # support_ is a boolean mask over the feature columns that survived.
    surviving = features.columns[eliminator.support_]
    new_data = features[surviving].copy()
    new_data['label'] = data['label']

    return new_data

def subdataset_by_mi(data, k=15):
    """
    Build a sub-dataset from the K features with the highest mutual information.

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.
        k (int): How many top-ranked features to keep (default is 15).

    Returns:
        pd.DataFrame: The K best-ranked feature columns, in ranking order,
        followed by the 'label' column.
    """
    # The ranking comes back sorted best-first, so the first k rows are the winners.
    ranking = top_features_from_mi(data)
    wanted_columns = ranking['feature_name'].iloc[:k].tolist() + ['label']

    return data[wanted_columns]


def top_features_from_mi(data, random_state=42):
    """
    Rank every feature by its mutual information with the label.

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.
        random_state (int | None): Seed forwarded to ``mutual_info_classif``.
            The estimator adds small random noise to continuous features, so
            an unseeded run can produce a different ranking each time. The
            default of 42 matches the seed used elsewhere in this file; pass
            None to get the unseeded behaviour.

    Returns:
        pd.DataFrame: One row per feature with columns 'feature_name' and
        'score_value', sorted by 'score_value' in descending order.
    """
    from functools import partial

    X = data.drop('label', axis=1)
    y = data['label']

    # k='all' keeps every feature: SelectKBest is used here only to score them.
    seeded_mi = partial(mutual_info_classif, random_state=random_state)
    scorer = SelectKBest(score_func=seeded_mi, k='all')
    scorer.fit(X, y)

    top_features = pd.DataFrame({
        'feature_name': X.columns,
        'score_value': scorer.scores_,
    })

    return top_features.sort_values('score_value', ascending=False)

def top_features_from_sfs_LR(data):
    """
    Pick 15 features by forward sequential selection with logistic regression.

    The selector scores candidate feature sets with 5-fold cross-validation
    and uses LogisticRegression(max_iter=1000) as the estimator.

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.

    Returns:
        tuple: ``(selected_features, top_feature)`` where ``selected_features``
        is a NumPy array of the chosen column names and ``top_feature`` is the
        DataFrame of those columns (without the 'label' column).
    """
    estimator = LogisticRegression(max_iter=1000)
    features = data.drop('label', axis=1)
    target = data['label']

    selector = SequentialFeatureSelector(
        estimator,
        n_features_to_select=15,
        direction="forward",
        cv=5,
    )
    selector.fit(features, target)

    # Boolean support mask -> array of the chosen column names.
    selected_features = np.array(features.columns)[selector.get_support()]

    return selected_features, features[selected_features]

def subdataset_by_sfs_GNB(data):
    """
    Build a 15-feature sub-dataset by forward selection with Gaussian naive Bayes.

    The selector scores candidate feature sets with 3-fold cross-validation
    and is run with n_jobs=-1.

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.

    Returns:
        pd.DataFrame: The 15 selected feature columns plus the 'label' column.
    """
    estimator = GaussianNB()
    features = data.drop('label', axis=1)
    target = data['label']

    selector = SequentialFeatureSelector(
        estimator,
        n_features_to_select=15,
        direction="forward",
        cv=3,
        n_jobs=-1,
    )
    selector.fit(features, target)

    # Translate the boolean support mask into column names and re-attach the target.
    chosen = np.array(features.columns)[selector.get_support()].tolist()
    chosen.append('label')

    return data[chosen]

def subdataset_by_rf(data, random_state=42):
    """
    Build a sub-dataset from the 15 features a random forest finds most important.

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.
        random_state (int | None): Seed for the random forest. An unseeded
            forest can rank features differently on every run, so the default
            of 42 matches the seed used by the other selectors in this file.
            Pass None to get the unseeded behaviour.

    Returns:
        pd.DataFrame: The 15 highest-importance feature columns, most
        important first, followed by the 'label' column.
    """
    M = data.drop('label', axis=1)
    n = data['label']

    model = RandomForestClassifier(random_state=random_state)
    model.fit(M, n)

    feature_scores = pd.DataFrame({
        'feature_name': M.columns,
        'importance_score': model.feature_importances_,
    })
    top_features = feature_scores.sort_values(by='importance_score', ascending=False).head(15)

    return data[top_features['feature_name'].tolist() + ['label']]

def subdataset_by_lr(data):
    """
    Build a sub-dataset from the 15 features with the largest logistic-regression weights.

    The features are standardised before fitting. A raw coefficient is
    expressed per unit of its feature, so on unscaled data its magnitude
    reflects the feature's scale rather than its importance. After
    standardisation the absolute coefficients are comparable and can be
    ranked. Only the ranking uses scaled values; the returned columns are
    taken unchanged from ``data``.

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.

    Returns:
        pd.DataFrame: The 15 top-ranked feature columns (original values),
        highest weight first, followed by the 'label' column.
    """
    M = data.drop('label', axis=1)
    n = data['label']

    # Put every feature on zero mean / unit variance so weights are comparable.
    M_scaled = StandardScaler().fit_transform(M)

    model = LogisticRegression(max_iter=1000)  # Increase iterations for convergence
    model.fit(M_scaled, n)

    # coef_ has one row per fitted decision function; the first row is used,
    # which is the only row for a binary label.
    feature_importances = np.abs(model.coef_[0])

    feature_scores = pd.DataFrame({'feature_name': M.columns, 'importance_score': feature_importances})
    top_features = feature_scores.sort_values(by='importance_score', ascending=False).head(15)

    return data[top_features['feature_name'].tolist() + ['label']]


def subdataset_by_pca(data):
    """
    Replace the features with the principal components that retain 85% of the variance.

    The features are standardised with StandardScaler before PCA is fitted
    with n_components=0.85. The resulting components are named PC1, PC2, ...

    Parameters:
        data (pd.DataFrame): Dataset holding the feature columns and 'label'.

    Returns:
        pd.DataFrame: One column per retained component plus the 'label'
        column, on a fresh 0..n-1 index.
    """
    features = data.drop('label', axis=1)
    target = data['label']

    standardized = StandardScaler().fit_transform(features)

    # A fractional n_components asks PCA to choose the component count itself.
    reducer = PCA(n_components=0.85)
    projected = reducer.fit_transform(standardized)

    component_names = [f'PC{number}' for number in range(1, reducer.n_components_ + 1)]
    pca_df = pd.DataFrame(projected, columns=component_names)

    # The projected frame has a fresh RangeIndex, so the label index is reset to match.
    pca_df['label'] = target.reset_index(drop=True)

    return pca_df

def remove_outliers_iqr(df, k=1.5, columns=None):
    """
    Drop the rows that hold an IQR outlier in any screened column.

    A value is an outlier when it lies below ``Q1 - k * IQR`` or above
    ``Q3 + k * IQR`` of its column. Missing values are never treated as
    outliers.

    Parameters:
        df (pd.DataFrame): The dataset to filter.
        k (float): Multiplier of the inter-quartile range (default is 1.5).
        columns (str | list | None): Columns to screen. With the default None,
            every numeric column except 'label' is screened. The label is the
            target, not a feature: with a class split beyond 75/25 its IQR is
            0 and screening it would delete the whole minority class.
            Non-numeric columns are skipped because they have no quantiles.

    Returns:
        pd.DataFrame: The rows of ``df`` without outliers, with all original
        columns (screened or not) preserved.
    """
    if columns is None:
        screened = df.select_dtypes(include='number').drop(columns=['label'], errors='ignore')
    elif isinstance(columns, str):
        screened = df[[columns]]
    else:
        screened = df[list(columns)]

    # Nothing to screen: every row is kept.
    if screened.shape[1] == 0:
        return df.copy()

    q1 = screened.quantile(0.25)
    q3 = screened.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    # Comparisons against NaN are False, so missing values do not flag a row.
    has_outlier = ((screened < lower_bound) | (screened > upper_bound)).any(axis=1)

    return df[~has_outlier]

In [None]:
# Build one sub-dataset per feature-selection strategy. Each result is bound to
# a numbered global (data_fs_1 .. data_fs_8) because the next cell looks the
# frames up by that name.
data_fs_1 = subdataset_by_correlation_analysis(data,0.8)  # correlation filter, threshold 0.8 instead of the 0.9 default
data_fs_2 = subdataset_by_kbest(data)  # ANOVA F-score, default k=10
data_fs_3 = subdataset_by_rfe(data)  # recursive feature elimination with a random forest
data_fs_4 = subdataset_by_mi(data)  # mutual information, default k=15
data_fs_5 = subdataset_by_rf(data)  # random-forest feature importances, top 15
data_fs_6 = subdataset_by_lr(data)  # logistic-regression coefficient magnitudes, top 15
data_fs_7 = subdataset_by_sfs_GNB(data)  # forward sequential selection with Gaussian naive Bayes
data_fs_8 = subdataset_by_pca(data)  # PCA components (n_components=0.85) instead of original columns

In [None]:
# Apply IQR outlier removal to each of the eight sub-datasets and report how
# many rows survive.
for i in range(1, 9):
    # The sub-datasets live in numbered globals, so fetch each one by name.
    original_data = globals()[f"data_fs_{i}"]
    # The frame is passed as-is (feature columns plus 'label'), default k=1.5.
    cleaned_data = remove_outliers_iqr(original_data)

    # Publish the result as data_fs_<i>_no_outliers.
    # NOTE(review): presumably read by later cells - confirm before renaming.
    globals()[f"data_fs_{i}_no_outliers"] = cleaned_data

    print(f"Dataset{i} shape:", original_data.shape)
    print(f"Dataset{i} shape after removing outliers using IQR method:", cleaned_data.shape)
    print()

Dataset1 shape: (235795, 43)
Dataset1 shape after removing outliers using IQR method: (26872, 43)

Dataset2 shape: (235795, 11)
Dataset2 shape after removing outliers using IQR method: (182259, 11)

Dataset3 shape: (235795, 16)
Dataset3 shape after removing outliers using IQR method: (89135, 16)

Dataset4 shape: (235795, 16)
Dataset4 shape after removing outliers using IQR method: (114027, 16)

Dataset5 shape: (235795, 16)
Dataset5 shape after removing outliers using IQR method: (90421, 16)

Dataset6 shape: (235795, 16)
Dataset6 shape after removing outliers using IQR method: (110147, 16)

Dataset7 shape: (235795, 16)
Dataset7 shape after removing outliers using IQR method: (164007, 16)

Dataset8 shape: (235795, 27)
Dataset8 shape after removing outliers using IQR method: (135302, 27)

