In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Shared seed: passed as `random_state` to train_test_split and to every
# classifier constructed in this notebook.
RANDOM_STATE = 42

# Input data files are available in the read-only "../input/" directory
# The code below walks that directory and stores the path of the last file it finds in `filepath`

import os
# Gather every file under the Kaggle input directory, in os.walk order.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

# Fail here with a clear message instead of leaving `filepath` unbound,
# which would only surface later as a NameError when the CSV is read.
if not input_files:
    raise FileNotFoundError("No input files found under '/kaggle/input'")

# The last file visited by the walk is the one that gets loaded.
filepath = input_files[-1]

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Creating a DataFrame out of the dataset

dataset = pd.read_csv(filepath)

# Preview the first rows, then the (rows, columns) shape, one per line.
print(dataset.head(), dataset.shape, sep="\n")

In [None]:
# importing some ML stuff

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.datasets import make_classification

In [None]:
 def get_train_test(dataset, mask_col=None, scale=False):
    """Split ``dataset`` into train/test sets and impute missing values.

    The last column of ``dataset`` is used as the target and every other
    column as a feature. The split holds out 33% of the rows and is seeded
    with ``RANDOM_STATE``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Full table, features first and target in the last column.
    mask_col : sequence of column labels, optional
        When given and non-empty, only these feature columns are kept in the
        train/test features. ``None`` or an empty sequence keeps all columns.
    scale : bool, default False
        When True, standardise the imputed features with a ``StandardScaler``
        fitted on the training split only. The default is False because the
        scaled output used to be computed and then thrown away, so every
        downstream cell has always received unscaled features; in addition,
        the chi-squared filter used later in this notebook needs non-negative
        inputs, which standardised features would violate.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X, y)``. ``X_train``/``X_test``
        are the outputs of ``SimpleImputer`` (and of the scaler when
        ``scale=True``); ``y_train``/``y_test`` are the split target;
        ``X``/``y`` are the full, unmasked features and target.
    """
    # Features are every column but the last; the last column is the target.
    X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=RANDOM_STATE)

    # Explicit None/length test: plain truthiness is ambiguous for pandas
    # Index objects and NumPy arrays, while an empty selection still means
    # "keep every column".
    if mask_col is not None and len(mask_col) > 0:
        X_train, X_test = X_train[mask_col], X_test[mask_col]

    print("Train Feature: {0}\nTest Feature: {1}\nTrain Target: {2}\nTest Target: {3}\n".format(X_train.shape, X_test.shape, y_train.shape, y_test.shape))

    print(X_test.head())

    # Impute missing values; statistics come from the training split only.
    my_imputer = SimpleImputer()
    X_train = my_imputer.fit_transform(X_train)
    X_test = my_imputer.transform(X_test)

    # Optional standardisation. The transformed arrays are assigned back so
    # the scaling actually reaches the caller.
    if scale:
        scaler = preprocessing.StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, X, y


In [None]:
# Baseline split: every feature column, no mask.
(X_train, X_test,
 y_train, y_test,
 X, y) = get_train_test(dataset)

In [None]:
def evaluate(y_pred, y_test):
    """Print accuracy, F1, precision, sensitivity and ROC AUC as percentages.

    Parameters
    ----------
    y_pred : array-like
        Labels predicted by a fitted model.
    y_test : array-like
        Ground-truth labels for the same samples.

    Returns
    -------
    None
        The scores are only printed, each rounded to two decimals.

    Notes
    -----
    scikit-learn metrics take ``(y_true, y_pred)``. Passing them the other
    way round leaves accuracy and binary F1 unchanged but swaps precision
    with recall and changes ROC AUC, so the ground truth is passed first.
    ROC AUC is computed from hard label predictions because that is all this
    function receives.
    """
    acc = round(accuracy_score(y_test, y_pred)*100, 2)
    f1 = round(f1_score(y_test, y_pred)*100, 2)
    prec = round(precision_score(y_test, y_pred)*100, 2)
    rec = round(recall_score(y_test, y_pred)*100, 2)
    try:
        roc = round(roc_auc_score(y_test, y_pred)*100, 2)
    except ValueError:
        # roc_auc_score rejected the inputs (presumably a single class in
        # the ground truth); report the score as unavailable.
        roc = 'NA'

    print("[INFO]: Accuracy: {0}".format(acc))
    print("[INFO]: F1 Score: {0}".format(f1))
    # This value is precision (TP / (TP + FP)); it was previously printed
    # under the label "Specificity", which is a different quantity.
    print("[INFO]: Precision: {0}".format(prec))
    # Recall of the positive class is the sensitivity.
    print("[INFO]: Sensitivity: {0}".format(rec))
    print("[INFO]: Area Under ROC Curve: {0}".format(roc))
    print()

In [None]:
# Baseline: fit each classifier on all features and report its test scores.

baseline_models = [
    LogisticRegression(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    MLPClassifier(random_state=RANDOM_STATE),
    SVC(random_state=RANDOM_STATE),
]

for model in baseline_models:
    print("[INFO]: Fitting", str(model), "...")

    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    evaluate(y_pred, y_test)

In [None]:
!pip install pyswarms

In [None]:
# applying only PSO for feature selection

import pyswarms as ps

# Define objective function

def f_per_particle(m, alpha):
    """Objective value for a single particle (one candidate feature mask).

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``; an SVC is trained on the selected training columns and
    scored with F1 on the test split.

    Inputs
    ------
    m : numpy.ndarray
        Mask over the columns of ``X_train``; columns where ``m == 1`` are
        kept. A mask with no non-zero entry falls back to all columns.
    alpha : float
        Weight of the classification term; ``1 - alpha`` weights the
        feature-count term. No default here; the caller supplies it.

    Returns
    -------
    float
        ``alpha * (1 - F1) + (1 - alpha) * (1 - n_selected / n_total)``

    NOTE(review): fitness is measured on the same test split that later
    cells use to report final scores - confirm this is intended.
    """
    n_total = X_train.shape[1]

    # Column selection: an all-zero mask means "use every feature".
    if np.count_nonzero(m):
        keep = m == 1
        train_part, test_part = X_train[:, keep], X_test[:, keep]
    else:
        train_part, test_part = X_train, X_test

    # Classification performance P of an SVC on the chosen columns.
    clf = SVC(random_state=RANDOM_STATE)
    clf.fit(train_part, y_train)
    P = f1_score(clf.predict(test_part), y_test)

    # Weighted sum of the error term and the feature-count term.
    error_term = alpha * (1.0 - P)
    size_term = (1.0 - alpha) * (1 - (train_part.shape[1] / n_total))

    return error_term + size_term

In [None]:
def f(x, alpha=0.88):
    """Swarm-level objective: score every particle with ``f_per_particle``.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        One row per particle.
    alpha: float (default is 0.88)
        Trade-off weight forwarded unchanged to ``f_per_particle``.

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        Objective value of each particle, in row order.
    """
    # Iterating a 2-D array yields its rows, i.e. one particle at a time.
    return np.array([f_per_particle(particle, alpha) for particle in x])


In [None]:

# Seed NumPy's global generator so the swarm initialisation and updates
# (pyswarms samples through numpy.random) repeat on every run.
np.random.seed(RANDOM_STATE)

# Initialize swarm, arbitrary
# c1/c2: cognitive and social coefficients, w: inertia weight,
# k: number of neighbours considered, p: 2 selects the L2 distance.

options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO: one dimension (mask bit) per feature column

optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=X_train.shape[1], options=options)

# Perform optimization; pos_justpso is the best binary mask found

cost, pos_justpso = optimizer.optimize(f, iters=1000, verbose=2)


optimizer.reset()

In [None]:
# Fresh full-feature split; the Chi2 cell below overwrites X_train/X_test.
(X_train, X_test,
 y_train, y_test,
 X, y) = get_train_test(dataset)

In [None]:
# using filter methods before PSO

# ====== Chi2 + PSO ========

from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_regression

# Seed NumPy's global generator so the swarm search below (pyswarms samples
# through numpy.random) repeats on every run.
np.random.seed(RANDOM_STATE)

# Keep the 9 features with the highest chi-squared score. This overwrites
# X_train/X_test with the reduced arrays that f() reads during the search.
kbest_chi2 = SelectKBest(chi2, k=9)
X_train = kbest_chi2.fit_transform(X_train, y_train)
X_test = kbest_chi2.transform(X_test)

# Names of the columns the filter kept, in the same order as the reduced arrays.
mask_chi2 = kbest_chi2.get_support()
features_chi2 = X.columns[mask_chi2]
print("Chi2: ", features_chi2)

# Initialize swarm, arbitrary

options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO: one dimension per filtered column

optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=X_train.shape[1], options=options)

# Perform optimization; pos_chi2 is a mask over features_chi2

cost, pos_chi2 = optimizer.optimize(f, iters=1000, verbose=2)

optimizer.reset()

In [None]:
# Restore the full-feature split; the Chi2 cell replaced X_train/X_test
# with its 9-column arrays.
(X_train, X_test,
 y_train, y_test,
 X, y) = get_train_test(dataset)

In [None]:
# ====== Information Gain + PSO ========

# Seed NumPy's global generator first: both the mutual-information estimate
# (no random_state is passed to it here) and the swarm search below draw
# from it, so without a seed the selected features change between runs.
np.random.seed(RANDOM_STATE)

# Keep the 9 features with the highest mutual information with the target.
# This overwrites X_train/X_test with the reduced arrays that f() reads.
kbest_info_gain = SelectKBest(mutual_info_classif, k=9)
X_train = kbest_info_gain.fit_transform(X_train, y_train)
X_test = kbest_info_gain.transform(X_test)

# Names of the columns the filter kept, in the same order as the reduced arrays.
mask_info_gain = kbest_info_gain.get_support()
features_info_gain = X.columns[mask_info_gain]
print("Information Gain: ", features_info_gain)

# Initialize swarm, arbitrary

options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO: one dimension per filtered column

optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=X_train.shape[1], options=options)

# Perform optimization; pos_info_gain is a mask over features_info_gain

cost, pos_info_gain = optimizer.optimize(f, iters=1000, verbose=2)

optimizer.reset()

In [None]:
# Restore the full-feature split; the Information Gain cell replaced
# X_train/X_test with its 9-column arrays.
(X_train, X_test,
 y_train, y_test,
 X, y) = get_train_test(dataset)

In [None]:
# ====== F-score + PSO ========

# Seed NumPy's global generator so the swarm search below (pyswarms samples
# through numpy.random) repeats on every run.
np.random.seed(RANDOM_STATE)

# Keep the 9 features with the highest univariate F-score. This overwrites
# X_train/X_test with the reduced arrays that f() reads during the search.
kbest_f_reg = SelectKBest(f_regression, k=9)
print(X_train.shape, X_test.shape)
X_train = kbest_f_reg.fit_transform(X_train, y_train)
X_test = kbest_f_reg.transform(X_test)

# Names of the columns the filter kept, in the same order as the reduced arrays.
mask_f_reg = kbest_f_reg.get_support()
features_f_reg = X.columns[mask_f_reg]
print("F-Regression: ", features_f_reg)

# Initialize swarm, arbitrary

options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO: one dimension per filtered column

optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=X_train.shape[1], options=options)

# Perform optimization; pos_f_reg is a mask over features_f_reg

cost, pos_f_reg = optimizer.optimize(f, iters=1000, verbose=2)

optimizer.reset()

In [None]:
# RESULTS ==============
#
# Translate each PSO bit mask back into column names.
#
# Plain PSO searched over every feature, so its mask lines up with X.columns.
# Each filter+PSO run searched only the columns kept by its filter, so its
# mask has one bit per entry of features_chi2 / features_info_gain /
# features_f_reg and must be paired with that list. Pairing it with
# X.columns would report the first few dataset columns instead of the
# columns PSO actually chose.
#
# The loop variables are deliberately not called x / y, so the target `y`
# returned by get_train_test is not overwritten.

columns_just_pso = list()
master_filter_pso = set()

# NOTE: `chi2` rebinds the name imported from sklearn.feature_selection; the
# Chi2 cell re-imports it when re-run. The set names are kept because the
# next cell reads them.
chi2 = set()
info_gain = set()
f_reg = set()

print("Columns selected with just PSO:\n")

for column, bit in zip(X.columns, pos_justpso):

    if bit == 1:
        print(column)
        columns_just_pso.append(column)

print("\nColumns selected with chi2+PSO:\n")

for column, bit in zip(features_chi2, pos_chi2):

    if bit == 1:
        print(column)
        chi2.add(column)

print("\nColumns selected with infogain+PSO:\n")

for column, bit in zip(features_info_gain, pos_info_gain):

    if bit == 1:
        print(column)
        info_gain.add(column)

print("\nColumns selected with f-reg+PSO:\n")

for column, bit in zip(features_f_reg, pos_f_reg):

    if bit == 1:
        print(column)
        f_reg.add(column)

In [None]:
# Master filter+PSO list: every feature picked by at least two of the three
# filter+PSO runs. The three-way overlap goes in first, then each pairwise
# overlap.

overlaps = (
    chi2 & info_gain & f_reg,
    chi2 & info_gain,
    chi2 & f_reg,
    info_gain & f_reg,
)

for shared in overlaps:
    master_filter_pso.update(shared)

print(master_filter_pso)
    

In [None]:
# comparing just PSO with filter + PSO

# Just PSO: rebuild the split restricted to the columns plain PSO selected

(X_train, X_test,
 y_train, y_test,
 X, y) = get_train_test(dataset, columns_just_pso)

# running the models

just_pso_models = [
    LogisticRegression(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    MLPClassifier(random_state=RANDOM_STATE),
    SVC(random_state=RANDOM_STATE),
]

for model in just_pso_models:
    print("[INFO]: Fitting", str(model), "...")

    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    evaluate(y_pred, y_test)



In [None]:
# filter +  PSO

# A set has no stable iteration order for strings across kernel restarts,
# and column order influences the fitted models. Take the selected names in
# dataset column order (features are every column but the last) so the
# feature matrix is laid out identically on every run.
filter_pso_columns = [col for col in dataset.columns[:-1] if col in master_filter_pso]

X_train, X_test, y_train, y_test, X, y = get_train_test(dataset, filter_pso_columns)

# running the models

for model in [LogisticRegression(random_state=RANDOM_STATE), RandomForestClassifier(random_state=RANDOM_STATE), MLPClassifier(random_state=RANDOM_STATE), SVC(random_state=RANDOM_STATE)]:

    print("[INFO]: Fitting", str(model), "...")

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    evaluate(y_pred, y_test)

