In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the phishing-website dataset from the Kaggle input directory into a DataFrame.
DATA_PATH = r"../input/phishing-website-detector/phishing.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Separate the features from the target column 'class'.
# Y is kept as a single-column DataFrame (not a Series) because later cells
# index it as Y['class'].
X = df.drop(columns=['class'])
Y = df[['class']]

In [None]:
# Summary statistics for the feature columns (every column of df except 'class').
X.describe()

# Check if data is balanced (Legit vs Phishing)

In [None]:
import seaborn as sns  # data visualization library
import matplotlib.pyplot as plt

# Count how many rows fall in each target class and show the counts as a bar chart.
# Series.value_counts() is used because the top-level pd.value_counts() function
# is deprecated since pandas 2.1.
class_counts = Y['class'].value_counts()
ax = class_counts.plot.bar()
# Label the figure so it can be read on its own; the trailing ';' hides the repr.
ax.set(title='Class distribution', xlabel='class', ylabel='Number of samples');

In [None]:
# Correlation map: pairwise correlation between all feature columns,
# drawn as an annotated heatmap (values shown with one decimal).
corr_matrix = X.corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corr_matrix, ax=ax, annot=True, fmt='.1f', linewidths=.5)

> # Recursive feature elimination
resource: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range; later feature-selection cells reuse X_norm.
X_norm = MinMaxScaler().fit_transform(X)

# Recursive feature elimination: repeatedly fit a logistic regression and remove
# `step` features per iteration until `n_features_to_select` remain.
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=10, step=10, verbose=5)

# Y is a single-column DataFrame; pass the 1-D column instead, otherwise the
# classifier receives a column vector and scikit-learn emits a DataConversionWarning.
rfe_selector.fit(X_norm, Y['class'])

# Boolean mask over the columns of X -> names of the retained features.
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')
print(rfe_feature)

# CHI2
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# chi2 only accepts non-negative feature values. Build the [0, 1]-scaled matrix
# in this cell rather than reading the global X_norm, which the PCA cell later
# reassigns to standardized (partly negative) values; re-running this cell after
# that one would otherwise fail.
X_chi2 = MinMaxScaler().fit_transform(X)

# Keep the 10 features with the highest chi-squared score against the target.
chi_selector = SelectKBest(chi2, k=10)
# Pass the target as a 1-D column rather than a single-column DataFrame.
chi_selector.fit(X_chi2, Y['class'])

# Boolean mask over the columns of X -> names of the retained features.
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(len(chi_feature), 'selected features')
print(chi_feature)

# Embedded
## Logistic Regression
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Embedded selection: fit an L2-penalised logistic regression and keep the
# features whose importance is at least 1.25x the median importance.
# `threshold` has to be given by keyword: every SelectFromModel argument after
# `estimator` is keyword-only, so passing it positionally raises a TypeError.
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), threshold='1.25*median')

# Pass the target as a 1-D column; Y itself is a single-column DataFrame and
# would trigger scikit-learn's DataConversionWarning.
embeded_lr_selector.fit(X_norm, Y['class'])

# Boolean mask over the columns of X -> names of the retained features.
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.columns[embeded_lr_support].tolist()
print(len(embeded_lr_feature), 'selected features')
print(embeded_lr_feature)

# Feature Extraction
## PCA
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html 
We will use principal component analysis (PCA) for feature extraction. Before applying PCA, we need to standardize the data so that PCA performs well.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features before PCA. This reassigns X_norm, replacing the
# min-max scaled matrix built in the feature-selection section.
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

# Fit a 25-component PCA; Y_sklearn holds the samples projected onto those components.
pca = PCA(n_components=25)
Y_sklearn = pca.fit_transform(X_norm)

In [None]:
# Cumulative share of variance captured by the leading principal components, in percent.
cum_sum = pca.explained_variance_ratio_.cumsum() * 100
# Take the bar count from the fitted PCA so the plot follows any change to n_components.
n_components = len(cum_sum)

# Fraction of variance explained by the first 10 components. It is printed
# because a bare expression in the middle of a cell is never displayed.
print(pca.explained_variance_ratio_[:10].sum())

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_yticks(np.arange(0, 110, 10))
ax.bar(range(n_components), cum_sum, color='b', alpha=0.5,
       label='Cumulative sum of explained variance')
ax.set_xlabel('Number of principal components')
ax.set_ylabel('Cumulative explained variance (%)')
# The bar label only shows up when a legend is drawn.
ax.legend(loc='best')
# Compute the headline figure from the data instead of hard-coding it in the title.
ax.set_title(f"Around {cum_sum[-1]:.0f}% of variance is explained by the first {n_components} components");

In [None]:
# Per-component share of variance from the fitted PCA: print its shape and total,
# then draw one bar per component.
explained_variance = pca.explained_variance_ratio_
print(explained_variance.shape)
print(explained_variance.sum())

with plt.style.context('dark_background'):
    plt.figure(figsize=(6, 4))

    plt.bar(x=range(25), height=explained_variance, align='center', alpha=0.5,
            label='individual explained variance')
    plt.xlabel('Principal components')
    plt.ylabel('Explained variance ratio')
    plt.legend(loc='best')
    plt.tight_layout()

# SVD

In [None]:
from sklearn.decomposition import TruncatedSVD

# Fit a 20-component truncated SVD on the current X_norm (seeded for repeatability).
svd = TruncatedSVD(n_components=20, n_iter=50, random_state=42)
svd.fit(X_norm)

# Per-component share of variance: print its shape and total, then plot one bar each.
explained_variance = svd.explained_variance_ratio_
print(explained_variance.shape)
print(explained_variance.sum())

with plt.style.context('dark_background'):
    plt.figure(figsize=(6, 4))

    plt.bar(x=range(20), height=explained_variance, align='center', alpha=0.5,
            label='individual explained variance')
    plt.xlabel('Principal components')
    plt.ylabel('Explained variance ratio')
    plt.legend(loc='best')
    plt.tight_layout()