In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
from sklearn import *
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the comma-separated phishing data set into a 32-bit integer array.
# Data source: 'https://archive.ics.uci.edu/ml/datasets/phishing+websites'
training_data = np.genfromtxt(
    fname='../input/phishing/phishing.csv',
    dtype=np.int32,
    delimiter=',',
)

In [None]:
# Show the loaded array as the cell output to sanity-check the load.
training_data

In [None]:
# Split the table column-wise: every attribute except the last one is a model
# input, and the last attribute is the output (label) to predict.
inputs, outputs = training_data[:, :-1], training_data[:, -1]

# One shape per line: inputs first, then outputs.
print(inputs.shape, outputs.shape, sep='\n')

In [None]:
# Show the feature matrix (every column of training_data except the last).
inputs

In [None]:
# Show the label vector (the last column of training_data).
outputs

In [None]:
# Added by Luiz

# To improve the estimators' accuracy scores, we are going to use the
# sklearn.feature_selection module. This module is used in feature selection or
# dimensionality reduction in the dataset.

# To compute the features' importance, in our case, we are going to use tree-based feature
# selection. Load the sklearn.feature_selection module:

import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# Added by Luiz
# Obtaining the relevant feature set using ExtraTreesClassifier

# Fit a tree ensemble on the data and let SelectFromModel (default threshold,
# since none is passed) keep only the features it considers important.
# random_state is fixed so the same columns are selected on every
# "Restart & Run All"; unseeded, the selected columns -- and therefore any
# later cell that refers to a column by position -- can change between runs.
# NOTE(review): the selector is fitted on all rows, i.e. before the train/test
# split performed later in the notebook, so test rows influence which features
# are kept. Confirm whether that is acceptable for the reported accuracy.
featselect = sklearn.ensemble.ExtraTreesClassifier(random_state=42).fit(inputs, outputs)
model = SelectFromModel(featselect, prefit=True)
inputs_new = model.transform(inputs)
print(inputs.shape)
print(inputs_new.shape)

In [None]:
# Added by Luiz

# Dataset normalization: rescale the selected features with MinMaxScaler and
# wrap the result in a DataFrame for inspection.
min_max_scaler = preprocessing.MinMaxScaler()
inputs_new_scaled = min_max_scaler.fit_transform(X=inputs_new)
data_new_features = pd.DataFrame(data=inputs_new_scaled)

# Summary statistics of the normalized features (rendered as the cell output).
data_new_features.describe()

In [None]:
# Added by Luiz

# Outliers: draw one box plot per selected feature on a single wide figure.
num_cols = data_new_features.columns
plt.figure(figsize=(18, 9))
data_new_features.loc[:, num_cols].boxplot()
plt.title(label="Numerical variables in dataset", fontsize=20)
plt.show()

Outliers exist only for the first column. Although it seems reasonable to remove those samples, doing so would make that column useless for classification, since it would then contain only a single value (zero). The samples are therefore kept as-is.

In [None]:
# Added by Luiz

# Share of rows whose first-column value is exactly 1 (flagged as outliers by
# the box plot) versus exactly 0, relative to all rows.
first_col = data_new_features[0]
total_rows = data_new_features.shape[0]
outlier_pct = len(data_new_features[first_col == 1]) / total_rows * 100
standard_pct = len(data_new_features[first_col == 0]) / total_rows * 100
print(f'Percentage of outlier samples (value=1) for first column: {outlier_pct:.2f}%')
print(f'Percentage of standard samples (value=0) for first column: {standard_pct:.2f}%')

In [None]:
# Added by Luiz

# Take the underlying array of the normalized feature DataFrame; this is the
# feature matrix the resampling step below works on.
new_inputs = data_new_features.values
# Show the array as the cell output.
new_inputs

In [None]:
# Added by Luiz

# Report the dimensions of the feature matrix and of the label vector.
inputs_shape = new_inputs.shape
outputs_shape = outputs.shape
print(f'Current size of the inputs: {inputs_shape}')
print(f'Current size of the outputs: {outputs_shape}')

In [None]:
# Added by Luiz

# The two classes are already roughly balanced, but the notebook still
# undersamples afterwards to obtain an exact 50%/50% distribution.
# NOTE(review): label 1 is reported as phishing and -1 as non-phishing here --
# confirm this matches the data set's label encoding.
freq = Counter(outputs)
n_labelled = freq[1] + freq[-1]
fishing_perc = freq[1] / n_labelled
non_fishing_perc = freq[-1] / n_labelled

print(f'Percentage of phishing is: {fishing_perc*100:.2f}%')
print(f'Percentage of non-phishing is: {non_fishing_perc*100:.2f}%')

In [None]:
# Added by Luiz

from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Using undersampling to balance the dataset.
# random_state is fixed so the same rows are kept on every run; unseeded, the
# resampled set (and every result derived from it) differs between runs.
under_sampler = RandomUnderSampler(random_state=42)
X_res, y_res = under_sampler.fit_resample(new_inputs, outputs)

In [None]:
# Added by Luiz

# Re-check the class distribution on the resampled labels; after undersampling
# both shares are expected to be equal.
freq = Counter(y_res)
fishing_count, non_fishing_count = freq[1], freq[-1]
fishing_perc = fishing_count / (non_fishing_count + fishing_count)
non_fishing_perc = non_fishing_count / (non_fishing_count + fishing_count)

print(f'Percentage of phishing is: {fishing_perc*100:.2f}%')
print(f'Percentage of non-phishing is: {non_fishing_perc*100:.2f}%')

In [None]:
# Dividing the dataset into training (80%) and testing (20%) subsets.
# random_state pins the shuffle so the split -- and every accuracy computed
# from it -- is identical on each run. stratify=y_res keeps the class ratio of
# the resampled labels the same in both subsets.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

In [None]:
# Shapes of the four split arrays, one per line:
# training inputs, testing inputs, training labels, testing labels.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Create the scikit-learn logistic regression classifier with standard parameters
# (no arguments are passed, so every hyper-parameter keeps its library default).
classifier1 = LogisticRegression()

In [None]:
# Create the scikit-learn Decision Tree classifier with standard parameters.
# Only random_state is set, so that fitting the tree on the same data yields
# the same model (and the same reported accuracy) on every run; all other
# hyper-parameters keep their library defaults.
classifier2 = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit both models on the training subset. The second call stays last so the
# cell output remains the fitted decision tree.
classifier1.fit(X=x_train, y=y_train)
classifier2.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the held-out test inputs with each fitted model.
predictions1 = classifier1.predict(X=x_test)
predictions2 = classifier2.predict(X=x_test)

In [None]:
# Accuracy of each phishing detector on the test subset, as a percentage.
accuracy1 = accuracy_score(y_true=y_test, y_pred=predictions1) * 100.0
accuracy2 = accuracy_score(y_true=y_test, y_pred=predictions2) * 100.0

In [None]:
# Report the test accuracy (in percent) of both models.
print(f"The accuracy of your Logistic Regression on testing data is: {accuracy1!s}")
print(f"The accuracy of your Decision Tree on testing data is: {accuracy2!s}")