# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data

In [2]:
# Load the phishing dataset from the Kaggle input folder and preview the first rows.
csv_path = '../input/phishing-data/Phishing Data.csv'
df = pd.read_csv(csv_path)
df.head()

In [3]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

# Check Null Values

In [4]:
# Missing-value count for every column (isna is the canonical spelling of isnull).
df.isna().sum()

# Drop Duplicate Rows

In [5]:
# Drop fully identical rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [6]:
# Shape again, to see how many rows the de-duplication above removed.
df.shape

# Count target values

In [7]:
# Number of rows per class of the 'Phishing Status' column.
df['Phishing Status'].value_counts()

# Check balance of data

In [8]:
# Share of the rarest class, computed from the data instead of hand-copied
# counts so the figure stays correct if the dataset or the de-duplication
# step changes.
df['Phishing Status'].value_counts(normalize=True).min()

# So, Data is Highly Imbalanced 

# Correlation of independent features of numeric values

In [9]:
# Pairwise correlation matrix of the DataFrame's columns (pandas default method).
# NOTE(review): pandas >= 2.0 raises here if any column is non-numeric;
# assumes every column is numeric — confirm.
df.corr()

In [10]:
# Annotated heatmap of the correlation matrix, drawn on an explicitly created 15x12 axes.
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(df.corr(), annot=True, ax=ax)

In [11]:
# Keep the column labels in `col` and display them.
col = df.columns
col

# Basic Visualization Using Seaborn

In [12]:
# One count plot per feature column, split by target class. The ten
# copy-pasted cells are folded into a single loop so the figure settings live
# in one place and plotting another column is a one-word change.
count_features = [
    'Account', 'Bank', 'Identity', 'Inconvenience', 'Security',
    'Password', 'Access', 'Click', 'Recently', 'Credit',
]
for feature in count_features:
    # Explicit axes so each plot is drawn on the figure created for it.
    fig, ax = plt.subplots(figsize=(15, 12))
    sns.countplot(x=feature, hue='Phishing Status', data=df, ax=ax)
    ax.set_title(f'{feature} by Phishing Status')
    plt.show()

# Separating Target and Features

In [22]:
# Split the frame into the label series and the remaining feature columns.
target_col = 'Phishing Status'
y = df[target_col]
x = df.drop(columns=target_col)

In [23]:
# First rows of the feature frame.
x.head()

In [24]:
# First values of the target series.
y.head()

In [25]:
# Shape of the feature frame.
x.shape

In [26]:
# Shape of the target series.
y.shape

# Preprocessing DataSet

In [27]:
from sklearn.preprocessing import StandardScaler

In [28]:
# Standardizer (zero mean, unit variance per feature) used to rescale the feature columns.
scaler = StandardScaler()

In [29]:
# Standardize every feature column with a single fit on the whole frame.
# Fitting once (instead of re-fitting per column) yields the same values but
# leaves `scaler` holding the statistics of all columns, so it can transform
# new data later, and it no longer overwrites the `col` variable from In [11].
# NOTE(review): the scaler still sees the full dataset before the train/test
# split, so test-set statistics leak into preprocessing — consider fitting on
# the training split only.
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)

In [30]:
# Preview the feature frame after scaling.
x.head()

# Before Sampling

# Hold Out Cross Validation

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# 70/30 hold-out split. stratify=y keeps the class ratio the same in both
# parts; with a minority class this rare, a purely random split can leave the
# test part with too few minority samples for a stable score.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.70, random_state=42, stratify=y
)

In [None]:
!pip install xgboost

In [33]:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

In [34]:
# Seed every estimator so repeated runs give the same scores; 42 matches the
# seed already used for the split, the CV splitters and the samplers.
dt = DecisionTreeClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)

In [35]:
# Registry of the estimators to compare, keyed by display name.
model = dict(DecisionTree=dt, XgBoost=xgb, AdaBoost=ada)

In [36]:
# Fit each estimator on the training split and report its score on the held-out split.
# NOTE(review): .score() is plain accuracy for classifiers, which is easy to
# max out on heavily imbalanced data — consider a class-aware metric as well.
for clf in model.values():
    clf.fit(x_train, y_train)
    holdout_score = clf.score(x_test, y_test)
    print(f'Model: {clf}, Score:{holdout_score}')
    

# K Fold Cross Validation

In [37]:
from sklearn.model_selection import KFold, cross_val_score, cross_validate

In [38]:
# Shuffled 5-fold splitter with a fixed seed, shared by the K-fold runs.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [39]:
# 5-fold cross-validation of every estimator on the full feature frame.
for clf in model.values():
    result_k_fold = cross_val_score(clf, x, y, cv=kfold)
    print(f'Model: {clf}, \nScore:{result_k_fold}, \tMean Score:{result_k_fold.mean()}')

# Stratified Cross Validation

In [40]:
from sklearn.model_selection import StratifiedKFold

In [41]:
# Shuffled, seeded 5-fold splitter that preserves the class ratio in every fold.
st_k_fold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [42]:
# Stratified 5-fold cross-validation of every estimator on the full feature frame.
for clf in model.values():
    result_st_k_fold = cross_val_score(clf, x, y, cv=st_k_fold)
    print(f'Model: {clf}, \nScore:{result_st_k_fold}, \tMean Score:{result_st_k_fold.mean()}')

In [43]:
from sklearn.model_selection import LeaveOneOut

In [44]:
# Leave-one-out splitter: each sample is used once as a single-item test set.
le = LeaveOneOut()

# Imbalanced Data Handling

# Synthetic Minority Over-sampling Technique with Tomek Links (SMOTETomek)

In [45]:
!pip install imblearn

In [47]:
from imblearn.combine import SMOTETomek

In [48]:
# Combined over-/under-sampler from imblearn.combine, seeded for reproducibility.
sm = SMOTETomek(random_state=42)

In [49]:
# Class counts before resampling.
y.value_counts()

In [50]:
# Resample the full feature frame and target in one call.
x_sm, y_sm = sm.fit_resample(x,y)

In [51]:
# Shape of the resampled features.
x_sm.shape

In [52]:
# Shape of the resampled target.
y_sm.shape

In [54]:
# Class counts after resampling.
y_sm.value_counts()

# K Fold Cross Validation On SMOTETomek

In [55]:
# Cross-validate with the resampler inside an imbalanced-learn pipeline so
# SMOTETomek is fitted on the training part of each fold only and every test
# fold keeps the original, un-resampled rows. Scoring the pre-resampled
# x_sm / y_sm instead lets synthetic points derived from test rows reach the
# training folds and inflates the scores.
# NOTE: resampling now runs once per fold and model, so this cell is slower.
from imblearn.pipeline import make_pipeline

for clf in model.values():
    # cross_val_score clones the pipeline per fold, so `sm` and `clf` stay unfitted.
    resample_then_fit = make_pipeline(sm, clf)
    result_k_fold = cross_val_score(resample_then_fit, x, y, cv=kfold)
    print(f'Model: {clf}, \nScore:{result_k_fold}, \tMean Score:{result_k_fold.mean()}')

# Stratified K Fold On SMOTETomek

In [56]:
# Stratified CV with SMOTETomek applied inside an imbalanced-learn pipeline:
# the sampler only ever sees the training part of a fold, and each test fold
# keeps the original class distribution. Resampling the whole dataset first
# (x_sm / y_sm) leaks information from test rows into the synthetic training
# points and overstates performance.
# NOTE: resampling now runs once per fold and model, so this cell is slower.
from imblearn.pipeline import make_pipeline

for clf in model.values():
    # cross_val_score clones the pipeline per fold, so `sm` and `clf` stay unfitted.
    resample_then_fit = make_pipeline(sm, clf)
    result_st_k_fold = cross_val_score(resample_then_fit, x, y, cv=st_k_fold)
    print(f'Model: {clf}, \nScore:{result_st_k_fold}, \tMean Score:{result_st_k_fold.mean()}')

# Near Miss For UnderSampling

In [57]:
from imblearn.under_sampling import NearMiss

In [58]:
# NearMiss under-sampler with the library's default settings.
near_miss = NearMiss()

In [59]:
# Under-sample the full feature frame and target in one call.
x_un, y_un = near_miss.fit_resample(x,y)

In [62]:
# Shape of the under-sampled features.
x_un.shape

In [63]:
# Shape of the under-sampled target.
y_un.shape

In [64]:
# Class counts after under-sampling.
y_un.value_counts()

# K Fold Cross Validation On NearMiss UnderSampling

In [60]:
for i in  model.keys():
    result_k_fold = cross_val_score(model[i], x_un, y_un, cv=kfold)
    print(f'Model: {model[i]}, \nScore:{result_k_fold}, \tMean Score:{result_k_fold.mean()}')

# Stratified K Fold On NearMiss UnderSampling

In [61]:
for i in  model.keys():
    result_st_k_fold = cross_val_score(model[i], x_un, y_un, cv=st_k_fold)
    print(f'Model: {model[i]}, \nScore:{result_st_k_fold}, \tMean Score:{result_st_k_fold.mean()}')

# Leave One Out Cross validation On NearMiss UnderSampling

In [73]:
# Leave-one-out evaluation of the decision tree on the NearMiss-undersampled data:
# one fit per sample, each scored on the single held-out row.
result_le = cross_val_score(estimator=dt, X=x_un, y=y_un, cv=le)
loo_mean = result_le.mean()
print(f'Score:{result_le}, \tMean Score:{loo_mean}')

# OverSampling

In [65]:
from imblearn.over_sampling import RandomOverSampler

In [66]:
# Random over-sampler, seeded for reproducibility.
ros = RandomOverSampler(random_state=42)

In [67]:
# Over-sample the full feature frame and target in one call.
x_over, y_over = ros.fit_resample(x,y)

In [68]:
# Shape of the over-sampled features.
x_over.shape

In [69]:
# Shape of the over-sampled target.
y_over.shape

In [70]:
# Class counts after over-sampling.
y_over.value_counts()

# K Fold Cross Validation On OverSampling

In [71]:
for i in  model.keys():
    result_k_fold = cross_val_score(model[i], x_over, y_over, cv=kfold)
    print(f'Model: {model[i]}, \nScore:{result_k_fold}, \tMean Score:{result_k_fold.mean()}')

# Stratified K Fold On OverSampling

In [72]:
for i in  model.keys():
    result_st_k_fold = cross_val_score(model[i], x_over, y_over, cv=st_k_fold)
    print(f'Model: {model[i]}, \nScore:{result_st_k_fold}, \tMean Score:{result_st_k_fold.mean()}')