# In [None]:
# Project: Phishing Websites Detection

# Aim
# The goal of this project is to apply machine learning techniques to the Phishing Websites dataset. We will:
# - Perform data discovery, cleaning, exploration, analysis, and transformation.
# - Train, evaluate, and compare baseline and ensemble models.
# - Optimize hyperparameters and analyze the results.
# - Provide insights and outline potential improvements.

# Dataset Description
# - Dataset URL: https://openml.org/d/4534
# - Source: OpenML
# - The dataset contains numerical and categorical features for phishing website classification.
# - Number of features: 30
# - Number of instances: 11,055
# - Task: Classification (binary, target: phishing or not phishing)

# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Library versions (listed for reference; nothing is checked at runtime)
# pandas==1.3.3
# scikit-learn==1.0
# seaborn==0.11.2
# numpy==1.21.2

# Data Access Instructions
# The dataset is read straight from the URL below, so network access is
# required when this section runs.
data_url = "https://openml.org/data/get_csv/1795974/phishing.csv"

# Load Dataset
data = pd.read_csv(data_url)

# Preview the first rows. The preview is printed explicitly because the value
# of a bare `data.head()` expression is discarded unless it happens to be the
# last expression of a notebook cell.
print(data.head())

# Dataset Exploratory Analysis
## Metadata
# Prints column names, dtypes, non-null counts and memory usage.
data.info()
print(f"Number of instances: {data.shape[0]}")
# `data` still holds the target column 'Result', so it is dropped before
# counting; otherwise the reported feature count is one too high.
print(f"Number of features: {data.drop(columns=['Result']).shape[1]}")

## Null Values
# Count missing values per column and print only the columns that have any.
# An empty Series in the output means no column contains missing values.
null_counts = data.isnull().sum()
print(null_counts[null_counts > 0])

# Interpretation: an empty result above means the dataset contains no missing values.

## Target Feature Analysis
# The column is named via keywords: seaborn 0.11 deprecated passing the data
# vector positionally, and from 0.12 on the only positional argument is
# `data`, so `sns.countplot(data['Result'])` no longer means "count x".
sns.countplot(x='Result', data=data)
plt.title('Class Distribution')
plt.show()

# Observation: compare the bar heights above. If the classes turn out to be
# imbalanced, we'll use class balancing techniques.

## Feature Distributions
# One box per input column; the target column 'Result' is left out. The
# title is set on the Axes that seaborn draws into and returns.
sns.boxplot(data=data.drop('Result', axis=1)).set_title('Feature Value Distributions')
# Column names are long, so the x tick labels are turned vertical.
plt.xticks(rotation=90)
plt.show()

# Handling Outliers
# Outliers are not removed here; the plan is to rely on the scaling and
# standardization applied during preprocessing.

# Feature Correlation
# Only numeric/boolean columns are correlated. pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them, and the
# `numeric_only` argument of `DataFrame.corr` does not exist in the pinned
# pandas 1.3.3, so the columns are filtered explicitly to work on both.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

# Selecting Features
# Based on domain knowledge and correlation, we will retain all features for initial pipelines.

# ML Baseline Models
## Train/Test Split
# Separate the label column 'Result' from the input columns, then hold out
# 20% of the rows for testing. The split is stratified on the label and uses
# a fixed seed so it is reproducible.
y = data['Result']
X = data.drop(columns='Result')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Preprocessing Pipeline
# Columns are routed to a transformer by dtype. ColumnTransformer drops every
# column that is not assigned to a transformer (remainder='drop' is its
# default), so the selectors are kept broad: 'number' matches all numeric
# dtypes rather than only int64/float64, and pandas 'category' columns are
# handled together with object columns.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

preprocessor = ColumnTransformer([
    # Numeric columns: fill missing values with the column median, then
    # standardize to zero mean and unit variance.
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    # Categorical columns: fill missing values with the most frequent value,
    # then one-hot encode. Categories unseen during fit are ignored (encoded
    # as all zeros) instead of raising at transform time.
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_features)
])

## Baseline Random Forest Classifier
# Preprocessing followed by a random forest with default hyperparameters and
# a fixed seed, chained into a single estimator.
baseline_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

# fit() returns the fitted pipeline itself, so training and prediction on the
# held-out rows are chained.
y_pred = baseline_pipeline.fit(X_train, y_train).predict(X_test)

# Baseline Evaluation
# Overall accuracy plus per-class precision / recall / F1 on the test split.
print("Baseline Random Forest Results")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

# Ensemble Models & Hyperparameter Tuning
# Same preprocessing as the baseline, with gradient boosting as the model.
ensemble_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(random_state=42))
])

# 3 x 3 x 3 = 27 candidate settings. The `model__` prefix addresses
# parameters of the pipeline step named 'model'.
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

# 27 candidates x 3 folds = 81 fits; n_jobs=-1 spreads them over all
# available cores. The fits are independent, so the scores are unaffected.
grid_search = GridSearchCV(
    ensemble_pipeline,
    param_grid,
    cv=3,
    scoring='accuracy',
    verbose=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report what the search selected so the tuning outcome can be analyzed.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

# Best Model Evaluation
# best_estimator_ is the pipeline refit on the full training split with the
# best parameter combination.
y_pred_best = grid_search.best_estimator_.predict(X_test)
print("Best Model Results")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Classification Report:")
print(classification_report(y_test, y_pred_best))

# Future Enhancements
# - Investigate feature engineering for more meaningful input data.
# - Apply SMOTE for balancing classes.
# - Experiment with other models like XGBoost or LightGBM.
# - Analyze the impact of additional hyperparameter tuning for the best model.
