In [None]:
import pandas as pd
# Location of the full phishing / legitimate dataset inside the Kaggle input directory.
DATA_PATH = '../input/phishing-dataset-for-machine-learning/Phishing_Legitimate_full.csv'

# Read the raw table once and preview its first rows.
ml = pd.read_csv(DATA_PATH)
ml.head()

In [None]:
# Summarize the loaded frame (pandas DataFrame.info) before any feature engineering.
ml.info()

In [None]:
# Distinct-value count per column, smallest first. After reset_index() the
# column names are in 'index' and the counts are in the column labelled 0.
unique = (
    ml.nunique()
    .sort_values()
    .reset_index()
)
unique

In [None]:
# Bucket columns by how many distinct values they take:
#   fewer than 10  -> categorical
#   10 to 20       -> high cardinality
#   more than 20   -> numeric
column_names = unique['index']
n_distinct = unique[0]

categorical_features = column_names[n_distinct < 10].tolist()
high_cardinality_features = column_names[n_distinct.between(10, 20)].tolist()
numeric_features = column_names[n_distinct > 20].tolist()

# The target and the row identifier are not predictors; drop them from the
# buckets they fall into (list.remove raises ValueError if either is absent).
categorical_features.remove('CLASS_LABEL')
numeric_features.remove('id')

In [None]:
# Size of each feature bucket, followed by the total number of predictors.
n_categorical, n_high_cardinality, n_numeric = map(
    len, (categorical_features, high_cardinality_features, numeric_features)
)
n_categorical, n_high_cardinality, n_numeric, (n_categorical + n_high_cardinality + n_numeric)

In [None]:
# Hold back 20% of the rows as "unseen" data: sample 80% for modeling
# (fixed seed for reproducibility) and keep whatever was not sampled.
data = ml.sample(frac=0.8, random_state=42)
# The unseen rows must be selected before the sampled index is discarded.
data_unseen = ml.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)
print('Data for Modeling: ' + str(data.shape))

In [None]:
# Report the shape of the rows that were left out of the modeling sample.
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

In [None]:
!pip install pycaret

In [None]:
from pycaret.classification import *

In [None]:
# Class balance of the target over the full dataset.
ml['CLASS_LABEL'].value_counts()

In [None]:
# Configure the PyCaret classification experiment on the modeling split.
# NOTE(review): several keyword arguments below (handle_unknown_categorical,
# unknown_categorical_method, ignore_low_variance, combine_rare_levels,
# high_cardinality_features) appear to be PyCaret 2.x API -- confirm the
# installed version accepts them.
phishing = setup(
    data=data,
    target='CLASS_LABEL',
    session_id=42,
    # Column roles: 'id' is a row identifier, the lists come from the
    # distinct-value bucketing done earlier in the notebook.
    ignore_features=['id'],
    numeric_features=numeric_features,
    high_cardinality_features=high_cardinality_features,
    # Missing values and categories not seen during training.
    numeric_imputation='median',
    categorical_imputation='mode',
    handle_unknown_categorical=True,
    unknown_categorical_method='most_frequent',
    # Combine all categorical levels below the rare_level_threshold param
    # into a single level.
    combine_rare_levels=True,
    # Scaling / power transformation of features.
    normalize=True,
    transformation=True,
    # Drop one of the two features that are highly correlated with each other.
    remove_multicollinearity=True,
    # Remove categorical features with statistically insignificant variance.
    ignore_low_variance=True,
    # Record the experiment run.
    log_experiment=True,
)

In [None]:
# Train the 'lightgbm' estimator inside the experiment configured by setup().
# Note: `lightgbm` here is the model object returned by create_model, not the
# lightgbm package.
lightgbm = create_model('lightgbm')

In [None]:
# Print the model object returned by create_model to inspect its settings.
print(lightgbm)

In [None]:
# Tune the baseline model's hyperparameters, optimizing for Precision.
tuned_lightgbm = tune_model(lightgbm,optimize = 'Precision')

In [None]:
# 'auc' plot (area under the ROC curve) for the tuned model.
plot_model(tuned_lightgbm, plot = 'auc')

In [None]:
# 'pr' plot (precision-recall curve per the PyCaret docs) for the tuned model.
plot_model(tuned_lightgbm, plot = 'pr')

In [None]:
# 'feature' plot (feature importance per the PyCaret docs) for the tuned model.
plot_model(tuned_lightgbm, plot='feature')

In [None]:
# Confusion matrix for the tuned model.
plot_model(tuned_lightgbm, plot = 'confusion_matrix')

In [None]:
# Open PyCaret's evaluation view for the tuned model
# (presumably an interactive widget -- requires a live notebook front end).
evaluate_model(tuned_lightgbm)

In [None]:
# Score the tuned model with no `data` argument; per the PyCaret docs this
# evaluates on the hold-out split that setup() set aside.
predict_model(tuned_lightgbm)

In [None]:
# Score the rows in data_unseen, which were split off before setup() and so
# were never part of the modeling data, then preview the result.
unseen_predictions = predict_model(tuned_lightgbm, data=data_unseen)
unseen_predictions.head()

In [None]:
# Mean of the 'Score' column over the unseen rows, rounded to two decimals.
# NOTE(review): this averages the values in 'Score'; it is not an accuracy figure.
mean_score = round(unseen_predictions['Score'].mean(), 2)
print("Confidence Score :   {}".format(mean_score))