In [10]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

import joblib

In [11]:
# Replace 'phishing_dataset.csv' with your dataset filepath
df = pd.read_csv('phishing_urls.csv')

# Inspect first rows and missing values
print(df.head())
print(df.isnull().sum())

# Remove duplicates if any
df.drop_duplicates(inplace=True)
print("Dataset shape after dropping duplicates:", df.shape)


                                                 URL Label
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...   bad
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...   bad
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....   bad
3  mail.printakid.com/www.online.americanexpress....   bad
4  thewhiskeydregs.com/wp-content/themes/widescre...   bad
URL      0
Label    0
dtype: int64
Dataset shape after dropping duplicates: (507196, 2)


In [16]:
# 1. Check the columns in your DataFrame
print("Columns in dataframe:", df.columns)

# 2. Optional: Strip any whitespace in your column names
df.columns = df.columns.str.strip()

# 3. Now access URL and Label columns using the exact column names
df['clean_url'] = df['URL'].apply(clean_url)  # Replace 'URL' if your column name differs
df['label_encoded'] = df['Label'].map({'good': 1, 'bad': 0})  # Replace 'Label' accordingly

# 4. Verify
print(df[['clean_url', 'label_encoded']].head())


Columns in dataframe: Index(['URL', 'Label'], dtype='object')
                                           clean_url  label_encoded
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...              0
1  dghjdgf.com/paypal.co.uk/cycgi-bin/webscrcmd h...              0
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....              0
3  mail.printakid.com/online.americanexpress.com/...              0
4  thewhiskeydregs.com/wp-content/themes/widescre...              0


In [18]:
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), max_features=5000)

X = vectorizer.fit_transform(df['clean_url'])
y = df['label_encoded']


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


In [20]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
print("Training: XGBoost")
xgb_model.fit(X_train, y_train)


Training: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [22]:
y_pred = xgb_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

print(classification_report(y_test, y_pred, target_names=['Phishing Link', 'Safe to Browse']))


Accuracy: 0.9511
                precision    recall  f1-score   support

 Phishing Link       0.93      0.84      0.89     22860
Safe to Browse       0.96      0.98      0.97     78580

      accuracy                           0.95    101440
     macro avg       0.94      0.91      0.93    101440
  weighted avg       0.95      0.95      0.95    101440



In [23]:
joblib.dump(xgb_model, 'xgboost_phishing_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.
