# NOTEBOOK: URL PHISHING MODEL (RAW URL)

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import joblib


## Load dataset

In [2]:
df = pd.read_csv("/kaggle/input/urldataset/data.csv")

df.head()


Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [3]:
df["label"] = df["label"].map({
    "bad": 1,     # phishing
    "good": 0     # legitimate
})

df["label"].value_counts()


label
0    344821
1     75643
Name: count, dtype: int64

In [4]:
def extract_url_features(url: str):
    url = url.lower()

    suspicious_words = [
    "login", "verify", "update", "secure", "account",
    "free", "bonus", "reward", "claim",
    "kyc", "blocked", "suspend"
]


    return {
        # Long URLs hide malicious intent
        "url_length": len(url),

        # Many dots = many subdomains (phishing trick)
        "dot_count": url.count("."),

        # Random digits often appear in fake domains
        "digit_count": sum(c.isdigit() for c in url),

        # Hyphens are common in fake URLs
        "hyphen_count": url.count("-"),

        # Deep paths often used to mimic real pages
        "slash_count": url.count("/"),

        # HTTPS absence is suspicious
        "https_present": int(url.startswith("https")),

        # '@' symbol redirects users
        "at_symbol": int("@" in url),

        # IP-based URLs are highly suspicious
        "ip_present": int(url.replace(".", "").isdigit()),

        # Count of scam-related keywords
        "suspicious_word_present": int(any(word in url for word in suspicious_words))

    }


In [5]:
X = df["url"].apply(extract_url_features)
X = pd.DataFrame(list(X))

y = df["label"]

X.head()


Unnamed: 0,url_length,dot_count,digit_count,hyphen_count,slash_count,https_present,at_symbol,ip_present,suspicious_word_present
0,22,1,0,0,0,0,0,0,0
1,16,2,0,0,0,0,0,0,0
2,18,1,0,0,0,0,0,0,0
3,13,1,0,0,0,0,0,0,0
4,21,1,0,0,0,0,0,0,0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [7]:
model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",  # VERY IMPORTANT
    n_jobs=-1
)

model.fit(X_train, y_train)


In [8]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6870964289536584
              precision    recall  f1-score   support

           0       0.89      0.71      0.79     68964
           1       0.31      0.59      0.40     15129

    accuracy                           0.69     84093
   macro avg       0.60      0.65      0.60     84093
weighted avg       0.78      0.69      0.72     84093



In [9]:
feature_importance = pd.Series(
    model.coef_[0],
    index=X.columns
).sort_values(ascending=False)

feature_importance


at_symbol                  2.801006
suspicious_word_present    2.032777
dot_count                  0.418791
slash_count                0.203240
ip_present                 0.140681
digit_count                0.056963
https_present              0.037600
url_length                -0.009413
hyphen_count              -0.405268
dtype: float64

In [10]:
joblib.dump(model, "url_phishing_model.pkl")
print(" URL phishing model trained & saved")


 URL phishing model trained & saved


In [11]:
def test_url(url):
    features = extract_url_features(url)
    X = pd.DataFrame([features])

    prob = model.predict_proba(X)[0][1]

    if prob >= 0.55:   # LOWER threshold
        return "ðŸš¨ Phishing", prob
    else:
        return "âœ… Legit", prob


test_urls = [
    "http://pm-kisan-benefit-verify.in/login",
    "http://sbi-secure-login-alert.com",
    "http://free-reward-claim-now.net",
    "https://www.google.com",
    "https://www.mahagov.in"
]

for u in test_urls:
    print(u, "â†’", test_url(u))


http://pm-kisan-benefit-verify.in/login â†’ ('ðŸš¨ Phishing', np.float64(0.6423089783051568))
http://sbi-secure-login-alert.com â†’ ('ðŸš¨ Phishing', np.float64(0.6079346951202268))
http://free-reward-claim-now.net â†’ ('ðŸš¨ Phishing', np.float64(0.6101760375150048))
https://www.google.com â†’ ('âœ… Legit', np.float64(0.5452764201013474))
https://www.mahagov.in â†’ ('âœ… Legit', np.float64(0.5452764201013474))
