In [1]:

import pandas as pd
import numpy as np
import re
import random


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/FakeUrl/malicious_phish.csv')
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [5]:
df['type'].value_counts(normalize=True) * 100


Unnamed: 0_level_0,proportion
type,Unnamed: 1_level_1
benign,65.741541
defacement,14.812398
phishing,14.452135
malware,4.993927


In [6]:
HOMOGLYPHS = {'a':'@','l':'I','o':'0','e':'3'}
SUBDOMAINS = ['login','secure','verify','account']
TLDS = ['.com','.net','.xyz','.ru','.info']

def mutate_url(url):
    ext = tldextract.extract(url)
    domain = ext.domain + '.' + ext.suffix
    mutations = set()

    # Homoglyph
    chars = list(domain)
    for i, c in enumerate(chars):
        if c in HOMOGLYPHS and random.random() < 0.4:
            chars[i] = HOMOGLYPHS[c]
    mutations.add(''.join(chars))

    # Subdomain injection
    mutations.add(random.choice(SUBDOMAINS) + '.' + domain)

    # TLD switch
    base = domain.split('.')[0]
    mutations.add(base + random.choice(TLDS))

    # Noise injection
    mutations.add(domain.replace('.', '--.'))

    return list(mutations)


In [17]:
augmentation_factor = {
    'defacement': 1,
    'phishing': 1,
    'malware': 5   # strong boost (minority class)
}


In [19]:
!pip install tldextract
import tldextract

aug_urls, aug_labels = [], []

for label, factor in augmentation_factor.items():
    class_df = df[df['type'] == label]

    for _, row in class_df.iterrows():
        for _ in range(factor):
            mutated_urls = mutate_url(row['url'])
            for m in mutated_urls:
                aug_urls.append(m)
                aug_labels.append(label)

aug_df = pd.DataFrame({
    'url': aug_urls,
    'type': aug_labels
})

Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-3.0.1-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-3.0.1-py2.py3-none-any.whl (4.5 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-3.0.1 tldextract-5.3.0


In [21]:
final_df = pd.concat([df[['url','type']], aug_df], ignore_index=True)
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [22]:
final_df['type'].value_counts(normalize=True) * 100


Unnamed: 0_level_0,proportion
type,Unnamed: 1_level_1
malware,32.94881
defacement,23.416262
phishing,22.802787
benign,20.832141


In [23]:
print("Original df size:", df.shape)
print("Augmented df size:", aug_df.shape)
print("Final df size:", final_df.shape)


Original df size: (651191, 2)
Augmented df size: (1403821, 2)
Final df size: (2055012, 2)


In [16]:
df['type'].unique()


array(['phishing', 'benign', 'defacement', 'malware'], dtype=object)

In [35]:
import tldextract
import math

def extract_features(url):
    url = str(url).lower()
    ext = tldextract.extract(url)

    hostname = ext.domain + '.' + ext.suffix if ext.domain else ""

    path = ""
    if ext.suffix and ext.suffix in url:
        path = url.split(ext.suffix)[-1]
    elif ext.domain and ext.domain in url:
        # If suffix is empty but domain exists, consider path after domain
        domain_index = url.find(ext.domain)
        path_start_index = domain_index + len(ext.domain)
        path = url[path_start_index:]
    else:
        # Fallback for URLs without clear domain/suffix or other structures
        # Consider everything after scheme and potential hostname part
        scheme_end = url.find('://')
        if scheme_end != -1:
            path_start = url.find('/', scheme_end + 3) # Find first slash after scheme and potential host
            if path_start != -1:
                path = url[path_start:]
            else:
                path = "/" # If no slash after scheme, treat as root
        else: # No scheme, just take the whole thing as path if no domain found
            path = url

    return [
        len(url),                             # URL length
        url.count('.'),
        url.count('-'),
        url.count('_'),
        url.count('/'),
        url.count('@'),
        sum(c.isdigit() for c in url),
        int(bool(re.search(r'\d+\.\d+\.\d+\.\d+', url))),  # IP-based
        int(url.startswith('https')),
        len(ext.subdomain.split('.')) if ext.subdomain else 0,  # subdomains
        len(path),                             # path length
        int(any(w in url for w in ['login','verify','secure','account'])),
        int(any(w in url for w in ['admin','upload','config','shell'])),  # defacement hint
        int(any(w in url for w in ['free','win','bonus','gift'])),        # phishing hint
        url.count('%'),                        # encoding
        int('//' in url[8:]),                  # redirection
    ]

In [34]:
X = np.array([extract_features(u) for u in final_df['url']])
y = final_df['type'].values


In [36]:
X.shape, y.shape


((2055012, 16), (2055012,))

In [37]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

print(dict(zip(le.classes_, le.transform(le.classes_))))


{'benign': np.int64(0), 'defacement': np.int64(1), 'malware': np.int64(2), 'phishing': np.int64(3)}


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, X_seq_train, X_seq_test, y_train, y_test = train_test_split(
    X, X_seq, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [29]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced'  # IMPORTANT
)

model.fit(X_train, y_train)


In [30]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

      benign       0.78      0.86      0.82     85621
  defacement       0.58      0.56      0.57     96241
     malware       0.84      0.74      0.78    135421
    phishing       0.46      0.51      0.49     93720

    accuracy                           0.67    411003
   macro avg       0.66      0.67      0.66    411003
weighted avg       0.68      0.67      0.67    411003



In [39]:
!pip install xgboost




In [40]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    n_estimators=400,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    tree_method='hist',
    random_state=42
)

xgb.fit(X_train, y_train)


In [41]:
from sklearn.metrics import classification_report

y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

      benign       0.95      0.96      0.95     85621
  defacement       0.62      0.68      0.65     96241
     malware       0.79      0.80      0.79    135421
    phishing       0.54      0.47      0.51     93720

    accuracy                           0.73    411003
   macro avg       0.72      0.73      0.73    411003
weighted avg       0.73      0.73      0.73    411003



In [43]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Character-level tokenizer
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(final_df['url'])

sequences = tokenizer.texts_to_sequences(final_df['url'])
X_seq = pad_sequences(sequences, maxlen=200, padding='post')


In [44]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

vocab_size = len(tokenizer.word_index) + 1

input_seq = Input(shape=(200,))
x = Embedding(vocab_size, 64)(input_seq)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
cnn_output = Dense(64, activation='relu')(x)

cnn_model = Model(inputs=input_seq, outputs=cnn_output)


In [45]:
from tensorflow.keras.layers import Concatenate

# Handcrafted features input
input_feat = Input(shape=(X.shape[1],))

combined = Concatenate()([cnn_output, input_feat])
z = Dense(128, activation='relu')(combined)
z = Dropout(0.5)(z)
final_output = Dense(4, activation='softmax')(z)

hybrid_model = Model(inputs=[input_seq, input_feat], outputs=final_output)


In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, X_seq_train, X_seq_test, y_train, y_test = train_test_split(
    X,
    X_seq,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [48]:
hybrid_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

hybrid_model.fit(
    [X_seq_train, X_train],
    y_train,
    validation_split=0.1,
    batch_size=256,
    epochs=5
)


Epoch 1/5
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 9ms/step - accuracy: 0.7895 - loss: 0.5860 - val_accuracy: 0.9344 - val_loss: 0.1890
Epoch 2/5
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 8ms/step - accuracy: 0.9362 - loss: 0.1945 - val_accuracy: 0.9523 - val_loss: 0.1422
Epoch 3/5
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8ms/step - accuracy: 0.9488 - loss: 0.1577 - val_accuracy: 0.9579 - val_loss: 0.1239
Epoch 4/5
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8ms/step - accuracy: 0.9551 - loss: 0.1395 - val_accuracy: 0.9622 - val_loss: 0.1131
Epoch 5/5
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8ms/step - accuracy: 0.9587 - loss: 0.1278 - val_accuracy: 0.9644 - val_loss: 0.1069


<keras.src.callbacks.history.History at 0x78f780e299a0>

In [49]:
from tensorflow.keras.callbacks import EarlyStopping


In [50]:
early_stop = EarlyStopping(
    monitor='val_loss',       # watch validation loss
    patience=2,               # stop if no improvement for 2 epochs
    restore_best_weights=True # VERY IMPORTANT
)


In [51]:
hybrid_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)


In [52]:
history = hybrid_model.fit(
    [X_seq_train, X_train],
    y_train,
    validation_split=0.1,
    batch_size=256,
    epochs=20,          # allow more epochs
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/20
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 8ms/step - accuracy: 0.9613 - loss: 0.1203 - val_accuracy: 0.9671 - val_loss: 0.0960
Epoch 2/20
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 8ms/step - accuracy: 0.9632 - loss: 0.1131 - val_accuracy: 0.9673 - val_loss: 0.0960
Epoch 3/20
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8ms/step - accuracy: 0.9650 - loss: 0.1083 - val_accuracy: 0.9710 - val_loss: 0.0865
Epoch 4/20
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8ms/step - accuracy: 0.9666 - loss: 0.1025 - val_accuracy: 0.9713 - val_loss: 0.0849
Epoch 5/20
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8ms/step - accuracy: 0.9673 - loss: 0.1004 - val_accuracy: 0.9704 - val_loss: 0.0861
Epoch 6/20
[1m5780/5780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8ms/step - accuracy: 0.9688 - loss: 0.0966 - val_accuracy: 0.9726 - val_loss: 0.0826
Epoch 7/20

In [53]:
y_pred = hybrid_model.predict([X_seq_test, X_test])
y_pred = y_pred.argmax(axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=le.classes_))


[1m12844/12844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step
              precision    recall  f1-score   support

      benign       0.98      0.99      0.98     85621
  defacement       0.99      0.99      0.99     96241
     malware       0.99      0.97      0.98    135421
    phishing       0.94      0.96      0.95     93720

    accuracy                           0.98    411003
   macro avg       0.97      0.98      0.98    411003
weighted avg       0.98      0.98      0.98    411003



In [55]:
def predict_url(url):
    # 1️⃣ Handcrafted features
    feat = extract_features(url)
    feat = np.array(feat).reshape(1, -1)

    # 2️⃣ Character-level sequence
    seq = tokenizer.texts_to_sequences([url])
    seq = pad_sequences(seq, maxlen=200, padding='post')

    # 3️⃣ Hybrid model prediction
    probs = hybrid_model.predict([seq, feat])[0]

    # 4️⃣ Decode output
    pred_class = np.argmax(probs)
    pred_label = le.inverse_transform([pred_class])[0]
    risk_score = int(np.max(probs) * 100)

    return {
        "url": url,
        "prediction": pred_label,
        "risk_score": risk_score,
        "class_probabilities": dict(zip(le.classes_, probs))
    }


In [56]:
predict_url("https://www.amazon.com")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 686ms/step


{'url': 'http://paypaI-secure-login-update.xyz',
 'prediction': 'phishing',
 'risk_score': 99,
 'class_probabilities': {'benign': np.float32(1.6733842e-11),
  'defacement': np.float32(8.338429e-05),
  'malware': np.float32(0.0006990576),
  'phishing': np.float32(0.9992175)}}

In [57]:
predict_url("https://www.amazon.com")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


{'url': 'https://www.amazon.com',
 'prediction': 'phishing',
 'risk_score': 99,
 'class_probabilities': {'benign': np.float32(1.3271576e-15),
  'defacement': np.float32(2.1028896e-09),
  'malware': np.float32(0.0002742653),
  'phishing': np.float32(0.99972576)}}

In [58]:
predict_url("https://www.amazon.dom")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


{'url': 'https://www.amazon.dom',
 'prediction': 'phishing',
 'risk_score': 99,
 'class_probabilities': {'benign': np.float32(4.6952212e-14),
  'defacement': np.float32(3.0463667e-09),
  'malware': np.float32(0.00023682395),
  'phishing': np.float32(0.9997632)}}

In [59]:
predict_url("https://www.amazon1.dom")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


{'url': 'https://www.amazon1.dom',
 'prediction': 'phishing',
 'risk_score': 99,
 'class_probabilities': {'benign': np.float32(5.726577e-13),
  'defacement': np.float32(9.432622e-09),
  'malware': np.float32(0.00045776277),
  'phishing': np.float32(0.9995422)}}