# 1. Data Acquisition:

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
df = pd.read_csv('malicious_phish.csv')
df

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [3]:
df['type'].unique()

array(['phishing', 'benign', 'defacement', 'malware'], dtype=object)

In [4]:
df['type'].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 2. Data Cleaning and Preprocessing:

In [6]:
def simple_preprocess(url):
    if pd.isna(url):
        return ""
    url = url.replace('https://', '').replace('http://', '').replace('www.', '')
    return url.lower().strip()

df['clean_url'] = df['url'].apply(simple_preprocess)

# 3. Feature Engineering:

In [7]:
def extract_simple_features(url):
    features = {}
    features['length'] = len(url)
    features['num_dots'] = url.count('.')
    features['num_slashes'] = url.count('/')
    features['num_digits'] = sum(c.isdigit() for c in url)
    features['has_https'] = 1 if 'https' in url else 0
    return features

features_list = []
for url in df['clean_url']:
    features_list.append(extract_simple_features(url))
    
features_df = pd.DataFrame(features_list)
df = pd.concat([df, features_df], axis=1)

In [8]:
X = df['clean_url']
y = df['type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Feature Selection and Dimentionality Reduction

In [9]:
vectorizer = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 2)
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 5. Model Selection and Training:

In [10]:
clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X_train_vec, y_train)

# 6. Model Evaluation:

In [11]:
y_pred = clf.predict(X_test_vec)

print("Accuracy:", (y_pred == y_test).mean())
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8476339652485047

Classification Report:
              precision    recall  f1-score   support

      benign       0.85      0.96      0.90     85621
  defacement       0.91      0.85      0.88     19292
     malware       0.96      0.79      0.87      6504
    phishing       0.64      0.36      0.46     18822

    accuracy                           0.85    130239
   macro avg       0.84      0.74      0.78    130239
weighted avg       0.84      0.85      0.83    130239



# 7. Threshold Tuning and Finalization:

In [12]:
def predict_url_with_threshold(url, threshold=0.4):
    clean_url = simple_preprocess(url)

    # Dash heuristic: classify as phishing if '-' present
    if '-' in clean_url:
        return {'url': url, 'prediction': 'phishing', 'confidence': 1.0}

    url_vec = vectorizer.transform([clean_url])
    proba = clf.predict_proba(url_vec)[0]
    classes = clf.classes_

    phishing_idx = list(classes).index('phishing')
    phishing_proba = proba[phishing_idx]

    if phishing_proba >= threshold:
        return {'url': url, 'prediction': 'phishing', 'confidence': phishing_proba}
    else:
        # Predict best non-phishing class
        non_phish_classes = [c for c in classes if c != 'phishing']
        non_phish_probs = [proba[list(classes).index(c)] for c in non_phish_classes]
        max_idx = non_phish_probs.index(max(non_phish_probs))
        return {'url': url, 'prediction': non_phish_classes[max_idx], 'confidence': max(non_phish_probs)}


# 8. Serialization and Deployment:

In [13]:
import joblib

# Save your trained model and vectorizer
joblib.dump(clf, 'rf_phishing_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Later load them in any notebook or app
clf = joblib.load('rf_phishing_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')


# 9. Real-Time Detection Logic:

In [14]:
def real_time_url_check(url, threshold=0.4):
    result = predict_url_with_threshold(url, threshold)

    if result['prediction'] == 'phishing':
        action = 'block'
        alert = f"Phishing URL detected: {url}"
    else:
        action = 'allow'
        alert = None

    return {
        'url': url,
        'action': action,
        'alert': alert    }

# Example test
test_urls = [
    "https://www.paypal.com/login",
    "www.amazon.com",
    "https://en.wikipedia.org/wiki/Main_Page",
    "http://br-icloud.com.br", 
    "secure-payment.com",
    "http://fake-paypal-login.com",
    "http://amazon-security-check.com",
    "http://paypal-secure-verify.com"
]

for url in test_urls:
    res = real_time_url_check(url)
    print(f"URL: {res['url']}, Action: {res['action']}\n")
    if res['alert']:
        print(f"Alert: {res['alert']}\n")


URL: https://www.paypal.com/login, Action: allow

URL: www.amazon.com, Action: allow

URL: https://en.wikipedia.org/wiki/Main_Page, Action: allow

URL: http://br-icloud.com.br, Action: block

Alert: Phishing URL detected: http://br-icloud.com.br

URL: secure-payment.com, Action: block

Alert: Phishing URL detected: secure-payment.com

URL: http://fake-paypal-login.com, Action: block

Alert: Phishing URL detected: http://fake-paypal-login.com

URL: http://amazon-security-check.com, Action: block

Alert: Phishing URL detected: http://amazon-security-check.com

URL: http://paypal-secure-verify.com, Action: block

Alert: Phishing URL detected: http://paypal-secure-verify.com

