In [21]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import requests

In [22]:
# Read the phishing dataset from disk and discard its 'id' column in one
# expression, so re-running this cell always rebuilds `dataset` from the file.
dataset = (
    pd.read_csv("/content/uci-ml-phishing-dataset.csv")
    .drop(columns='id')
)

In [23]:
# Separate the target from the inputs: 'Result' is the label column and every
# remaining column is a model input. Both are handed on as NumPy arrays.
y = dataset['Result'].to_numpy()
x = dataset.drop(columns=['Result']).to_numpy()

# Hold back 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [24]:
# Build and fit a 100-tree random forest in one step (fit() returns the
# estimator itself); the fixed seed keeps the fitted model repeatable.
classifier = RandomForestClassifier(random_state=0, n_estimators=100).fit(x_train, y_train)

# Predict on both partitions: the rows the model was fitted on and the
# held-out rows.
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# Fraction of correctly predicted labels on each partition.
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print("Training Accuracy:", accuracy_train)
print("Testing Accuracy:", accuracy_test)

Training Accuracy: 0.9906151062867481
Testing Accuracy: 0.9665309814563546


In [25]:
def extract_features(url, timeout=10):
    """Fetch ``url`` and, if it answers with HTTP 200, return lexical URL features.

    The page is requested only to check that it is reachable; every feature
    is computed from the URL string itself, not from the response body.

    Args:
        url: The URL to probe and describe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single
            unresponsive host would stall the whole notebook.

    Returns:
        A list of 30 values (substring counts, the URL length and two
        scheme flags), or ``None`` when the request fails, times out, or
        the response status is not 200.

    NOTE(review): these are plain character/substring counts. Whether they
    line up with the columns the classifier was trained on cannot be
    confirmed from this cell -- verify against the dataset schema.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Best-effort probe: any request-level failure (connection error,
        # timeout, invalid URL, ...) simply means "no features available".
        return None

    if response.status_code != 200:
        return None

    # The order and length of this vector are part of the contract with the
    # caller, which passes it straight to the classifier. Several entries are
    # repeated ('-', '.', '//', '.com'); they are kept so the vector stays
    # 30 entries long.
    return [
        url.count('-'),
        url.count('@'),
        url.count('//'),
        url.count('.'),
        len(url),
        url.startswith('https://'),
        url.startswith('http://'),
        url.count('?'),
        url.count('='),
        url.count('.com'),
        url.count('-'),
        url.count('.'),
        url.count('www.'),
        url.count('https'),
        url.count('http'),  # also matches the 'http' inside 'https'
        url.count('//'),
        url.count('.com'),
        url.count('.org'),
        url.count('.net'),
        url.count('.info'),
        url.count('.biz'),
        url.count('.gov'),
        url.count('.edu'),
        url.count('.mil'),
        url.count('.int'),
        url.count('.eu'),
        url.count('.tv'),
        url.count('.us'),
        url.count('.cc'),
        url.count('.name'),
    ]

In [26]:
# Sample URLs to run through the classifier, in the order they are reported.
urls = [
    "https://www.google.com/",
    "http://www.google.com.test-security.tk/",
    "https://www.paypal.com/signin",
    "https://www.paypai.com/signin",
    "https://github.com/Santhoshnov/Browsecure",
    "https://gethub.com/Santhoshnov/Browsecure",
]

In [27]:
# Predict the URLs
for url in urls:
    features = extract_features(url)
    if features is None:
        print(f"{url}: Phishing")
    else:
        prediction = classifier.predict([features])[0]
        result = 'Legitimate' if prediction == 1 else 'Phishing'
        print(f"{url}: {result}")


y_pred_test = classifier.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", accuracy_test)

https://www.google.com/: Legitimate
http://www.google.com.test-security.tk/: Phishing
https://www.paypal.com/signin: Legitimate
https://www.paypai.com/signin: Phishing
https://github.com/Santhoshnov/Browsecure: Legitimate
https://gethub.com/Santhoshnov/Browsecure: Phishing
Testing Accuracy: 0.9665309814563546
