In [1]:
pip install tldextract

Note: you may need to restart the kernel to use updated packages.


In [1]:
import re
import tldextract

# Define a list of common phishing-related keywords
phishing_keywords = ['login', 'secure', 'account', 'verify', 'update', 'bank', 'password', 'payment']

# Define a list of known suspicious domain extensions
suspicious_tlds = ['tk', 'ml', 'ga', 'cf', 'gq']  # Example suspicious TLDs

# Define a list of known phishing domains (for demo purposes)
known_phishing_domains = ['bad-website.tk', 'malicious-site.ga']

# Function to extract features from a URL
def extract_features(url):
    features = {}

    # Check for suspicious characters in the URL
    features['has_at_symbol'] = '@' in url
    features['has_double_slash'] = '//' in url.strip().lstrip('http:').lstrip('https:')
    features['has_ip_address'] = bool(re.match(r'http[s]?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url))
    
    # Extract domain and TLD
    ext = tldextract.extract(url)
    features['domain'] = ext.domain
    features['subdomain'] = ext.subdomain
    features['tld'] = ext.suffix

    return features

# Function to determine if a URL is phishing
def is_phishing(url):
    features = extract_features(url)

    # Basic checks for phishing characteristics
    if features['has_at_symbol'] or features['has_double_slash'] or features['has_ip_address']:
        return True

    # Check if the TLD is suspicious
    if features['tld'] in suspicious_tlds:
        return True

    # Check for known phishing domains
    full_domain = f"{features['subdomain']}.{features['domain']}.{features['tld']}"
    if full_domain in known_phishing_domains:
        return True

    # Check for common phishing-related keywords in the domain or path
    for keyword in phishing_keywords:
        if keyword in url.lower():
            return True

    return False

# Test the program with some example URLs
test_urls = [
    'https://secure-login.payment.tk',  # Suspicious domain and TLD
    'http://192.168.0.1/admin',  # IP address
    'https://www.example.com',  # Legitimate URL
    'http://www.bank-update.com',  # Contains phishing keyword
]

# Check each URL to see if it's a potential phishing site
for url in test_urls:
    if is_phishing(url):
        print(f"The URL '{url}' is likely a phishing site.")
    else:
        print(f"The URL '{url}' seems safe.")


The URL 'https://secure-login.payment.tk' is likely a phishing site.
The URL 'http://192.168.0.1/admin' is likely a phishing site.
The URL 'https://www.example.com' is likely a phishing site.
The URL 'http://www.bank-update.com' is likely a phishing site.


In [4]:
import re
import tldextract

# Define a list of common phishing-related keywords
phishing_keywords = ['login', 'secure', 'account', 'verify', 'update', 'bank', 'password', 'payment']

# Define a list of known suspicious domain extensions
suspicious_tlds = ['tk', 'ml', 'ga', 'cf', 'gq']  # Example suspicious TLDs

# Define a list of known phishing domains (for demo purposes)
known_phishing_domains = ['bad-website.tk', 'malicious-site.ga']

# Function to extract features from a URL
def extract_features(url):
    features = {}

    # Check for suspicious characters in the URL
    features['has_at_symbol'] = '@' in url
    features['has_double_slash'] = '//' in url.strip().lstrip('http:').lstrip('https:')
    features['has_ip_address'] = bool(re.match(r'http[s]?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url))
    
    # Extract domain and TLD
    ext = tldextract.extract(url)
    features['domain'] = ext.domain
    features['subdomain'] = ext.subdomain
    features['tld'] = ext.suffix

    return features

# Function to determine if a URL is phishing
def is_phishing(url):
    features = extract_features(url)

    # Basic checks for phishing characteristics
    if features['has_at_symbol'] or features['has_double_slash'] or features['has_ip_address']:
        return True

    # Check if the TLD is suspicious
    if features['tld'] in suspicious_tlds:
        return True

    # Check for known phishing domains
    full_domain = f"{features['subdomain']}.{features['domain']}.{features['tld']}"
    if full_domain in known_phishing_domains:
        return True

    # Check for common phishing-related keywords in the domain or path
    for keyword in phishing_keywords:
        if keyword in url.lower():
            return True

    return False

# Main function to run the phishing detection
def main():
    # Get URL input from the user
    user_url = input("Please enter a URL to check for phishing: ")

    # Check if the URL is likely phishing
    if is_phishing(user_url):
        print(f"The URL '{user_url}' is likely a phishing site.")
    else:
        print(f"The URL '{user_url}' seems safe.")

# Run the main function
if __name__ == "__main__":
    main()


Please enter a URL to check for phishing:  https://www.facebook.com/


The URL 'https://www.facebook.com/' is likely a phishing site.


In [6]:
import re
import tldextract

# Known phishing-related keywords (reduced list to avoid false positives)
phishing_keywords = ['verify', 'update', 'secure']

# Suspicious TLDs (remove common legitimate ones)
suspicious_tlds = ['tk', 'ml', 'ga', 'cf', 'gq']

# Known phishing domains
known_phishing_domains = ['bad-website.tk', 'malicious-site.ga']

# Function to extract features from a URL
def extract_features(url):
    features = {}

    # Check for suspicious characters in the URL
    features['has_at_symbol'] = '@' in url
    features['has_double_slash'] = '//' in url.strip().lstrip('http:').lstrip('https:')
    features['has_ip_address'] = bool(re.match(r'http[s]?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url))
    
    # Extract domain and TLD
    ext = tldextract.extract(url)
    features['domain'] = ext.domain
    features['subdomain'] = ext.subdomain
    features['tld'] = ext.suffix

    return features

# Function to determine if a URL is phishing
def is_phishing(url):
    features = extract_features(url)

    # Check for basic phishing characteristics
    if features['has_at_symbol'] or features['has_ip_address']:
        return True

    # Check if the TLD is suspicious
    if features['tld'] in suspicious_tlds:
        return True

    # Check for known phishing domains
    full_domain = f"{features['subdomain']}.{features['domain']}.{features['tld']}"
    if full_domain in known_phishing_domains:
        return True

    # Check for specific phishing-related keywords in the domain or path
    # Avoid false positives by using a narrower keyword list
    for keyword in phishing_keywords:
        if keyword in url.lower():
            return True

    return False

# Main function to run the phishing detection
def main():
    user_url = input("Please enter a URL to check for phishing: ")

    if is_phishing(user_url):
        print(f"The URL '{user_url}' might be a phishing site.")
    else:
        print(f"The URL '{user_url}' seems safe.")

if __name__ == "__main__":
    main()


Please enter a URL to check for phishing:  https://altoro.testfire.net/


The URL 'https://altoro.testfire.net/' seems safe.


In [10]:
import re
import tldextract

# Define a list of common phishing-related keywords (for heuristic checks)
phishing_keywords = ['verify', 'update', 'secure']

# Define a list of suspicious TLDs
suspicious_tlds = ['tk', 'ml', 'ga', 'cf', 'gq']  # Example suspicious TLDs

# List of known phishing domains (including the new one you mentioned)
known_phishing_domains = [
    'att-103234.weeblysite.com',
    'mntamasklogin.gitbook.io',
    'anmilopbe.blogspot.co.id',
    'ligne-orange4.godaddysites.com',
    'telstra-108709.weeblysite.com',
    'bt-105132.weeblysite.com',
    'yahoo-mail-109605.weeblysite.com',
    'bafybeiaylac7v34xccdujkx5l4ulnwfgq7nbwxux2ntsxz2hemzo75ox3y.ipfs.cf-ipfs.com',
    'mail-109392.weeblysite.com',
    'steveoffice.pages.dev',
    'mako-nordic.com',
    'generatorfreeaccounts.blogspot.my',
    'lacasadelcuarzo.com',
    'pub-b090674c9132409c92022d05de5e8ca4.r2.dev/index.html',
    'quasukienff.grarena.vn',
    'netzero-106257.weeblysite.com',
    'document.mamabiller59.workers.dev',
    'serverouttnethicationdomainservicesmails02.pages.dev',
    'ow0ohrbcaz28.optimytool.com',
    'we-zc-ash.com',
    'terimaa-hadiiah-giveaaway.resmiii-vippp.my.id',
    'piv-cfe.pages.dev/07e55d85-a0b3-4906-8e89-327bd8a209d9',
    'barclaycard-pvn.de',
    'netflix-clone-1p2.pages.dev',
    'id-1924472.page-hotels.top/p/767242222',
    'bafybeifadngqcxjj5qer7of2h4enrzufunfen4oolr25pixwyerf6mzjam.ipfs.cf-ipfs.com',
    'lescondimentsdahoe.com',
    'floral-king-ebb0.tangguayhomes.workers.dev',
    'pub-a800c74c40594acb86f6637cf180050a.r2.dev/index.html',
    'bafybeickmtazhjxetzymqkorn4rx24weds5qxk6fpvef2dd6e4uuwqkz3a.ipfs.cf-ipfs.com',
]

# Function to extract features from a URL
def extract_features(url):
    features = {}

    # Check for suspicious characters
    features['has_at_symbol'] = '@' in url
    features['has_double_slash'] = '//' in url.strip().lstrip('http:').lstrip('https:')
    features['has_ip_address'] = bool(re.match(r'http[s]?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url))

    # Extract domain and TLD
    ext = tldextract.extract(url)
    features['domain'] = ext.domain
    features['subdomain'] = ext.subdomain
    features['tld'] = ext.suffix

    return features

# Function to determine if a URL is phishing
def is_phishing(url):
    features = extract_features(url)

    # Basic checks for phishing characteristics
    if features['has_at_symbol'] or features['has_double_slash'] or features['has_ip_address']:
        return True

    # Check if the TLD is suspicious
    if features['tld'] in suspicious_tlds:
        return True

    # Check if the domain is in the list of known phishing domains
    full_domain = f"{features['subdomain']}.{features['domain']}.{features['tld']}"
    if full_domain in known_phishing_domains:
        return True

    # Check for specific phishing-related keywords in the domain or path
    for keyword in phishing_keywords:
        if keyword in url.lower():
            return True

    return False

# Main function to run the phishing detection
def main():
    user_url = input("Please enter a URL to check for phishing: ")

    if is_phishing(user_url):
        print(f"The URL '{user_url}' is likely a phishing site.")
    else:
        print(f"The URL '{user_url}' seems safe.")

if __name__ == "__main__":
    main()


Please enter a URL to check for phishing:  https://www.facebook.com


The URL 'https://www.facebook.com' is likely a phishing site.


In [11]:
pip install pandas scikit-learn tldextract

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import re
import tldextract
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Function to extract features from a URL
def extract_features(url):
    features = {}

    # Check for suspicious characters
    features['has_at_symbol'] = '@' in url
    features['has_double_slash'] = '//' in url.strip().lstrip('http:').lstrip('https:')
    features['has_ip_address'] = bool(re.match(r'http[s]?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url))

    # Extract domain and TLD
    ext = tldextract.extract(url)
    features['domain_length'] = len(ext.domain)
    features['subdomain_count'] = len(ext.subdomain.split('.'))
    features['tld'] = ext.suffix

    # Check for common phishing-related keywords
    phishing_keywords = ['verify', 'update', 'secure', 'login', 'account']
    features['contains_keyword'] = any(keyword in url.lower() for keyword in phishing_keywords)

    return list(features.values())

# Load data from CSV
data = pd.read_csv('dataset_phishing.csv')

# Extract features for each URL in the dataset
data['features'] = data['URL'].apply(extract_features)

# Split features and labels
X = pd.DataFrame(data['features'].tolist())  # Features are in list form
y = data['Label']  # Labels: 1 for phishing, 0 for non-phishing

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)

# Check accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2%}")

# Function to classify a new URL
def classify_url(url):
    features = extract_features(url)
    prediction = model.predict([features])
    if prediction[0] == 1:
        return f"The URL '{url}' is likely a phishing site."
    else:
        return f"The URL '{url}' seems safe."

# Test the classification with a user-provided URL
user_url = input("Enter a URL to check for phishing: ")
print(classify_url(user_url))


KeyError: 'URL'

In [6]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load data from CSV and display the first few rows
data = pd.read_csv('your_file.csv')
print("Data preview:")
print(data.head())

# Strip any leading/trailing spaces from column names
data.columns = data.columns.str.strip()

# Check if necessary columns exist
required_columns = ['url', 'status']  # 'status' is the target variable
missing_columns = [col for col in required_columns if col not in data.columns]

if missing_columns:
    raise KeyError(f"Missing required columns: {missing_columns}")

# Features and target variable
X = data.drop(['status', 'url'], axis=1)  # Drop 'status' and 'url' from features
y = data['status']  # 'status' is the target variable (1 = phishing, 0 = non-phishing)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees in the forest
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2%}")

# Function to classify a new URL based on extracted features
def classify_url(url):
    # Create a dictionary of features with default values
    default_features = {
        'length_url': 0,
        'length_hostname': 0,
        'ip': 0,
        'nb_dots': 0,
        'nb_hyphens': 0,
        # Add the rest of the features with default or calculated values
        # ...
    }
    
    # Assume a simple extraction logic for length_url
    default_features['length_url'] = len(url)

    # Extract specific features from the given URL (as per your headers)
    # Example: number of dots in the URL
    default_features['nb_dots'] = url.count('.')

    # Transform the dictionary into a DataFrame to match the training data structure
    feature_vector = pd.DataFrame([default_features])

    # Predict using the trained model
    prediction = model.predict(feature_vector)

    return prediction[0]  # 1 = phishing, 0 = non-phishing

# Test the classification with a user-provided URL
user_url = input("Enter a URL to check for phishing: ")
is_phishing = classify_url(user_url)

if is_phishing:
    print(f"The URL '{user_url}' is likely a phishing site.")
else:
    print(f"The URL '{user_url}' seems safe.")


FileNotFoundError: [Errno 2] No such file or directory: 'your_file.csv'