In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [17]:
# Load the dataset from a CSV file resolved relative to the working directory.
df=pd.read_csv("dataset_phishing.csv")

In [18]:
# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [19]:
# target feature mapping
# Encode the label column as integers: phishing -> 1, legitimate -> 0.
# The guard keeps this cell idempotent: Series.map yields NaN for values that
# are not keys of the dict, so re-running the unguarded assignment on the
# already-encoded 0/1 column would wipe every label.
if not pd.api.types.is_numeric_dtype(df['status']):
    df['status'] = df['status'].map({'phishing': 1, 'legitimate': 0})

In [20]:
# Feature matrix: every column except the raw URL string and the label.
# NOTE(review): the df.head() output shows a leading 'Unnamed: 0' header; if
# that is a real CSV column (not just the index) it stays in X — confirm.
X = df.drop(columns=['url', 'status'])  
# Target vector: the encoded status column.
y = df['status']  

In [21]:
# Display the raw URL column (it is dropped from X, so it is not a model input).
df.url

0                    http://www.crestonwood.com/router.php
1        http://shadetreetechnology.com/V4/validation/a...
2        https://support-appleld.com.secureupdate.duila...
3                                       http://rgipt.ac.in
4        http://www.iracing.com/tracks/gateway-motorspo...
                               ...                        
11425        http://www.fontspace.com/category/blackletter
11426    http://www.budgetbots.com/server.php/Server%20...
11427    https://www.facebook.com/Interactive-Televisio...
11428               http://www.mypublicdomainpictures.com/
11429    http://174.139.46.123/ap/signin?openid.pape.ma...
Name: url, Length: 11430, dtype: object

In [22]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [24]:
# Predict labels for the held-out rows.
y_pred = rf_model.predict(X_test)

In [25]:
# Score the held-out predictions: overall accuracy plus the per-class
# precision / recall / f1 table.
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

# accuracy is a fraction; it is printed as a percentage with two decimals.
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report_output)

Model Accuracy: 96.76%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1732
           1       0.97      0.96      0.97      1697

    accuracy                           0.97      3429
   macro avg       0.97      0.97      0.97      3429
weighted avg       0.97      0.97      0.97      3429



In [26]:
# Make sure numpy is installed (it is imported as np in a later cell).
pip install numpy


Note: you may need to restart the kernel to use updated packages.


In [27]:
pip install whois

Note: you may need to restart the kernel to use updated packages.


In [28]:
import re
import urllib.parse as urlparse
from datetime import datetime
import numpy as np
import socket
import whois

In [29]:
import re
import socket
import urllib.parse as urlparse
from datetime import datetime
import whois

def _digit_ratio(text):
    """Return the fraction of characters in ``text`` that are digits (0.0 if empty)."""
    if not text:
        return 0.0
    return sum(ch.isdigit() for ch in text) / len(text)


def _first_value(value):
    """Return the first element of a list/tuple, or ``value`` itself otherwise."""
    if isinstance(value, (list, tuple)):
        return value[0] if value else None
    return value


def extract_features(url):
    """
    Extract the URL-derived features that can be computed without page content.

    Lexical features come from the URL string, its netloc, path and query.
    WHOIS and DNS features are best-effort network lookups: a failed or
    impossible lookup (for example a URL without a host) yields ``None`` for
    that feature instead of raising.

    Page-content features and external-data features are NOT produced; their
    keys are absent from the result and the caller must supply defaults.
    Several lexical features are simple heuristics and are not guaranteed to
    reproduce the values stored in the training CSV.

    Args:
        url (str): The URL to analyze. May be empty or lack a scheme/host.

    Returns:
        dict: Feature name -> value. Values are ints, floats, or ``None`` when
        a feature could not be determined.
    """
    features = {}

    # Parsing the URL
    parsed_url = urlparse.urlparse(url)
    domain = parsed_url.netloc
    path = parsed_url.path
    query = parsed_url.query
    # Bare host without userinfo or port: the only form WHOIS/DNS can resolve.
    hostname = parsed_url.hostname or ''

    url_lower = url.lower()
    domain_lower = domain.lower()
    path_lower = path.lower()

    # --- Lengths -----------------------------------------------------------
    features['length_url'] = len(url)
    # Measured on the same netloc string as the other host features below.
    features['length_hostname'] = len(domain)

    # --- Character / token counts over the whole URL -----------------------
    features['nb_dots'] = url.count('.')
    features['nb_hyphens'] = url.count('-')
    features['nb_at'] = url.count('@')
    features['nb_qm'] = url.count('?')
    features['nb_and'] = url.count('&')
    features['nb_or'] = url_lower.count('or')
    features['nb_eq'] = url.count('=')
    features['nb_underscore'] = url.count('_')
    features['nb_tilde'] = url.count('~')
    features['nb_percent'] = url.count('%')
    features['nb_slash'] = url.count('/')
    features['nb_star'] = url.count('*')
    features['nb_colon'] = url.count(':')
    features['nb_comma'] = url.count(',')
    features['nb_semicolumn'] = url.count(';')
    features['nb_dollar'] = url.count('$')
    features['nb_space'] = url.count(' ')
    features['nb_www'] = url_lower.count('www')
    features['nb_com'] = url_lower.count('.com')
    features['nb_dslash'] = url.count('//')

    # --- Scheme tokens in unexpected places --------------------------------
    features['http_in_path'] = int('http' in path_lower)
    features['https_token'] = int('https' in domain_lower)

    # --- Digit ratios (0.0 instead of ZeroDivisionError on empty text) ------
    features['ratio_digits_url'] = _digit_ratio(url)
    features['ratio_digits_host'] = _digit_ratio(domain)

    # --- Host structure ----------------------------------------------------
    features['punycode'] = int('xn--' in domain_lower)
    features['port'] = int(':' in domain)

    common_tlds = ['.com', '.net', '.org', '.info', '.biz', '.in']
    features['tld_in_path'] = int(any(tld in path_lower for tld in common_tlds))
    # NOTE(review): this scans the whole netloc, so the real TLD also matches.
    features['tld_in_subdomain'] = int(any(tld in domain_lower for tld in common_tlds))

    host_labels = domain.split('.')
    features['abnormal_subdomain'] = int(len(host_labels) > 3)
    features['nb_subdomains'] = len(host_labels) - 1
    features['prefix_suffix'] = int('-' in domain)
    # "Random" is approximated as any run of 8 or more letters in the netloc.
    features['random_domain'] = int(bool(re.search(r'[a-zA-Z]{8,}', domain)))

    shortening_services = r"bit\.ly|goo\.gl|tinyurl\.com|ow\.ly|t\.co"
    features['shortening_service'] = int(bool(re.search(shortening_services, domain)))

    # --- Path / query ------------------------------------------------------
    features['path_extension'] = int(bool(re.search(r'\.\w{2,4}$', path)))
    features['nb_redirection'] = path.count('//')
    # 'https' contains 'http', so a single membership test covers both.
    features['nb_external_redirection'] = int('http' in query.lower())

    # --- Word statistics ---------------------------------------------------
    # re.split / str.split always return at least one (possibly empty) item,
    # so the raw-URL and average computations never see an empty list.
    words_raw = re.split(r'\W+', url)
    words_host = host_labels
    words_path = path.split('/')

    features['length_words_raw'] = sum(len(word) for word in words_raw)
    # default=0 covers the empty URL, where there are no characters to count.
    features['char_repeat'] = max((url.count(c) for c in set(url)), default=0)

    features['shortest_words_raw'] = min(len(word) for word in words_raw)
    features['shortest_word_host'] = min(len(word) for word in words_host) if domain else None
    features['shortest_word_path'] = min(len(word) for word in words_path) if path else None

    features['longest_words_raw'] = max(len(word) for word in words_raw)
    features['longest_word_host'] = max(len(word) for word in words_host) if domain else None
    features['longest_word_path'] = max(len(word) for word in words_path) if path else None

    features['avg_words_raw'] = features['length_words_raw'] / len(words_raw)
    features['avg_word_host'] = sum(len(word) for word in words_host) / len(words_host)
    features['avg_word_path'] = sum(len(word) for word in words_path) / len(words_path)

    # --- Keyword / brand hints ---------------------------------------------
    features['phish_hints'] = int(any(hint in url_lower for hint in ('login', 'secure', 'libgen')))
    features['domain_in_brand'] = int(any(brand in domain_lower for brand in ('paypal', 'apple', 'airtel')))
    first_label = host_labels[0].lower()
    features['brand_in_subdomain'] = int('paypal' in first_label or 'apple' in first_label)
    features['brand_in_path'] = int(any(brand in path_lower for brand in ('paypal', 'apple', 'airtel')))

    suspicious_tlds = ['.top', '.gq', '.tk', '.ml', '.cf']
    features['suspecious_tld'] = int(any(tld in domain_lower for tld in suspicious_tlds))

    # Page-content features (hyperlink/media/error ratios, login form, favicon,
    # sfh, iframe, popup, anchors, title, copyright, ...) and external-data
    # features (statistical_report, web_traffic, google_index, page_rank) are
    # not computed: they need the page HTML or third-party data.

    # --- WHOIS (best effort) -----------------------------------------------
    whois_info = None
    if hostname:
        try:
            whois_info = whois.whois(hostname)
        except Exception:
            # Lookup libraries raise many unrelated error types (network,
            # parsing, missing API); any of them means "no WHOIS data".
            whois_info = None

    if whois_info is None:
        features['whois_registered_domain'] = None
        features['domain_registration_length'] = None
        features['domain_age'] = None
    else:
        # Emit a numeric flag rather than the raw name (a str or list cannot be
        # cast to float by the prediction step). The sample rows shown by
        # df.head() carry 0 for domains that have WHOIS data, so a resolved
        # name is encoded as 0 and an empty record as 1.
        # NOTE(review): confirm this polarity against the dataset documentation.
        registered_name = _first_value(getattr(whois_info, 'domain_name', None))
        features['whois_registered_domain'] = 0 if registered_name else 1

        # WHOIS dates may arrive as a list of candidates; use the first one.
        creation_date = _first_value(getattr(whois_info, 'creation_date', None))
        expiration_date = _first_value(getattr(whois_info, 'expiration_date', None))

        try:
            features['domain_registration_length'] = (
                (expiration_date - creation_date).days
                if expiration_date and creation_date else 0
            )
        except (TypeError, AttributeError):
            # Unparsed (string) dates or mixed naive/aware datetimes.
            features['domain_registration_length'] = None

        try:
            features['domain_age'] = (
                (datetime.now() - creation_date).days if creation_date else None
            )
        except (TypeError, AttributeError):
            features['domain_age'] = None

    # --- DNS (best effort) -------------------------------------------------
    features['dns_record'] = None
    if hostname:
        try:
            if socket.gethostbyname(hostname):
                features['dns_record'] = 1
        except (OSError, UnicodeError):
            # Resolution failure or a host name that cannot be encoded.
            pass

    return features



In [30]:
import pandas as pd

def predict_url_status(url, rf_model, feature_columns):
    """
    Predict the status of a URL with a fitted classifier.

    Args:
        url (str): The URL to analyze.
        rf_model: Fitted estimator exposing ``predict``; it is called with a
            one-row DataFrame whose columns are ``feature_columns``.
        feature_columns (list): Feature column names in the order the model
            expects.

    Returns:
        The object returned by ``rf_model.predict`` for the single row. It is
        the encoded class, not a decoded label string; this notebook maps
        'phishing' to 1 and 'legitimate' to 0 before training.
    """
    # Extract features from the URL
    url_features = extract_features(url)

    # One-row frame. reindex puts the columns in model order, adds any feature
    # the extractor did not produce as NaN, and drops anything unexpected.
    url_features_df = pd.DataFrame([url_features]).reindex(columns=list(feature_columns))

    # Coerce to float. Values that are not numeric (for example a raw WHOIS
    # domain string) become NaN instead of making the cast raise.
    url_features_df = url_features_df.apply(pd.to_numeric, errors='coerce').astype(float)

    # Make a prediction
    prediction = rf_model.predict(url_features_df)

    return prediction

# Example usage (assuming rf_model and feature_columns are predefined):
# url = "http://www.example.com"
# status = predict_url_status(url, rf_model, feature_columns)
# print(status)


In [31]:
url = "https://libgenesis.net/"
# Take the column names (and their order) straight from the training matrix
# instead of a hand-copied list, so the two cannot drift apart.
feature_columns = list(X.columns)
predicted_status = predict_url_status(url, rf_model,feature_columns)
# predicted_status is the array returned by the model, hence the '[1]' form.
print(f"The URL '{url}' is predicted to be '{predicted_status}'.")

The URL 'https://libgenesis.net/' is predicted to be '[1]'.


In [32]:
import joblib

# Assuming your model is named `rf_model`
# Persist the fitted classifier to a file relative to the working directory.
# joblib.dump returns the list of file names it wrote, which is what the cell
# displays as its output.
model_save_path = 'phishing_model.pkl'
joblib.dump(rf_model, model_save_path)


['phishing_model.pkl']