# **PHISHING SITE URL DETECTION FINAL RESULT**

**Mounting Drive**

In [None]:
# Colab-only helper module that exposes the user's Google Drive to the runtime.
from google.colab import drive
# Mount Drive at /content/drive; the model pickle loaded later in this notebook
# is read from a path underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Importing Libraries**

In [None]:
import pickle
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sklearn
import numpy as np
import urllib.parse
import re
import string

**Loading models from Pickle File**

In [None]:
# Location of the pickled bundle that holds the trained models and preprocessors.
MODEL_BUNDLE_PATH = '/content/drive/MyDrive/Colab Notebooks/PHYSHING SITE PROJECT/FINALL_PHISHING.pkl'

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load a bundle you created yourself or obtained from a source you trust.
with open(MODEL_BUNDLE_PATH, 'rb') as f:
    loaded_objects = pickle.load(f)

# Fail early, with a message naming the bundle, if an expected entry is absent.
_missing_keys = [
    key
    for key in ('RF_model_num', 'LR_model_text', 'vectorizer', 'scaler')
    if key not in loaded_objects
]
if _missing_keys:
    raise KeyError(f"Model bundle {MODEL_BUNDLE_PATH!r} is missing entries: {_missing_keys}")

RF_model_num = loaded_objects['RF_model_num']    # classifier fed the scaled numerical URL features
LR_model_text = loaded_objects['LR_model_text']  # classifier fed the vectorized URL text
vectorizer = loaded_objects['vectorizer']        # turns the cleaned URL text into a feature matrix
scaler = loaded_objects['scaler']                # scales the numerical feature frame

**Feature Extraction and Preprocessing of given input**

In [None]:
def extract_url_features(url):
    """Compute the numerical, string-derived features of a URL.

    The URL is only parsed, never fetched. URLs without a network location
    (for example ``'example.com/path'`` with no scheme) and the empty string
    are handled without raising: host-based features are then 0 and, for an
    empty URL, all ratios are 0.0.

    Parameters
    ----------
    url : str
        The URL to analyse.

    Returns
    -------
    dict
        Feature name -> value, in a fixed key order. The key spellings
        (e.g. ``'NoOfDegitsInURL'``, ``'SpacialCharRatioInURL'``) are kept
        exactly as-is; presumably they match the columns the scaler was
        fitted on -- confirm against the training notebook before renaming.
    """
    parsed_url = urllib.parse.urlparse(url)

    # Length-based features. ``netloc`` is used verbatim, so it includes any
    # port or user-info present in the URL.
    url_length = len(url)
    domain = parsed_url.netloc
    domain_length = len(domain)
    tld_length = len(domain.split('.')[-1])

    # ``hostname`` is None when the URL has no network location; treat that
    # as "no host labels" instead of crashing on ``None.split``.
    hostname = parsed_url.hostname
    subdomains = hostname.split('.') if hostname else []

    # Ignore a leading "www" label.
    if subdomains and subdomains[0] == 'www':
        subdomains.pop(0)

    # Drop the trailing TLD label when more than one label remains.
    if len(subdomains) > 1:
        subdomains.pop(-1)

    no_of_subdomains = len(subdomains)

    # Character-class counts over the whole URL.
    letters_in_url = sum(c.isalpha() for c in url)
    digits_in_url = sum(c.isdigit() for c in url)

    equals_count = url.count('=')
    qmark_count = url.count('?')
    ampersand_count = url.count('&')
    # Everything that is not alphanumeric and not one of ? & = . /
    other_special_chars_count = len(re.findall(r'[^a-zA-Z0-9?&=./]', url))

    # Ratios relative to the URL length; an empty URL yields 0.0 rather than
    # a ZeroDivisionError.
    if url_length:
        letter_ratio = letters_in_url / url_length
        digit_ratio = digits_in_url / url_length
        special_char_ratio = other_special_chars_count / url_length
    else:
        letter_ratio = digit_ratio = special_char_ratio = 0.0

    is_https = 1 if parsed_url.scheme == 'https' else 0

    # Key order is preserved because the caller builds a DataFrame from it.
    return {
        'URLLength': url_length,
        'DomainLength': domain_length,
        'TLDLength': tld_length,
        'NoOfSubDomain': no_of_subdomains,
        'NoOfLettersInURL': letters_in_url,
        'LetterRatioInURL': letter_ratio,
        'NoOfDegitsInURL': digits_in_url,
        'DegitRatioInURL': digit_ratio,
        'NoOfEqualsInURL': equals_count,
        'NoOfQMarkInURL': qmark_count,
        'NoOfAmpersandInURL': ampersand_count,
        'NoOfOtherSpecialCharsInURL': other_special_chars_count,
        'SpacialCharRatioInURL': special_char_ratio,
        'IsHTTPS': is_https
    }

def split_by_punc(url):
    """Return ``url`` with each punctuation character set off as its own token.

    The string is cut at every ``string.punctuation`` character and the
    resulting pieces (punctuation marks included) are joined with single
    spaces.
    """
    # The capturing group makes the split keep each punctuation mark in the
    # result list instead of discarding it.
    punct_splitter = re.compile(r'([{}])'.format(re.escape(string.punctuation)))
    return ' '.join(punct_splitter.split(url))

def remove_punctuations(url):
    """Return ``url`` with every ``string.punctuation`` character deleted."""
    # Translation table whose third argument lists the characters to drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    return url.translate(deletion_table)

**Function For Prediction**

In [None]:
def prediction_and_probability(url, timeout=10):
    """Estimate the class-1 probability for a URL by averaging two models.

    The page at ``url`` is downloaded to count its ``<a href=...>`` tags;
    that count is combined with the string-derived features and passed
    through ``scaler`` and ``RF_model_num``. Separately, the URL text is
    tokenised, vectorised with ``vectorizer`` and passed to
    ``LR_model_text``. The two ``predict_proba`` values for class index 1
    are averaged.

    Parameters
    ----------
    url : str
        URL to evaluate. It is fetched over the network.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout, ``requests.get`` can block indefinitely on an unresponsive
        host.

    Returns
    -------
    float or None
        Mean of the two class-1 probabilities, or ``None`` if any step
        failed (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        # NOTE(review): this counts every <a> tag that has an href, including
        # same-site links -- confirm this matches how 'NoOfExternalRef' was
        # computed for the training data.
        external_refs = soup.find_all('a', href=True)
        num_external_refs = len(external_refs)

        # Numerical features: URL-string features plus the link count,
        # appended as the last column before scaling.
        numerical_features = extract_url_features(url)
        numerical_features_df = pd.DataFrame([numerical_features])
        numerical_features_df['NoOfExternalRef'] = num_external_refs
        numerical_features_scaled = scaler.transform(numerical_features_df)

        # Text features: split on punctuation, drop the punctuation,
        # lowercase, and remove every "www" occurrence before vectorising.
        clean_url = split_by_punc(url)
        clean_url = remove_punctuations(clean_url).lower()
        clean_url = clean_url.replace('www', '')
        text_features = vectorizer.transform([clean_url])

        # Column 1 of predict_proba is the probability of class index 1.
        proba_num = RF_model_num.predict_proba(numerical_features_scaled)[0, 1]
        proba_text = LR_model_text.predict_proba(text_features)[0, 1]

        # Simple unweighted average of the two model outputs.
        avg_proba = (proba_num + proba_text) / 2
        return avg_proba
    except Exception as e:
        # Deliberately best-effort: report the failure and signal it to the
        # caller with None instead of raising.
        print(f"Error predicting probability: {e}")
        return None

# **PHISHING SITE URL PREDICTION**

In [None]:
# Minimum averaged probability for the URL to be reported as GOOD.
GOOD_PROBABILITY_THRESHOLD = 0.4

print("\n\n---------------------------------------- PHISHING SITE DETECION ----------------------------------------")
print()

# Input field for entering the URL; surrounding whitespace is never part of a URL.
url = input('Enter the URL of the website:').strip()

print()

print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n\n")

probability = prediction_and_probability(url)

# prediction_and_probability returns None when the page could not be fetched
# or a model step failed; comparing None with a float would raise TypeError.
if probability is None:
    print("The URL could not be analysed, so no prediction is available.")
elif probability >= GOOD_PROBABILITY_THRESHOLD:
    print(f"The URL is predicted to be GOOD \n(probability: {probability:.2f})")
else:
    print(f"The URL is predicted to be PHISHING \n(probability: {probability:.2f})")

print("\n\n\n")



---------------------------------------- PHISHING SITE DETECION ----------------------------------------

Enter the URL of the website:https://meet.google.com

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx


The URL is predicted to be GOOD 
(probability: 0.48)




