In [18]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize tldextract for domain extraction
extractor = tldextract.TLDExtract(cache_dir=False)


def analyze_url(url):
    features = {}
    
    # Parse URL components
    extracted = extractor(url)
    protocol = urlparse(url).scheme  # Get the protocol (http or https)
    print(protocol)
    full_domain = protocol + ('.' + extracted.subdomain if extracted.subdomain else '') + '.' + extracted.domain
    
    # 1. Domain Length (excluding protocol)
    features['DomainLength'] = len(('.' + extracted.subdomain if extracted.subdomain else '') + '.' + extracted.domain)
    
    # 2. Is Domain an IP Address?
    features['IsDomainIP'] = 1 if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', extracted.domain) else 0
    
    # 3. URL Similarity Index (Cosine similarity with risky keywords)
    features['URLSimilarityIndex'] = calculate_similarity_index(url)
    
    # 4. Char Continuation Rate (rate of repeated characters in URL)
    features['CharContinuationRate'] = calculate_continuation_rate(url)
    
    # 5. TLD Legitimate Probability (this could be complex, so it's a placeholder)
    features['TLDLegitimateProb'] = 0.9  # Placeholder example value
    
    # 6. URL Character Probability (basic probability check using character frequencies)
    features['URLCharProb'] = calculate_char_probability(url)
    
    # 7. TLD Length (length of the TLD part)
    features['TLDLength'] = len(extracted.suffix)
    
    # 8. Number of Subdomains
    features['NoOfSubDomain'] = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
    
    # 9. Obfuscation: Check if URL has obfuscation patterns (%20, _, -, etc.)
    features['HasObfuscation'] = 1 if has_obfuscation(url) else 0
    features['NoOfObfuscatedChar'] = len(re.findall(r'%|\.|_|-|@|%20', url))
    features['ObfuscationRatio'] = features['NoOfObfuscatedChar'] / len(url) if len(url) > 0 else 0
    
    # 10. Letters in URL (only letters)
    features['NoOfLettersInURL'] = sum(c.isalpha() for c in url)
    features['LetterRatioInURL'] = features['NoOfLettersInURL'] / len(url) if len(url) > 0 else 0
    
    # 11. Digits in URL (only digits)
    features['NoOfDegitsInURL'] = sum(c.isdigit() for c in url)
    features['DegitRatioInURL'] = features['NoOfDegitsInURL'] / len(url) if len(url) > 0 else 0
    
    # 12. Count specific symbols in URL
    features['NoOfEqualsInURL'] = url.count('=')
    features['NoOfQMarkInURL'] = url.count('?')
    features['NoOfAmpersandInURL'] = url.count('&')
    features['NoOfOtherSpecialCharsInURL'] = len(re.findall(r'[^a-zA-Z0-9:/?&=]', url))
    features['SpacialCharRatioInURL'] = features['NoOfOtherSpecialCharsInURL'] / len(url) if len(url) > 0 else 0
    
    # 13. HTTPS check
    features['IsHTTPS'] = 1 if url.startswith('https://') else 0
    
    # Webpage-specific features (scraping)
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        features['LineOfCode'] = len(soup.prettify().splitlines())
        features['LargestLineLength'] = max(len(line) for line in soup.prettify().splitlines())
        
        title_tag = soup.find('title')
        features['HasTitle'] = 1 if title_tag else 0
        features['DomainTitleMatchScore'] = calculate_domain_title_match_score(title_tag.text if title_tag else '', extracted.domain)
        url_path = urlparse(url).path
        features['URLTitleMatchScore'] = calculate_url_title_match_score(title_tag.text if title_tag else '', url_path)
        
        features['HasFavicon'] = 1 if soup.find('link', rel='icon') else 0
        features['Robots'] = 1 if requests.get(url + '/robots.txt').status_code == 200 else 0
        features['IsResponsive'] = 1 if soup.find('meta', attrs={'name': 'viewport'}) else 0
        features['NoOfURLRedirect'] = len(response.history)
        features['NoOfSelfRedirect'] = len(soup.find_all('a', href=lambda x: x and x.startswith(url)))
        features['HasDescription'] = 1 if soup.find('meta', attrs={'name': 'description'}) else 0
        features['NoOfPopup'] = count_popups(soup)
        features['NoOfiFrame'] = len(soup.find_all('iframe'))
        features['HasExternalFormSubmit'] = 1 if soup.find('form', action=lambda x: x and not x.startswith('/')) else 0
        features['HasSocialNet'] = 1 if soup.find('a', href=lambda x: 'facebook.com' in x or 'twitter.com' in x) else 0
        features['HasSubmitButton'] = 1 if soup.find('input', type='submit') else 0
        features['HasHiddenFields'] = 1 if soup.find('input', type='hidden') else 0
        features['HasPasswordField'] = 1 if soup.find('input', type='password') else 0
        features['Bank'] = 1 if 'bank' in url.lower() else 0
        features['Pay'] = 1 if 'pay' in url.lower() else 0
        features['Crypto'] = 1 if 'crypto' in url.lower() else 0
        features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0
        features['NoOfImage'] = len(soup.find_all('img'))
        features['NoOfCSS'] = len(soup.find_all('link', rel='stylesheet'))
        features['NoOfJS'] = len(soup.find_all('script'))
        features['NoOfSelfRef'] = len(soup.find_all('a', href=lambda x: x and x.startswith(url)))
        features['NoOfEmptyRef'] = len(soup.find_all('a', href=''))
        features['NoOfExternalRef'] = len(soup.find_all('a', href=lambda x: x and not x.startswith(url) and not x.startswith('/')))
        
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    
    return pd.DataFrame([features])

# Helper functions
def calculate_continuation_rate(url):
    continuation_count = sum(url[i] == url[i + 1] for i in range(len(url) - 1))
    return continuation_count / (len(url) - 1) if len(url) > 1 else 0

def calculate_similarity_index(url, keywords=["bank", "login", "secure", "account"]):
    vectorizer = TfidfVectorizer().fit_transform([url] + keywords)
    vectors = vectorizer.toarray()
    similarity_matrix = cosine_similarity(vectors)
    return max(similarity_matrix[0][1:])

def calculate_char_probability(url):
    char_counts = {char: url.count(char) for char in set(url)}
    total_chars = len(url)
    probabilities = {char: count / total_chars for char, count in char_counts.items()}
    return sum(probabilities.values()) / len(probabilities) if probabilities else 0

def has_obfuscation(url):
    return bool(re.search(r'%[0-9A-F]{2}|_|-|@|%20', url))

def calculate_domain_title_match_score(title, domain):
    title_words = set(title.lower().split())
    domain_words = set(domain.lower().split('.'))
    return len(title_words & domain_words) / len(title_words) if title_words else 0

def calculate_url_title_match_score(title, path):
    path_words = set(path.lower().split('/'))
    title_words = set(title.lower().split())
    return len(path_words & title_words) / len(path_words) if path_words else 0

def count_popups(soup):
    # Custom function to count possible popups
    return len(soup.find_all('script', src=lambda x: x and 'popup' in x))

"""# Test with URL
url1 = str(input("Enter a URL to analyze: "))"""

features_df = analyze_url('https://www.google.com')
features_df


https


  features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0


Unnamed: 0,DomainLength,IsDomainIP,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,...,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef
0,11,0,0.0,0.238095,0.9,0.071429,3,1,0,2,...,0,0,0,0,1,0,8,11,0,10


In [3]:
import joblib


In [4]:
outlier_removal_params = joblib.load(r"C:/Users/Sejal Hanmante/OneDrive/Documents/GitHub/Phishing-url-detection/Clustering/data_outliers.joblib")
DL = outlier_removal_params['outlier_info']
outlier_df = pd.DataFrame.from_dict(DL, orient='index')


# 3 Loading the encoder 
#one_hot_encoder = joblib.load(r'C:/Users/Sejal Hanmante/OneDrive/Documents/GitHub/Phishing-url-detection/Clustering/encoder.joblib')

# 4 Loading the Scaler joblib file
scaler = joblib.load(r'C:/Users/Sejal Hanmante/OneDrive/Documents/GitHub/Phishing-url-detection/Clustering/scaler.joblib')

# 5 loading the pca transformer 
pca = joblib.load(r'C:/Users/Sejal Hanmante/OneDrive/Documents/GitHub/Phishing-url-detection/Clustering/pca.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
agg1 = joblib.load('C:/Users/Sejal Hanmante/OneDrive/Documents/GitHub/Phishing-url-detection/Clustering/fcm_model.joblib')
agg1
import numpy as np

In [7]:
datatypes_df = joblib.load('C:/Users/Sejal Hanmante/OneDrive/Documents/GitHub/Phishing-url-detection/Clustering/data_datatypes.joblib')
scaler 

In [8]:
fcm = joblib.load('C:/Users/Sejal Hanmante/OneDrive/Documents/GitHub/Phishing-url-detection/Clustering/fcm_model.joblib')
fcm

{'centroids': array([[-1.88922731e+00,  2.95877181e-01,  1.84454334e-01,
         -2.73474384e-02, -2.03109839e-01, -5.46176474e-03,
         -2.35085866e-02, -1.81515053e-02,  6.22405229e-02,
          4.94318551e-02, -1.31540559e-02, -2.94482407e-02,
         -1.05884506e-02, -4.52373327e-03,  4.44485476e-02,
         -4.41707565e-02,  5.86579259e-02,  1.44402534e-02,
          4.62873981e-02, -1.43745926e-02, -2.86353990e-02,
         -1.30159060e-02, -3.08523436e-02,  7.33719533e-03,
          1.52042366e-02, -1.51482826e-02, -2.31405568e-02,
         -3.84532376e-02,  2.55524218e-03,  2.98594072e-02,
         -1.07387859e-02, -4.06579236e-03,  1.31860060e-02,
          4.53010141e-03,  4.79213744e-03, -1.62589103e-02,
         -1.04830181e-02,  2.42189477e-04, -2.48115015e-03,
         -1.35527181e-02,  1.51919827e-03, -9.32956574e-04],
        [ 2.23164619e+00, -4.05111197e-01, -1.86717996e-01,
          8.14482558e-02,  1.92848481e-01,  2.19262857e-02,
          7.35179828e-02, 

In [9]:
centroids = fcm['centroids']
m_value = fcm['m_value']
n_clusters = fcm['n_clusters']
error_tolerance = fcm['error_tolerance']
iterations = fcm['max_iterations']
import skfuzzy as fuzz

In [15]:
"""for i in outlier_df.index:
    q1 = outlier_df.loc[i,'Q1']
    print(i,q1)
    features_df[i] = features_df[i].apply(lambda x : if x <q1)
    features_df[i] = np.where(features_df[i]>ub , ub , np.where(features_df[i]<lb,lb,features_df[i]))"""

"for i in outlier_df.index:\n    q1 = outlier_df.loc[i,'Q1']\n    print(i,q1)\n    features_df[i] = features_df[i].apply(lambda x : if x <q1)\n    features_df[i] = np.where(features_df[i]>ub , ub , np.where(features_df[i]<lb,lb,features_df[i]))"

In [28]:
def predict(url):
    features_df = analyze_url(url)

    for feature in outlier_df.index:
        lb = outlier_df.loc[feature, 'LB']
        ub = outlier_df.loc[feature, 'UB']
    
    # Apply the logic to remove outliers for each feature
        features_df[feature] = np.where(features_df[feature]>ub , ub , np.where(features_df[feature]<lb,lb,features_df[feature]))
    
    # Converting columns to datatypes as present in original dataset
    for i in features_df.columns :
        j=features_df[i].dtype
        if j != datatypes_df[i]:
        #print(i,j)
            features_df[i] = features_df[i].astype(datatypes_df[i])

    required_features = scaler.feature_names_in_
    features_df = features_df.reindex(columns=required_features, fill_value=0)
    
    # Scaling the features
    scaled_feats = scaler.transform(features_df)

    # PCA conversion 
    pca_df = pca.transform(scaled_feats).reshape(-1,1)

    

    # Cluster assignment 
    u, d, jm, p, fpc ,cntr= fuzz.cluster.cmeans_predict(
        pca_df, centroids, m_value, error=error_tolerance, maxiter=iterations,init=None)
    
    # Now, 'u' contains the membership matrix
    print("Membership matrix for the new data point:", u)

    # Assign cluster with the highest membership
    predicted_labels = np.argmax(u, axis=0)
   
    
    return predicted_labels
    
    

   


In [37]:
predict('http://www.kuradox92.lima-city.de')

http
Membership matrix for the new data point: [[0.48728199]
 [0.51271801]]


array([1], dtype=int64)

In [32]:
required_features = scaler.feature_names_in_
features_df = features_df.reindex(columns=required_features, fill_value=0)

In [None]:
import joblib 

In [40]:
k = ['DomainLength', 'IsDomainIP', 'URLSimilarityIndex',
       'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength',
       'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar',
       'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL',
       'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL',
       'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon',
       'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect',
       'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit',
       'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields',
       'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo',
       'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef',
       'NoOfExternalRef']

for i in features_df.columns:
    if i not in k:
        print(i)
    


In [None]:
datatypes_df

DomainLength                  float64
IsDomainIP                      int64
URLSimilarityIndex            float64
CharContinuationRate          float64
TLDLegitimateProb             float64
URLCharProb                   float64
TLDLength                     float64
NoOfSubDomain                 float64
HasObfuscation                  int64
NoOfObfuscatedChar            float64
ObfuscationRatio              float64
NoOfLettersInURL                int64
LetterRatioInURL              float64
NoOfDegitsInURL                 int64
DegitRatioInURL               float64
NoOfEqualsInURL                 int64
NoOfQMarkInURL                  int64
NoOfAmpersandInURL              int64
NoOfOtherSpecialCharsInURL      int64
SpacialCharRatioInURL         float64
IsHTTPS                         int64
LineOfCode                      int64
LargestLineLength               int64
HasTitle                        int64
DomainTitleMatchScore         float64
URLTitleMatchScore            float64
HasFavicon  

In [65]:
for i in features_df.columns :

    j=features_df[i].dtype
    if j != datatypes_df[i]:
        print(i,j)
        features_df[i] = features_df[i].astype(datatypes_df[i])



DomainLength int64
TLDLength int64
NoOfSubDomain int64
NoOfObfuscatedChar int64


In [64]:
features_df['DomainLength'].dtype

dtype('int64')

In [61]:
for i in datatypes_df:
    print(i)

float64
int64
float64
float64
float64
float64
float64
float64
int64
float64
float64
int64
float64
int64
float64
int64
int64
int64
int64
float64
int64
int64
int64
int64
float64
float64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64


In [None]:
check outputs 
