In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract
import re
import pandas as pd 


# Use a custom cache directory
extractor = tldextract.TLDExtract(cache_dir='/path/to/writable/cache_dir')
extractor = tldextract.TLDExtract(cache_dir=False)

def analyze_url(url):
    features = {}
    
   
    
  
    # 3. URLLength
    features['URLLength'] = len(url)
    

    
    # 5. DomainLength
    #features['DomainLength'] = len(features['Domain'])
    protocol = urlparse(url).scheme  # Get the protocol (http or https)
    full_domain = protocol + ('.' + extracted.subdomain if extracted.subdomain else '') + '.' + features['Domain']
    features['DomainLength'] = sum(c.isalpha() for c in full_domain)

    # 6. IsDomainIP
    features['IsDomainIP'] = 1 if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', extracted.domain) else 0
    
    # 7. TLD
    features['TLD'] = extracted.suffix
    
    # 8. URLSimilarityIndex (Placeholder for now)
    features['URLSimilarityIndex'] = 0  # Implement actual logic for similarity score
    
    # 9. CharContinuationRate
    features['CharContinuationRate'] = calculate_continuation_rate(url)
    
    # 10. TLDLegitimateProb (Placeholder for now)
    features['TLDLegitimateProb'] = 0.9  # Example value, implement actual logic
    
    # 11. URLCharProb (Placeholder for now)
    features['URLCharProb'] = 0.9  # Example value, implement actual logic
    
    # 12. TLDLength
    features['TLDLength'] = len(features['TLD'])
    
    # 13. NoOfSubDomain
    features['NoOfSubDomain'] = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
    
    # 14. HasObfuscation and 15. NoOfObfuscatedChar
    obfuscation_chars = re.findall(r'%|\.|_|-|@|%20', url)
    features['HasObfuscation'] = 1 if obfuscation_chars else 0
    features['NoOfObfuscatedChar'] = len(''.join(obfuscation_chars))
    
    # 16. ObfuscationRatio
    features['ObfuscationRatio'] = features['NoOfObfuscatedChar'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # 17. NoOfLettersInURL
    features['NoOfLettersInURL'] = sum(c.isalpha() for c in url)
    
    # 18. LetterRatioInURL
    features['LetterRatioInURL'] = features['NoOfLettersInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # 19. NoOfDigitsInURL
    features['NoOfDigitsInURL'] = sum(c.isdigit() for c in url)
    
    # 20. DigitRatioInURL
    features['DigitRatioInURL'] = features['NoOfDigitsInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # 21. NoOfEqualsInURL
    features['NoOfEqualsInURL'] = url.count('=')
    
    # 22. NoOfQMarkInURL
    features['NoOfQMarkInURL'] = url.count('?')
    
    # 23. NoOfAmpersandInURL
    features['NoOfAmpersandInURL'] = url.count('&')
    
    # 24. NoOfOtherSpecialCharsInURL
    features['NoOfOtherSpecialCharsInURL'] = len(re.findall(r'[^a-zA-Z0-9:/?&=]', url))
    
    # 25. SpecialCharRatioInURL
    features['SpecialCharRatioInURL'] = features['NoOfOtherSpecialCharsInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # 26. IsHTTPS
    features['IsHTTPS'] = 1 if url.startswith('https://') else 0
    
    # 27. LineOfCode, 28. LargestLineLength, and other website features
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # 27. LineOfCode (Approximation)
        features['LineOfCode'] = len(soup.prettify().splitlines())
        
        # 28. LargestLineLength
        features['LargestLineLength'] = max(len(line) for line in soup.prettify().splitlines())
        
        # 29. HasTitle
        title_tag = soup.find('title')
        features['HasTitle'] = 1 if title_tag else 0
        
        # 30. Title
        features['Title'] = title_tag.text if title_tag else ''
        
        # 31. DomainTitleMatchScore (Placeholder)
        features['DomainTitleMatchScore'] = 0  # Implement actual logic
        
        # 32. URLTitleMatchScore (Placeholder)
        features['URLTitleMatchScore'] = 0  # Implement actual logic
        
        # 33. HasFavicon
        features['HasFavicon'] = 1 if soup.find('link', rel='icon') else 0
        
        # 34. Robots
        features['Robots'] = 1 if requests.get(url + '/robots.txt').status_code == 200 else 0
        
        # 35. IsResponsive (Simple check)
        features['IsResponsive'] = 1 if soup.find('meta', attrs={'name': 'viewport'}) else 0
        
        # 36. NoOfURLRedirect
        features['NoOfURLRedirect'] = len(response.history)
        
        # 37. NoOfSelfRedirect (Not easy to implement without actual logging)
        features['NoOfSelfRedirect'] = 0  # Implement actual logic
        
        # 38. HasDescription
        features['HasDescription'] = 1 if soup.find('meta', attrs={'name': 'description'}) else 0
        
        # 39. NoOfPopup and 40. NoOfiFrame (Approximation)
        features['NoOfPopup'] = 0  # Implement actual logic
        features['NoOfiFrame'] = len(soup.find_all('iframe'))
        
        # 41. HasExternalFormSubmit
        features['HasExternalFormSubmit'] = 1 if soup.find('form', action=lambda x: x and not x.startswith('/')) else 0
        
        # 42. HasSocialNet
        features['HasSocialNet'] = 1 if soup.find('a', href=lambda x: 'facebook.com' in x or 'twitter.com' in x) else 0
        
        # 43. HasSubmitButton
        features['HasSubmitButton'] = 1 if soup.find('input', type='submit') else 0
        
        # 44. HasHiddenFields
        features['HasHiddenFields'] = 1 if soup.find('input', type='hidden') else 0
        
        # 45. HasPasswordField
        features['HasPasswordField'] = 1 if soup.find('input', type='password') else 0
        
        # 46. Bank, 47. Pay, 48. Crypto (Placeholder)
        features['Bank'] = 1 if 'bank' in url.lower() else 0
        features['Pay'] = 1 if 'pay' in url.lower() else 0
        features['Crypto'] = 1 if 'crypto' in url.lower() else 0
        
        # 49. HasCopyrightInfo
        features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0
        
        # 50. NoOfImage
        features['NoOfImage'] = len(soup.find_all('img'))
        
        # 51. NoOfCSS
        features['NoOfCSS'] = len(soup.find_all('link', rel='stylesheet'))
        
        # 52. NoOfJS
        features['NoOfJS'] = len(soup.find_all('script'))
        
        # 53. NoOfSelfRef (Approximation)
        features['NoOfSelfRef'] = len(soup.find_all('a', href=lambda x: x and x.startswith(url)))
        
        # 54. NoOfEmptyRef
        features['NoOfEmptyRef'] = len(soup.find_all('a', href=''))
        
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    
    df = pd.DataFrame([features])  # Creating DataFrame from features dictionary
    return df

def calculate_continuation_rate(url):
    """
    Calculate the rate of character continuation in the URL.
    This is a simplified version and can be improved.
    """
    continuation_count = sum(url[i] == url[i + 1] for i in range(len(url) - 1))
    return continuation_count / (len(url) - 1) if len(url) > 1 else 0

def clean_url(url):
    # Remove any unwanted characters like tab
    return url.strip()



# Example usage
url1 = input("Enter a URL to analyze: ")
url = clean_url(url1)
features_df = analyze_url(url)
features_df



Error fetching https://www.southbankmosaics.com: argument of type 'NoneType' is not iterable


Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,URLTitleMatchScore,HasFavicon,Robots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,NoOfiFrame,HasExternalFormSubmit
0,www.southbankmosaics.com,https://www.southbankmosaics.com,32,southbankmosaics,24,0,com,0,0.129032,0.9,...,0,0,1,1,0,0,0,0,0,1


In [2]:
features_df.values

array([['www.southbankmosaics.com', 'https://www.southbankmosaics.com',
        32, 'southbankmosaics', 24, 0, 'com', 0, 0.12903225806451613,
        0.9, 0.9, 3, 1, 1, 2, 0.0625, 27, 0.84375, 0, 0.0, 0, 0, 0, 2,
        0.0625, 1, 1282, 9467, 1,
        'ข่าวสด ข่าววันนี้ ข่าวกีฬา ข่าวบันเทิง อัพเดทสดใหม่ทุกวัน – ข่าวสด ข่าวกีฬา ข่าวบันเทิง ข่าววันนี้ อัปเดตข่าวสารรวดเร็วทันใจ พร้อมรับชมสาระน่ารู้ต่างๆ ได้ฟรีตลอด 24ชั่วโมง',
        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1]], dtype=object)

In [7]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract
import re
import pandas as pd
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Use a custom cache directory for TLD extraction
extractor = tldextract.TLDExtract(cache_dir='/path/to/writable/cache_dir')
extractor = tldextract.TLDExtract(cache_dir=False)

# Example legitimate TLDs with associated probabilities
tld_legitimacy = {
    'com': 0.95, 'org': 0.9, 'net': 0.85, 'edu': 0.9, 'gov': 0.98,
    'xyz': 0.5, 'info': 0.7, 'biz': 0.6  # Extend this dictionary as needed
}

def analyze_url(url):
    features = {}

    # Extract domain information
    extracted = extractor(url)
    features['Domain'] = extracted.domain
    features['TLD'] = extracted.suffix

    # URL Length
    features['URLLength'] = len(url)

    # Domain Length (sum of alphabetic characters in domain)
    protocol = urlparse(url).scheme
    full_domain = protocol + ('.' + extracted.subdomain if extracted.subdomain else '') + '.' + features['Domain']
    features['DomainLength'] = sum(c.isalpha() for c in full_domain)

    # Check if Domain is an IP address
    features['IsDomainIP'] = 1 if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', extracted.domain) else 0

    # URL Similarity Index using TF-IDF and cosine similarity
    known_legit_urls = ["https://www.google.com", "https://www.facebook.com"]
    url_similarity_scores = []
    vectorizer = TfidfVectorizer().fit_transform([url] + known_legit_urls)
    similarity_matrix = cosine_similarity(vectorizer[0:1], vectorizer)
    features['URLSimilarityIndex'] = np.mean(similarity_matrix[0][1:])

    # Char Continuation Rate
    features['CharContinuationRate'] = calculate_continuation_rate(url)

    # TLD Legitimacy Probability
    features['TLDLegitimateProb'] = tld_legitimacy.get(features['TLD'], 0.5)

    # URL Character Probability (based on ratio of alphanumeric to special characters)
    special_characters = sum(1 for char in url if not char.isalnum())
    features['URLCharProb'] = 1 - (special_characters / len(url)) if len(url) > 0 else 0

    # TLD Length
    features['TLDLength'] = len(features['TLD'])

    # Number of Subdomains
    features['NoOfSubDomain'] = len(extracted.subdomain.split('.')) if extracted.subdomain else 0

    # Obfuscation Checks
    obfuscation_chars = re.findall(r'%|\.|_|-|@|%20', url)
    features['HasObfuscation'] = 1 if obfuscation_chars else 0
    features['NoOfObfuscatedChar'] = len(''.join(obfuscation_chars))
    features['ObfuscationRatio'] = features['NoOfObfuscatedChar'] / features['URLLength'] if features['URLLength'] > 0 else 0

    # Additional URL character ratios
    features['NoOfLettersInURL'] = sum(c.isalpha() for c in url)
    features['LetterRatioInURL'] = features['NoOfLettersInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    features['NoOfDigitsInURL'] = sum(c.isdigit() for c in url)
    features['DigitRatioInURL'] = features['NoOfDigitsInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    features['NoOfEqualsInURL'] = url.count('=')
    features['NoOfQMarkInURL'] = url.count('?')
    features['NoOfAmpersandInURL'] = url.count('&')
    features['NoOfOtherSpecialCharsInURL'] = len(re.findall(r'[^a-zA-Z0-9:/?&=]', url))
    features['SpecialCharRatioInURL'] = features['NoOfOtherSpecialCharsInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    features['IsHTTPS'] = 1 if url.startswith('https://') else 0

    # Line of Code, Largest Line Length, and other features based on website content
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Approximate number of lines of code
        features['LineOfCode'] = len(soup.prettify().splitlines())
        
        # Largest Line Length
        features['LargestLineLength'] = max(len(line) for line in soup.prettify().splitlines())
        
        # Title-based features
        title_tag = soup.find('title')
        features['HasTitle'] = 1 if title_tag else 0
        features['Title'] = title_tag.text if title_tag else ''
        
        # Domain and URL Title Match Scores
        features['DomainTitleMatchScore'] = fuzz.ratio(features['Domain'], features['Title']) if features['Title'] else 0
        features['URLTitleMatchScore'] = fuzz.ratio(url, features['Title']) if features['Title'] else 0

        # Additional features based on webpage structure
        features['HasFavicon'] = 1 if soup.find('link', rel='icon') else 0
        features['Robots'] = 1 if requests.get(url + '/robots.txt').status_code == 200 else 0
        features['IsResponsive'] = 1 if soup.find('meta', attrs={'name': 'viewport'}) else 0
        features['NoOfURLRedirect'] = len(response.history)
        features['NoOfSelfRedirect'] = 0  # Self-redirect logic would require advanced logging
        features['HasDescription'] = 1 if soup.find('meta', attrs={'name': 'description'}) else 0
        features['NoOfPopup'] = 0  # Popup detection would require browser-based interaction
        features['NoOfiFrame'] = len(soup.find_all('iframe'))
        features['HasExternalFormSubmit'] = 1 if soup.find('form', action=lambda x: x and not x.startswith('/')) else 0
        features['HasSocialNet'] = 1 if soup.find('a', href=lambda x: 'facebook.com' in x or 'twitter.com' in x) else 0
        features['HasSubmitButton'] = 1 if soup.find('input', type='submit') else 0
        features['HasHiddenFields'] = 1 if soup.find('input', type='hidden') else 0
        features['HasPasswordField'] = 1 if soup.find('input', type='password') else 0
        features['Bank'] = 1 if 'bank' in url.lower() else 0
        features['Pay'] = 1 if 'pay' in url.lower() else 0
        features['Crypto'] = 1 if 'crypto' in url.lower() else 0
        features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0
        features['NoOfImage'] = len(soup.find_all('img'))
        features['NoOfCSS'] = len(soup.find_all('link', rel='stylesheet'))
        features['NoOfJS'] = len(soup.find_all('script'))
        features['NoOfSelfRef'] = len(soup.find_all('a', href=lambda x: x and x.startswith(url)))
        features['NoOfEmptyRef'] = len(soup.find_all('a', href=''))

    except Exception as e:
        print(f"Error fetching {url}: {e}")
    
    # Convert features to DataFrame
    df = pd.DataFrame([features])
    return df

def calculate_continuation_rate(url):
    # Rate of character continuation
    continuation_count = sum(url[i] == url[i + 1] for i in range(len(url) - 1))
    return continuation_count / (len(url) - 1) if len(url) > 1 else 0

# Clean URL input
def clean_url(url):
    return url.strip()

# Example usage
url1 = input("Enter a URL to analyze: ")
url = clean_url(url1)
features_df = analyze_url(url)
print(features_df)




Error fetching https://www.southbankmosaics.com: argument of type 'NoneType' is not iterable
             Domain  TLD  URLLength  DomainLength  IsDomainIP  \
0  southbankmosaics  com         32            24           0   

   URLSimilarityIndex  CharContinuationRate  TLDLegitimateProb  URLCharProb  \
0            0.511357              0.129032               0.95      0.84375   

   TLDLength  ...  URLTitleMatchScore  HasFavicon  Robots  IsResponsive  \
0          3  ...                   0           0       1             1   

   NoOfURLRedirect  NoOfSelfRedirect  HasDescription  NoOfPopup  NoOfiFrame  \
0                0                 0               0          0           0   

   HasExternalFormSubmit  
0                      1  

[1 rows x 39 columns]


In [6]:
pip install fuzzywuzzy

Defaulting to user installation because normal site-packages is not writeable
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\programdata\anaconda3\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Use a custom cache directory for tldextract if needed
extractor = tldextract.TLDExtract(cache_dir=False)

def analyze_url(url):
    features = {}
    
    # Parse URL components
    extracted = extractor(url)
    features['Domain'] = extracted.domain
    
    # URL Length
    features['URLLength'] = len(url)
    
    # Domain Length
    protocol = urlparse(url).scheme  # Get the protocol (http or https)
    full_domain = protocol + ('.' + extracted.subdomain if extracted.subdomain else '') + '.' + features['Domain']
    features['DomainLength'] = sum(c.isalpha() for c in full_domain)

    # IsDomainIP
    features['IsDomainIP'] = 1 if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', extracted.domain) else 0
    
    # TLD
    features['TLD'] = extracted.suffix
    
    # URL Similarity Index
    features['URLSimilarityIndex'] = calculate_similarity_index(url)
    
    # Char Continuation Rate
    features['CharContinuationRate'] = calculate_continuation_rate(url)
    
    # TLD Legitimate Probability (Placeholder)
    features['TLDLegitimateProb'] = 0.9  # Example value
    
    # URL Character Probability (Placeholder)
    features['URLCharProb'] = 0.9  # Example value
    
    # TLD Length
    features['TLDLength'] = len(features['TLD'])
    
    # Number of Subdomains
    features['NoOfSubDomain'] = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
    
    # Obfuscation Characteristics
    obfuscation_chars = re.findall(r'%|\.|_|-|@|%20', url)
    features['HasObfuscation'] = 1 if obfuscation_chars else 0
    features['NoOfObfuscatedChar'] = len(''.join(obfuscation_chars))
    features['ObfuscationRatio'] = features['NoOfObfuscatedChar'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # URL Composition
    features['NoOfLettersInURL'] = sum(c.isalpha() for c in url)
    features['LetterRatioInURL'] = features['NoOfLettersInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    features['NoOfDigitsInURL'] = sum(c.isdigit() for c in url)
    features['DigitRatioInURL'] = features['NoOfDigitsInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    features['NoOfEqualsInURL'] = url.count('=')
    features['NoOfQMarkInURL'] = url.count('?')
    features['NoOfAmpersandInURL'] = url.count('&')
    features['NoOfOtherSpecialCharsInURL'] = len(re.findall(r'[^a-zA-Z0-9:/?&=]', url))
    features['SpecialCharRatioInURL'] = features['NoOfOtherSpecialCharsInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # Protocol Check
    features['IsHTTPS'] = 1 if url.startswith('https://') else 0
    
    # Fetch webpage content
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Approximate line of code
        features['LineOfCode'] = len(soup.prettify().splitlines())
        
        # Largest line length
        features['LargestLineLength'] = max(len(line) for line in soup.prettify().splitlines())
        
        # Title check
        title_tag = soup.find('title')
        features['HasTitle'] = 1 if title_tag else 0
        features['Title'] = title_tag.text if title_tag else ''
        
        # Domain Title Match Score
        features['DomainTitleMatchScore'] = calculate_domain_title_match_score(features['Title'], features['Domain'])
        
        # URL Title Match Score
        url_path = urlparse(url).path
        features['URLTitleMatchScore'] = calculate_url_title_match_score(features['Title'], url_path)
        
        # Other webpage attributes
        features['HasFavicon'] = 1 if soup.find('link', rel='icon') else 0
        features['Robots'] = 1 if requests.get(url + '/robots.txt').status_code == 200 else 0
        features['IsResponsive'] = 1 if soup.find('meta', attrs={'name': 'viewport'}) else 0
        features['NoOfURLRedirect'] = len(response.history)
        
        # Meta tags and features
        features['HasDescription'] = 1 if soup.find('meta', attrs={'name': 'description'}) else 0
        features['NoOfPopup'] = count_popups(soup)
        features['NoOfiFrame'] = len(soup.find_all('iframe'))
        features['HasExternalFormSubmit'] = 1 if soup.find('form', action=lambda x: x and not x.startswith('/')) else 0
        features['HasSocialNet'] = 1 if soup.find('a', href=lambda x: 'facebook.com' in x or 'twitter.com' in x) else 0
        features['HasSubmitButton'] = 1 if soup.find('input', type='submit') else 0
        features['HasHiddenFields'] = 1 if soup.find('input', type='hidden') else 0
        features['HasPasswordField'] = 1 if soup.find('input', type='password') else 0
        features['Bank'] = 1 if 'bank' in url.lower() else 0
        features['Pay'] = 1 if 'pay' in url.lower() else 0
        features['Crypto'] = 1 if 'crypto' in url.lower() else 0
        features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0
        features['NoOfImage'] = len(soup.find_all('img'))
        features['NoOfCSS'] = len(soup.find_all('link', rel='stylesheet'))
        features['NoOfJS'] = len(soup.find_all('script'))
        features['NoOfSelfRef'] = len(soup.find_all('a', href=lambda x: x and x.startswith(url)))
        features['NoOfEmptyRef'] = len(soup.find_all('a', href=''))
        
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    
    return pd.DataFrame([features])

def calculate_continuation_rate(url):
    continuation_count = sum(url[i] == url[i + 1] for i in range(len(url) - 1))
    return continuation_count / (len(url) - 1) if len(url) > 1 else 0

def calculate_similarity_index(url, keywords=["bank", "login", "secure", "account"]):
    vectorizer = TfidfVectorizer().fit_transform([url] + keywords)
    vectors = vectorizer.toarray()
    similarity_matrix = cosine_similarity(vectors)
    return max(similarity_matrix[0][1:])

def calculate_domain_title_match_score(title, domain):
    if not title or not domain:
        return 0
    vectorizer = TfidfVectorizer().fit_transform([title, domain])
    vectors = vectorizer.toarray()
    similarity_matrix = cosine_similarity(vectors)
    return similarity_matrix[0][1]

def calculate_url_title_match_score(title, url_path):
    if not title or not url_path:
        return 0
    vectorizer = TfidfVectorizer().fit_transform([title, url_path])
    vectors = vectorizer.toarray()
    similarity_matrix = cosine_similarity(vectors)
    return similarity_matrix[0][1]

def count_popups(soup):
    popup_classes = ['modal', 'popup', 'overlay']
    popups = [soup.find_all(class_=cls) for cls in popup_classes]
    return sum(len(p) for p in popups)

def clean_url(url):
    return url.strip()

# Example usage
url1 = input("Enter a URL to analyze: ")
url = clean_url(url1)
features_df = analyze_url(url)
print(features_df)


      Domain  URLLength  DomainLength  IsDomainIP TLD  URLSimilarityIndex  \
0  uni-mainz         24            16           0  de                 0.0   

   CharContinuationRate  TLDLegitimateProb  URLCharProb  TLDLength  ...  \
0              0.173913                0.9          0.9          2  ...   

   HasPasswordField  Bank  Pay  Crypto  HasCopyrightInfo  NoOfImage  NoOfCSS  \
0                 0     0    0       0                 0          2       18   

   NoOfJS  NoOfSelfRef  NoOfEmptyRef  
0      76            0             0  

[1 rows x 51 columns]


  features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize tldextract for domain extraction
extractor = tldextract.TLDExtract(cache_dir=False)

def analyze_url(url):
    features = {}
    
    # Parse URL components
    extracted = extractor(url)
    protocol = urlparse(url).scheme  # Get the protocol (http or https)
    full_domain = protocol + ('.' + extracted.subdomain if extracted.subdomain else '') + '.' + extracted.domain
    
    # 1. Domain Length (excluding protocol)
    features['DomainLength'] = len(extracted.domain)
    
    # 2. Is Domain an IP Address?
    features['IsDomainIP'] = 1 if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', extracted.domain) else 0
    
    # 3. URL Similarity Index (Cosine similarity with risky keywords)
    features['URLSimilarityIndex'] = calculate_similarity_index(url)
    
    # 4. Char Continuation Rate (rate of repeated characters in URL)
    features['CharContinuationRate'] = calculate_continuation_rate(url)
    
    # 5. TLD Legitimate Probability (this could be complex, so it's a placeholder)
    features['TLDLegitimateProb'] = 0.9  # Placeholder example value
    
    # 6. URL Character Probability (basic probability check using character frequencies)
    features['URLCharProb'] = calculate_char_probability(url)
    
    # 7. TLD Length (length of the TLD part)
    features['TLDLength'] = len(extracted.suffix)
    
    # 8. Number of Subdomains
    features['NoOfSubDomain'] = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
    
    # 9. Obfuscation: Check if URL has obfuscation patterns (%20, _, -, etc.)
    features['HasObfuscation'] = 1 if has_obfuscation(url) else 0
    features['NoOfObfuscatedChar'] = len(re.findall(r'%|\.|_|-|@|%20', url))
    features['ObfuscationRatio'] = features['NoOfObfuscatedChar'] / len(url) if len(url) > 0 else 0
    
    # 10. Letters in URL (only letters)
    features['NoOfLettersInURL'] = sum(c.isalpha() for c in url)
    features['LetterRatioInURL'] = features['NoOfLettersInURL'] / len(url) if len(url) > 0 else 0
    
    # 11. Digits in URL (only digits)
    features['NoOfDegitsInURL'] = sum(c.isdigit() for c in url)
    features['DegitRatioInURL'] = features['NoOfDegitsInURL'] / len(url) if len(url) > 0 else 0
    
    # 12. Count specific symbols in URL
    features['NoOfEqualsInURL'] = url.count('=')
    features['NoOfQMarkInURL'] = url.count('?')
    features['NoOfAmpersandInURL'] = url.count('&')
    features['NoOfOtherSpecialCharsInURL'] = len(re.findall(r'[^a-zA-Z0-9:/?&=]', url))
    features['SpacialCharRatioInURL'] = features['NoOfOtherSpecialCharsInURL'] / len(url) if len(url) > 0 else 0
    
    # 13. HTTPS check
    features['IsHTTPS'] = 1 if url.startswith('https://') else 0
    
    # Webpage-specific features (scraping)
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        features['LineOfCode'] = len(soup.prettify().splitlines())
        features['LargestLineLength'] = max(len(line) for line in soup.prettify().splitlines())
        
        title_tag = soup.find('title')
        features['HasTitle'] = 1 if title_tag else 0
        features['DomainTitleMatchScore'] = calculate_domain_title_match_score(title_tag.text if title_tag else '', extracted.domain)
        url_path = urlparse(url).path
        features['URLTitleMatchScore'] = calculate_url_title_match_score(title_tag.text if title_tag else '', url_path)
        
        features['HasFavicon'] = 1 if soup.find('link', rel='icon') else 0
        features['Robots'] = 1 if requests.get(url + '/robots.txt').status_code == 200 else 0
        features['IsResponsive'] = 1 if soup.find('meta', attrs={'name': 'viewport'}) else 0
        features['NoOfURLRedirect'] = len(response.history)
        features['NoOfSelfRedirect'] = len(soup.find_all('a', href=lambda x: x and x.startswith(url)))
        features['HasDescription'] = 1 if soup.find('meta', attrs={'name': 'description'}) else 0
        features['NoOfPopup'] = count_popups(soup)
        features['NoOfiFrame'] = len(soup.find_all('iframe'))
        features['HasExternalFormSubmit'] = 1 if soup.find('form', action=lambda x: x and not x.startswith('/')) else 0
        features['HasSocialNet'] = 1 if soup.find('a', href=lambda x: 'facebook.com' in x or 'twitter.com' in x) else 0
        features['HasSubmitButton'] = 1 if soup.find('input', type='submit') else 0
        features['HasHiddenFields'] = 1 if soup.find('input', type='hidden') else 0
        features['HasPasswordField'] = 1 if soup.find('input', type='password') else 0
        features['Bank'] = 1 if 'bank' in url.lower() else 0
        features['Pay'] = 1 if 'pay' in url.lower() else 0
        features['Crypto'] = 1 if 'crypto' in url.lower() else 0
        features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0
        features['NoOfImage'] = len(soup.find_all('img'))
        features['NoOfCSS'] = len(soup.find_all('link', rel='stylesheet'))
        features['NoOfJS'] = len(soup.find_all('script'))
        features['NoOfSelfRef'] = len(soup.find_all('a', href=lambda x: x and x.startswith(url)))
        features['NoOfEmptyRef'] = len(soup.find_all('a', href=''))
        features['NoOfExternalRef'] = len(soup.find_all('a', href=lambda x: x and not x.startswith(url) and not x.startswith('/')))
        
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    
    return pd.DataFrame([features])

# Helper functions
def calculate_continuation_rate(url):
    continuation_count = sum(url[i] == url[i + 1] for i in range(len(url) - 1))
    return continuation_count / (len(url) - 1) if len(url) > 1 else 0

def calculate_similarity_index(url, keywords=["bank", "login", "secure", "account"]):
    vectorizer = TfidfVectorizer().fit_transform([url] + keywords)
    vectors = vectorizer.toarray()
    similarity_matrix = cosine_similarity(vectors)
    return max(similarity_matrix[0][1:])

def calculate_char_probability(url):
    char_counts = {char: url.count(char) for char in set(url)}
    total_chars = len(url)
    probabilities = {char: count / total_chars for char, count in char_counts.items()}
    return sum(probabilities.values()) / len(probabilities) if probabilities else 0

def has_obfuscation(url):
    return bool(re.search(r'%[0-9A-F]{2}|_|-|@|%20', url))

def calculate_domain_title_match_score(title, domain):
    title_words = set(title.lower().split())
    domain_words = set(domain.lower().split('.'))
    return len(title_words & domain_words) / len(title_words) if title_words else 0

def calculate_url_title_match_score(title, path):
    path_words = set(path.lower().split('/'))
    title_words = set(title.lower().split())
    return len(path_words & title_words) / len(path_words) if path_words else 0

def count_popups(soup):
    # Custom function to count possible popups
    return len(soup.find_all('script', src=lambda x: x and 'popup' in x))

# Test with URL
url1 = input("Enter a URL to analyze: ")

features_df = analyze_url(url1)
features_df


  features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0


Unnamed: 0,DomainLength,IsDomainIP,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,...,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef
0,12,0,0.0,0.148148,0.9,0.058824,2,1,0,2,...,1,0,0,0,0,1,0,0,0,0


In [11]:
features_df.columns

Index(['DomainLength', 'IsDomainIP', 'URLSimilarityIndex',
       'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength',
       'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar',
       'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL',
       'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL',
       'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon',
       'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect',
       'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit',
       'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields',
       'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo',
       'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef',
       'NoOfExternalRef'],
      dtype='object')

In [12]:
a="hdfcbank"
'bank' in a

True