In [15]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract
import re
import pandas as pd 


# Use a custom cache directory
extractor = tldextract.TLDExtract(cache_dir='/path/to/writable/cache_dir')
extractor = tldextract.TLDExtract(cache_dir=False)

def analyze_url(url):
    features = {}
    
    # 1. FILENAME
    features['FILENAME'] = url.split('//')[-1].split('/')[0]
    
    # 2. URL
    features['URL'] = url
    
    # 3. URLLength
    features['URLLength'] = len(url)
    
    # 4. Domain
    extracted = tldextract.extract(url)
    features['Domain'] = extracted.domain
    
    # 5. DomainLength
    #features['DomainLength'] = len(features['Domain'])
    protocol = urlparse(url).scheme  # Get the protocol (http or https)
    full_domain = protocol + ('.' + extracted.subdomain if extracted.subdomain else '') + '.' + features['Domain']
    features['DomainLength'] = sum(c.isalpha() for c in full_domain)

    # 6. IsDomainIP
    features['IsDomainIP'] = 1 if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', extracted.domain) else 0
    
    # 7. TLD
    features['TLD'] = extracted.suffix
    
    # 8. URLSimilarityIndex (Placeholder for now)
    features['URLSimilarityIndex'] = 0  # Implement actual logic for similarity score
    
    # 9. CharContinuationRate
    features['CharContinuationRate'] = calculate_continuation_rate(url)
    
    # 10. TLDLegitimateProb (Placeholder for now)
    features['TLDLegitimateProb'] = 0.9  # Example value, implement actual logic
    
    # 11. URLCharProb (Placeholder for now)
    features['URLCharProb'] = 0.9  # Example value, implement actual logic
    
    # 12. TLDLength
    features['TLDLength'] = len(features['TLD'])
    
    # 13. NoOfSubDomain
    features['NoOfSubDomain'] = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
    
    # 14. HasObfuscation and 15. NoOfObfuscatedChar
    obfuscation_chars = re.findall(r'%|\.|_|-|@|%20', url)
    features['HasObfuscation'] = 1 if obfuscation_chars else 0
    features['NoOfObfuscatedChar'] = len(''.join(obfuscation_chars))
    
    # 16. ObfuscationRatio
    features['ObfuscationRatio'] = features['NoOfObfuscatedChar'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # 17. NoOfLettersInURL
    features['NoOfLettersInURL'] = sum(c.isalpha() for c in url)
    
    # 18. LetterRatioInURL
    features['LetterRatioInURL'] = features['NoOfLettersInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # 19. NoOfDigitsInURL
    features['NoOfDigitsInURL'] = sum(c.isdigit() for c in url)
    
    # 20. DigitRatioInURL
    features['DigitRatioInURL'] = features['NoOfDigitsInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # 21. NoOfEqualsInURL
    features['NoOfEqualsInURL'] = url.count('=')
    
    # 22. NoOfQMarkInURL
    features['NoOfQMarkInURL'] = url.count('?')
    
    # 23. NoOfAmpersandInURL
    features['NoOfAmpersandInURL'] = url.count('&')
    
    # 24. NoOfOtherSpecialCharsInURL
    features['NoOfOtherSpecialCharsInURL'] = len(re.findall(r'[^a-zA-Z0-9:/?&=]', url))
    
    # 25. SpecialCharRatioInURL
    features['SpecialCharRatioInURL'] = features['NoOfOtherSpecialCharsInURL'] / features['URLLength'] if features['URLLength'] > 0 else 0
    
    # 26. IsHTTPS
    features['IsHTTPS'] = 1 if url.startswith('https://') else 0
    
    # 27. LineOfCode, 28. LargestLineLength, and other website features
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # 27. LineOfCode (Approximation)
        features['LineOfCode'] = len(soup.prettify().splitlines())
        
        # 28. LargestLineLength
        features['LargestLineLength'] = max(len(line) for line in soup.prettify().splitlines())
        
        # 29. HasTitle
        title_tag = soup.find('title')
        features['HasTitle'] = 1 if title_tag else 0
        
        # 30. Title
        features['Title'] = title_tag.text if title_tag else ''
        
        # 31. DomainTitleMatchScore (Placeholder)
        features['DomainTitleMatchScore'] = 0  # Implement actual logic
        
        # 32. URLTitleMatchScore (Placeholder)
        features['URLTitleMatchScore'] = 0  # Implement actual logic
        
        # 33. HasFavicon
        features['HasFavicon'] = 1 if soup.find('link', rel='icon') else 0
        
        # 34. Robots
        features['Robots'] = 1 if requests.get(url + '/robots.txt').status_code == 200 else 0
        
        # 35. IsResponsive (Simple check)
        features['IsResponsive'] = 1 if soup.find('meta', attrs={'name': 'viewport'}) else 0
        
        # 36. NoOfURLRedirect
        features['NoOfURLRedirect'] = len(response.history)
        
        # 37. NoOfSelfRedirect (Not easy to implement without actual logging)
        features['NoOfSelfRedirect'] = 0  # Implement actual logic
        
        # 38. HasDescription
        features['HasDescription'] = 1 if soup.find('meta', attrs={'name': 'description'}) else 0
        
        # 39. NoOfPopup and 40. NoOfiFrame (Approximation)
        features['NoOfPopup'] = 0  # Implement actual logic
        features['NoOfiFrame'] = len(soup.find_all('iframe'))
        
        # 41. HasExternalFormSubmit
        features['HasExternalFormSubmit'] = 1 if soup.find('form', action=lambda x: x and not x.startswith('/')) else 0
        
        # 42. HasSocialNet
        features['HasSocialNet'] = 1 if soup.find('a', href=lambda x: 'facebook.com' in x or 'twitter.com' in x) else 0
        
        # 43. HasSubmitButton
        features['HasSubmitButton'] = 1 if soup.find('input', type='submit') else 0
        
        # 44. HasHiddenFields
        features['HasHiddenFields'] = 1 if soup.find('input', type='hidden') else 0
        
        # 45. HasPasswordField
        features['HasPasswordField'] = 1 if soup.find('input', type='password') else 0
        
        # 46. Bank, 47. Pay, 48. Crypto (Placeholder)
        features['Bank'] = 1 if 'bank' in url.lower() else 0
        features['Pay'] = 1 if 'pay' in url.lower() else 0
        features['Crypto'] = 1 if 'crypto' in url.lower() else 0
        
        # 49. HasCopyrightInfo
        features['HasCopyrightInfo'] = 1 if soup.find(text=re.compile(r'copyright', re.I)) else 0
        
        # 50. NoOfImage
        features['NoOfImage'] = len(soup.find_all('img'))
        
        # 51. NoOfCSS
        features['NoOfCSS'] = len(soup.find_all('link', rel='stylesheet'))
        
        # 52. NoOfJS
        features['NoOfJS'] = len(soup.find_all('script'))
        
        # 53. NoOfSelfRef (Approximation)
        features['NoOfSelfRef'] = len(soup.find_all('a', href=lambda x: x and x.startswith(url)))
        
        # 54. NoOfEmptyRef
        features['NoOfEmptyRef'] = len(soup.find_all('a', href=''))
        
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    
    df = pd.DataFrame([features])  # Creating DataFrame from features dictionary
    return df

def calculate_continuation_rate(url):
    """
    Calculate the rate of character continuation in the URL.
    This is a simplified version and can be improved.
    """
    continuation_count = sum(url[i] == url[i + 1] for i in range(len(url) - 1))
    return continuation_count / (len(url) - 1) if len(url) > 1 else 0

def clean_url(url):
    # Remove any unwanted characters like tab
    return url.strip()



# Example usage
url = "https://www.southbankmosaics.com	"
url = clean_url(url)
features_df = analyze_url(url)
features_df

Error fetching https://www.southbankmosaics.com: argument of type 'NoneType' is not iterable


Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,URLTitleMatchScore,HasFavicon,Robots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,NoOfiFrame,HasExternalFormSubmit
0,www.southbankmosaics.com,https://www.southbankmosaics.com,32,southbankmosaics,24,0,com,0,0.129032,0.9,...,0,0,1,1,0,0,0,0,0,1


In [16]:
features_df.values

array([['www.southbankmosaics.com', 'https://www.southbankmosaics.com',
        32, 'southbankmosaics', 24, 0, 'com', 0, 0.12903225806451613,
        0.9, 0.9, 3, 1, 1, 2, 0.0625, 27, 0.84375, 0, 0.0, 0, 0, 0, 2,
        0.0625, 1, 1282, 9467, 1,
        'ข่าวสด ข่าววันนี้ ข่าวกีฬา ข่าวบันเทิง อัพเดทสดใหม่ทุกวัน – ข่าวสด ข่าวกีฬา ข่าวบันเทิง ข่าววันนี้ อัปเดตข่าวสารรวดเร็วทันใจ พร้อมรับชมสาระน่ารู้ต่างๆ ได้ฟรีตลอด 24ชั่วโมง',
        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1]], dtype=object)