In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.parse import urlparse
from utils.data_utils import *
import re
import os

# Create the EDA directory if it doesn't exist
os.makedirs('EDA', exist_ok=True)

# Map each dataset name to a zero-argument loader. Every loader returns a
# (train, validation, benchmark) triple. A lookup table replaces the former
# if/elif chain, which had no else branch: an unrecognised name would have
# silently reused the frames loaded in the previous iteration.
dataset_loaders = {
    "phiusiil": import_phiusiil,
    "malicious_phish": import_malicious_phish,
    "url_dataset": import_url_dataset,
    # The combined dataset is loaded with fraction=0.02, as before.
    "combined": lambda: import_combined(fraction=0.02),
}

# List of datasets to import
datasets = ["phiusiil", "malicious_phish", "url_dataset", "combined"]

# Loop through each dataset type
for to_import in datasets:
    # Import dataset, failing loudly on a name that has no loader
    if to_import not in dataset_loaders:
        raise ValueError(f"Unknown dataset: {to_import!r}")
    train, validation, benchmark = dataset_loaders[to_import]()

    # Only the training split is explored below
    df = train

    # Display the first few rows of the dataset
    print(f"Processing dataset: {to_import}")
    print(df.head())

    # Feature Extraction Function
    def extract_features(url):
        """Compute simple lexical features for a single URL string.

        Args:
            url: The URL to analyse. It may omit the scheme
                (e.g. ``"example.com/path"``).

        Returns:
            A ``pd.Series`` of ten integers, in this order: url_length,
            domain_length, subdomain_count, path_length, query_param_count,
            is_https, special_char_count, is_ip_address,
            last_path_segment_length, contains_suspicious_keyword.

        Raises:
            ValueError: If ``urlparse`` rejects the URL as given.
        """
        parsed_url = urlparse(url)
        # Decide on HTTPS from the URL as written, before any re-parse below.
        is_https = 1 if parsed_url.scheme == 'https' else 0

        # urlparse only recognises a host after '//'. For a scheme-less URL
        # such as "example.com/path" the host would land in .path and .netloc
        # would be empty, so re-parse it as a network-path reference.
        if not parsed_url.netloc and '://' not in url:
            try:
                parsed_url = urlparse('//' + url)
            except ValueError:
                # The prefixed form is not parseable (e.g. a stray '[');
                # keep the result of the first parse.
                parsed_url = urlparse(url)

        domain = parsed_url.netloc
        # Host without userinfo/port, used for the dot count and the IP check.
        host = parsed_url.hostname or ''
        path = parsed_url.path
        query = parsed_url.query

        # Calculate features
        url_length = len(url)
        domain_length = len(domain)
        # Dots beyond the one separating name and TLD; never negative, so a
        # dot-less or missing host counts as zero subdomains.
        subdomain_count = max(host.count('.') - 1, 0)
        path_length = len(path)
        query_param_count = len(query.split('&')) if query else 0
        special_char_count = len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', url))
        is_ip_address = 1 if re.fullmatch(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', host) else 0
        last_path_segment_length = len(path.split('/')[-1]) if path else 0

        # Check for suspicious keywords (case-insensitive substring match)
        suspicious_keywords = ('login', 'secure', 'update', 'account', 'verify')
        lowered_url = url.lower()
        contains_suspicious_keyword = any(keyword in lowered_url for keyword in suspicious_keywords)

        return pd.Series([
            url_length,
            domain_length,
            subdomain_count,
            path_length,
            query_param_count,
            is_https,
            special_char_count,
            is_ip_address,
            last_path_segment_length,
            int(contains_suspicious_keyword),
        ])

    # Apply feature extraction to each URL
    df[['url_length', 'domain_length', 'subdomain_count', 'path_length',
        'query_param_count', 'is_https', 'special_char_count',
        'is_ip_address', 'last_path_segment_length', 'contains_suspicious_keyword']] = df['url'].apply(extract_features)

    # Create the EDA directory if it doesn't exist
    os.makedirs('EDA', exist_ok=True)

    # Create a subfolder for the specific dataset
    dataset_folder = f'EDA/{to_import}'
    os.makedirs(dataset_folder, exist_ok=True)

    # Visualizing URL Length Distribution and saving the plot
    plt.figure(figsize=(10, 6))
    sns.histplot(df['url_length'], bins=30, kde=True)
    plt.title(f'Distribution of URL Lengths ({to_import})')
    plt.xlabel('Length of URL')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.savefig(f'{dataset_folder}/distribution_url_lengths_{to_import}.png')  # Save plot in specific dataset folder
    plt.close()  # Close the plot to free memory

    # Set limits for x-axis based on significance for each feature
    x_limits = {
        'last_path_segment_length': (0, 5),  # Adjust these limits based on your data analysis
        'special_char_count': (0, 10),        # Adjust these limits based on your data analysis
        'domain_length': (0, 30),             # Adjust these limits based on your data analysis
        'path_length': (0, 50)                # Adjust these limits based on your data analysis
    }

    # Visualizing counts for new features and saving each plot with x-axis limits applied
    feature_columns = ['domain_length', 'subdomain_count', 'path_length', 
                       'query_param_count', 'is_https', 
                       'special_char_count', 'is_ip_address',
                       'last_path_segment_length', 
                       'contains_suspicious_keyword']

    for feature in feature_columns:
        plt.figure(figsize=(10, 6))
        
        if feature in x_limits:
            sns.countplot(x=feature, data=df)
            plt.xlim(x_limits[feature])  # Set x-axis limits only for specified features
        else:
            sns.countplot(x=feature, data=df)

        plt.title(f'Count of {feature} ({to_import})')
        plt.xlabel(feature)
        plt.ylabel('Count')
        plt.grid(True)
        
        plt.savefig(f'{dataset_folder}/count_{feature}_{to_import}.png')  # Save plot in specific dataset folder
        plt.close()  # Close the plot to free memory

print("Plots saved in the EDA folder.")

Processing dataset: phiusiil
                                         url  label
0                      https://www.gdc.co.ke      0
1               https://www.astrotrishla.com      0
2                      https://www.npstc.org      0
3  https://www.harrisburgregionalchamber.org      0
4                https://www.cincymuseum.org      0
Processing dataset: malicious_phish
                                                 url  label
0  firedepartmentdirectory.com/location/County-Fi...      0
1  sopfeu.qc.ca/en/etat_de_la_situation/danger_in...      0
2               filecatch.com/?q=zuzana+rock+body+tv      0
3  http://tobogo.net/cdsb/board.php?board=storyan...      0
4   kindomain2.com/3a6a5a4bf24172a7bbf51eae887700ad/      0
Processing dataset: url_dataset
                                                 url  label
0  https://www.countrydiscography.blogspot.com/20...      0
1   https://www.en.wikipedia.org/wiki/Peter_Bronfman      0
2  https://www.atlanticbassmasters.com/Lake%20Rec..