In [3]:
import pandas as pd

df = pd.read_csv("../data/processed/clean_urls.csv")

df.head()


Unnamed: 0,URL,Label,label_numeric,url_length
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,1,225
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,1,81
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,1,177
3,mail.printakid.com/www.online.americanexpress....,bad,1,60
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,1,116


In [4]:
import re
import math
from urllib.parse import urlparse

def extract_features(url):
    features = {}

    features['url_length'] = len(url)
    features['dot_count'] = url.count('.')
    features['hyphen_count'] = url.count('-')
    features['digit_count'] = sum(c.isdigit() for c in url)
    features['slash_count'] = url.count('/')
    features['has_at'] = 1 if '@' in url else 0
    features['has_https'] = 1 if url.startswith('https') else 0

    # IP address detection
    features['has_ip'] = 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0

    # Subdomain count
    parsed = urlparse(url)
    features['subdomain_count'] = parsed.netloc.count('.') - 1

    # Entropy
    prob = [float(url.count(c)) / len(url) for c in set(url)]
    features['entropy'] = -sum(p * math.log2(p) for p in prob)

    return features


In [5]:
sample_df = df.sample(5000, random_state=42)

features_df = sample_df['URL'].apply(extract_features).apply(pd.Series)

final_df = pd.concat(
    [features_df, sample_df['label_numeric']],
    axis=1
)

final_df.head()


Unnamed: 0,url_length,dot_count,hyphen_count,digit_count,slash_count,has_at,has_https,has_ip,subdomain_count,entropy,label_numeric
498033,31.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,-1.0,3.977917,1
124823,24.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,3.803509,1
38799,142.0,14.0,0.0,39.0,2.0,0.0,0.0,0.0,-1.0,4.704061,1
17623,76.0,2.0,1.0,0.0,6.0,0.0,0.0,0.0,-1.0,4.379332,1
62282,25.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,-1.0,3.543465,0


In [6]:
features_df = df['URL'].apply(extract_features).apply(pd.Series)

final_df = pd.concat(
    [features_df, df['label_numeric']],
    axis=1
)


In [7]:
final_df.to_csv("../data/processed/urls_with_features.csv", index=False)
