In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw URL dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("phishing_site_urls.csv")

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [4]:
import re
from urllib.parse import urlparse

# Read the URL dataset into its own frame for feature extraction.
# NOTE(review): this is the same file already loaded as `df` in an earlier
# cell, so the CSV is read twice.
df_urls = pd.read_csv("phishing_site_urls.csv")

# Safe feature extraction
def safe_extract_features(url):
    """Compute lexical features for a single URL string.

    If the URL does not begin with an ``http://`` or ``https://`` scheme
    (compared case-insensitively), ``http://`` is prepended so that
    ``urlparse`` can separate the hostname from the path. All counts are
    taken on the URL *after* that prefix has been added.

    Parameters
    ----------
    url : str
        The raw URL, with or without a scheme.

    Returns
    -------
    list of int or None
        Nine values, in this order:

        1. number of ``.`` characters in the URL
        2. length of the URL
        3. number of ``-`` characters in the URL
        4. 1 if the URL contains ``@``, else 0
        5. 1 if the hostname is a dotted-quad of 1-3 digit groups, else 0
        6. 1 if the hostname contains the substring ``https``, else 0
        7. number of ``/`` characters in the path
        8. length of the path
        9. number of digit characters in the URL

        ``None`` is returned when ``url`` is not a string or cannot be
        parsed.
    """
    # Non-string cells (such as NaN for a missing value) cannot be parsed.
    if not isinstance(url, str):
        return None

    # Require a real scheme prefix: a bare host such as "httpd.apache.org"
    # starts with "http" but still needs a scheme, and schemes are
    # case-insensitive, so "HTTPS://..." must not be prefixed a second time.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

    try:
        parsed = urlparse(url)
        hostname = parsed.hostname or ""
    except ValueError:
        # urlparse rejects malformed authorities (e.g. an unbalanced '[').
        return None
    path = parsed.path

    return [
        url.count('.'),
        len(url),
        url.count('-'),
        1 if '@' in url else 0,
        1 if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", hostname) else 0,
        1 if 'https' in hostname else 0,
        path.count('/'),
        len(path),
        len(re.findall(r'\d', url))
    ]

# Run the extractor over every URL; rows that could not be processed come
# back as None and are excluded via a single reusable mask.
features_list = df_urls['URL'].apply(safe_extract_features)
extracted_ok = features_list.notnull()

feature_columns = [
    'NumDots',
    'UrlLength',
    'NumDash',
    'AtSymbol',
    'IpAddress',
    'HttpsInHostname',
    'PathLevel',
    'PathLength',
    'NumNumericChars',
]
features_df = pd.DataFrame(features_list[extracted_ok].tolist(), columns=feature_columns)

# Binary target: 1 when the label is 'bad', 0 for anything else. Only rows
# that produced features are kept, so positions line up with features_df.
kept_labels = df_urls.loc[extracted_ok, 'Label']
labels = kept_labels.apply(lambda x: 1 if x == 'bad' else 0)
features_df['Phishing'] = labels.values

   NumDots  UrlLength  NumDash  AtSymbol  IpAddress  HttpsInHostname  \
0        6        232        4         0          0                0   
1        5         88        2         0          0                0   
2        7        184        1         0          0                0   
3        6         67        0         0          0                0   
4        1        123        1         0          0                0   

   PathLevel  PathLength  NumNumericChars  Phishing  
0          8         125               58         1  
1          4          66                1         1  
2         11         161               47         1  
3          2          42                0         1  
4          7          60               21         1  


In [5]:
# Preview the first rows of the engineered feature table.
features_df.head()

Unnamed: 0,NumDots,UrlLength,NumDash,AtSymbol,IpAddress,HttpsInHostname,PathLevel,PathLength,NumNumericChars,Phishing
0,6,232,4,0,0,0,8,125,58,1
1,5,88,2,0,0,0,4,66,1,1
2,7,184,1,0,0,0,11,161,47,1
3,6,67,0,0,0,0,2,42,0,1
4,1,123,1,0,0,0,7,60,21,1


In [6]:
# Persist the feature table to CSV; index=False leaves the row index out of
# the file.
features_df.to_csv("phishing_dataset.csv", index=False)