# 🛡️ What is PhishStorm?


PhishStorm is a system that helps catch phishing websites in real time—before people fall for them. Phishing sites are fake websites made to trick you into giving away your personal info (like your bank login or PayPal password).



In [20]:
import pandas as pd

df = pd.read_csv(
    '/Users/inmilk306/Documents/GitHub_polinacsv/detecting-phishing-URLs/data/urlset.csv', 
    encoding='ISO-8859-1',
    on_bad_lines='skip'  # Skip all malformed lines
)

  df = pd.read_csv(


In [21]:
df.shape

(96005, 14)

In [3]:
# Take a random sample of 100 rows
sample_df = df.sample(n=100, random_state=42)

# Save to CSV
sample_df.to_csv('/Users/inmilk306/Documents/GitHub_polinacsv/detecting-phishing-URLs/data/sample_100.csv', index=False)

In [4]:
df.head()

Unnamed: 0,domain,ranking,mld_res,mld.ps_res,card_rem,ratio_Rrem,ratio_Arem,jaccard_RR,jaccard_RA,jaccard_AR,jaccard_AA,jaccard_ARrd,jaccard_ARrem,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,10000000,1.0,0.0,18.0,107.611111,107.277778,0.0,0.0,0.0,0.0,0.8,0.795729,1.0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,10000000,0.0,0.0,11.0,150.636364,152.272727,0.0,0.0,0.0,0.0,0.0,0.768577,1.0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,10000000,0.0,0.0,14.0,73.5,72.642857,0.0,0.0,0.0,0.0,0.0,0.726582,1.0
3,mail.printakid.com/www.online.americanexpress....,10000000,0.0,0.0,6.0,562.0,590.666667,0.0,0.0,0.0,0.0,0.0,0.85964,1.0
4,thewhiskeydregs.com/wp-content/themes/widescre...,10000000,0.0,0.0,8.0,29.0,24.125,0.0,0.0,0.0,0.0,0.0,0.748971,1.0


In [5]:
df.info()
df['label'].value_counts()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96005 entries, 0 to 96004
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   domain         96005 non-null  object 
 1   ranking        95953 non-null  object 
 2   mld_res        95935 non-null  object 
 3   mld.ps_res     95924 non-null  object 
 4   card_rem       95923 non-null  float64
 5   ratio_Rrem     95923 non-null  float64
 6   ratio_Arem     95923 non-null  float64
 7   jaccard_RR     95922 non-null  float64
 8   jaccard_RA     95921 non-null  float64
 9   jaccard_AR     95920 non-null  float64
 10  jaccard_AA     95919 non-null  float64
 11  jaccard_ARrd   95919 non-null  object 
 12  jaccard_ARrem  95917 non-null  object 
 13  label          95913 non-null  float64
dtypes: float64(8), object(6)
memory usage: 10.3+ MB


label
0.0    48009
1.0    47904
Name: count, dtype: int64

In [None]:
from detecting_phishing_urls.featuresold import extract_url_features

df_with_features = extract_url_features(df, url_col="domain")
df_with_features.head()

Unnamed: 0,domain,ranking,mld_res,mld.ps_res,card_rem,ratio_Rrem,ratio_Arem,jaccard_RR,jaccard_RA,jaccard_AR,...,num_digits,num_special_chars,has_ip,num_subdomains,num_dots,entropy,has_https,has_at_symbol,has_dash,path_length
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,10000000,1.0,0.0,18.0,107.611111,107.277778,0.0,0.0,0.0,...,58,32,0,-1,6,5.026886,0,0,1,134
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,10000000,0.0,0.0,11.0,150.636364,152.272727,0.0,0.0,0.0,...,1,15,0,-1,5,4.686883,0,0,1,81
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,10000000,0.0,0.0,14.0,73.5,72.642857,0.0,0.0,0.0,...,47,19,0,-1,7,4.721044,0,0,1,177
3,mail.printakid.com/www.online.americanexpress....,10000000,0.0,0.0,6.0,562.0,590.666667,0.0,0.0,0.0,...,0,8,0,-1,6,4.079842,0,0,0,60
4,thewhiskeydregs.com/wp-content/themes/widescre...,10000000,0.0,0.0,8.0,29.0,24.125,0.0,0.0,0.0,...,21,13,0,-1,1,4.608653,0,0,1,79


### 📊 URL Feature Descriptions

| Column            | Description |
|-------------------|-------------|
| `domain`          | The full domain name of the URL |
| `J_RR`            | Jaccard similarity between related words (domain vs. rest) |
| `J_RA`            | Jaccard similarity between related (domain) and associated (rest) |
| `J_AA`            | Jaccard similarity between associated words (domain vs. rest) |
| `J_AR`            | Jaccard similarity between associated (domain) and related (rest) |
| `J_ARrd`          | Jaccard similarity between associated and related (domain only) |
| `J_ARrem`         | Jaccard similarity between associated and related (rest only) |
| `card_rem`        | Number of words in the rest of the URL |
| `ratio_Arem`      | Fraction of associated words in the rest of the URL |
| `ratio_Rrem`      | Fraction of related words in the rest of the URL |
| `mld_res`         | Is the domain found in a search engine? (0/1) |
| `mld.ps_res`      | Is the domain's public suffix found in a search engine? |
| `ranking`         | Alexa ranking of the domain |
| `url_length`      | Total number of characters in the full URL |
| `num_digits`      | Number of digits in the URL |
| `num_special_chars` | Number of special characters in the URL |
| `has_ip`          | Does the URL use an IP address? (1 if yes) |
| `num_subdomains`  | Number of subdomains in the URL |
| `num_dots`        | Number of periods (.) in the URL |
| `entropy`         | Shannon entropy of the URL string (higher = more random) |
| `has_https`       | Does it use HTTPS protocol? (1 if yes) |
| `has_at_symbol`   | Contains an '@' symbol? (often suspicious) |
| `has_dash`        | Contains a '-' dash? |
| `path_length`     | Length of the path after the domain |
| `label`           | Target: 1 = phishing, 0 = not phishing |


In [8]:
import re

# Use only domain and label
df = df[['domain', 'label']].copy()

# Feature engineering functions
def count_digits(url):
    return sum(c.isdigit() for c in url)

def count_special_chars(url):
    return len(re.findall(r'[^a-zA-Z0-9]', url))

def has_ip_address(url):
    return int(bool(re.search(r'\b\d{1,3}(\.\d{1,3}){3}\b', url)))

def count_subdomains(url):
    domain_part = url.split('/')[0]
    return domain_part.count('.')

def has_suspicious_words(url):
    keywords = ['login', 'verify', 'secure', 'account', 'update', 'ebay', 'paypal', 'bank']
    return int(any(word in url.lower() for word in keywords))

# Apply feature engineering
df['url_length'] = df['domain'].str.len()
df['num_digits'] = df['domain'].apply(count_digits)
df['num_special_chars'] = df['domain'].apply(count_special_chars)
df['has_ip'] = df['domain'].apply(has_ip_address)
df['num_subdomains'] = df['domain'].apply(count_subdomains)
df['has_suspicious_words'] = df['domain'].apply(has_suspicious_words)

# Show the engineered features

df.head()


Unnamed: 0,domain,label,url_length,num_digits,num_special_chars,has_ip,num_subdomains,has_suspicious_words
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1.0,225,58,32,0,1,1
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1.0,81,1,15,0,2,1
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1.0,177,47,19,0,1,1
3,mail.printakid.com/www.online.americanexpress....,1.0,60,0,8,0,2,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,1.0,116,21,13,0,1,0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Define features and target
features = ['url_length', 'num_digits', 'num_special_chars', 'has_ip', 'num_subdomains', 'has_suspicious_words']


# Drop rows where 'label' is missing (just in case)
df_cleaned = df.dropna(subset=['label']).copy()

# Re-run the model pipeline with the cleaned data
X = df_cleaned[features]
y = df_cleaned['label']

# Split and train again
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Generate updated report
report_cleaned = classification_report(y_test, y_pred, output_dict=True)
conf_matrix_cleaned = confusion_matrix(y_test, y_pred)

report_cleaned, conf_matrix_cleaned


({'0.0': {'precision': 0.8388204981391354,
   'recall': 0.9154342845240575,
   'f1-score': 0.8754544096409542,
   'support': 9602},
  '1.0': {'precision': 0.9067095588235294,
   'recall': 0.8237135998330029,
   'f1-score': 0.8632212195788898,
   'support': 9581},
  'accuracy': 0.8696241463796069,
  'macro avg': {'precision': 0.8727650284813324,
   'recall': 0.8695739421785302,
   'f1-score': 0.869337814609922,
   'support': 19183},
  'weighted avg': {'precision': 0.8727278687494247,
   'recall': 0.8696241463796069,
   'f1-score': 0.869344510564447,
   'support': 19183}},
 array([[8790,  812],
        [1689, 7892]]))

In [10]:
# Check for missing values in the target column 'label'
missing_labels = df['label'].isna().sum()

# Display how many missing labels exist
missing_labels

92

In [12]:
from urllib.parse import urlparse, unquote
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score
import numpy as np

# Helper functions to extract and split words
def extract_parts(url):
    parsed = urlparse('http://' + url)  # prepend scheme to avoid parsing issues
    domain = parsed.hostname or ''
    path = parsed.path + parsed.query
    return domain, path

def tokenize(text):
    # Remove URL encoding and split on non-alphanumeric characters
    text = unquote(text)
    tokens = re.split(r'[^a-zA-Z0-9]', text.lower())
    return [token for token in tokens if token]

# New features
def compute_jaccard(domain, path):
    domain_tokens = set(tokenize(domain))
    path_tokens = set(tokenize(path))
    if not domain_tokens or not path_tokens:
        return 0.0
    return len(domain_tokens & path_tokens) / len(domain_tokens | path_tokens)

def compute_overlap(domain, path):
    domain_tokens = set(tokenize(domain))
    path_tokens = set(tokenize(path))
    return len(domain_tokens & path_tokens)

def has_brand_conflict(domain, path):
    brand_words = ['paypal', 'bank', 'ebay', 'secure', 'login', 'account', 'signin']
    domain_tokens = tokenize(domain)
    path_tokens = tokenize(path)
    return int(any(word in path_tokens and word not in domain_tokens for word in brand_words))

# Apply feature engineering
df_cleaned[['domain_part', 'path_part']] = df_cleaned['domain'].apply(lambda url: pd.Series(extract_parts(url)))
df_cleaned['jaccard_domain_path'] = df_cleaned.apply(lambda row: compute_jaccard(row['domain_part'], row['path_part']), axis=1)
df_cleaned['overlap_count'] = df_cleaned.apply(lambda row: compute_overlap(row['domain_part'], row['path_part']), axis=1)
df_cleaned['has_brand_conflict'] = df_cleaned.apply(lambda row: has_brand_conflict(row['domain_part'], row['path_part']), axis=1)

# Display new features

df_cleaned[['jaccard_domain_path', 'overlap_count', 'has_brand_conflict']].describe()


Unnamed: 0,jaccard_domain_path,overlap_count,has_brand_conflict
count,95913.0,95913.0,95913.0
mean,0.01252,0.140878,0.165473
std,0.042912,0.475267,0.371609
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,12.0,1.0


In [14]:
# Combine earlier basic features with new relatedness-inspired ones
all_features = features + ['jaccard_domain_path', 'overlap_count', 'has_brand_conflict']
X_all = df_cleaned[all_features]
y_all = df_cleaned['label']

# Train/test split and model fitting
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y_all, stratify=y_all, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_all, y_train_all)

# Get feature importances
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': all_features,
    'importance': importances
}).sort_values(by='importance', ascending=False)


feature_importance_df


Unnamed: 0,feature,importance
0,url_length,0.283969
5,has_suspicious_words,0.190377
2,num_special_chars,0.178656
4,num_subdomains,0.154402
1,num_digits,0.125575
8,has_brand_conflict,0.056914
6,jaccard_domain_path,0.006649
7,overlap_count,0.002932
3,has_ip,0.000526


In [15]:
full_df = pd.read_csv(
    '/Users/inmilk306/Documents/GitHub_polinacsv/detecting-phishing-URLs/data/urlset.csv', 
    encoding='ISO-8859-1',
    on_bad_lines='skip'  # Skip all malformed lines
)

  full_df = pd.read_csv(


In [16]:
# Drop missing labels
full_df = full_df.dropna(subset=['label'])

# Convert any non-numeric columns that should be numeric (e.g., jaccard_ARrd, jaccard_ARrem)
columns_to_convert = [
    'ranking', 'mld_res', 'mld.ps_res',
    'jaccard_ARrd', 'jaccard_ARrem'
]

for col in columns_to_convert:
    full_df[col] = pd.to_numeric(full_df[col], errors='coerce')

# Select only original PhishStorm-style features
phishstorm_features = [
    'ranking', 'mld_res', 'mld.ps_res', 'card_rem', 'ratio_Rrem', 'ratio_Arem',
    'jaccard_RR', 'jaccard_RA', 'jaccard_AR', 'jaccard_AA', 'jaccard_ARrd', 'jaccard_ARrem'
]

# Drop rows with missing values in selected features
full_df_clean = full_df.dropna(subset=phishstorm_features + ['label'])

# Prepare X and y
X_phishstorm = full_df_clean[phishstorm_features]
y_phishstorm = full_df_clean['label']

# Train/test split
X_train_ps, X_test_ps, y_train_ps, y_test_ps = train_test_split(
    X_phishstorm, y_phishstorm, stratify=y_phishstorm, test_size=0.2, random_state=42)

# Train Random Forest
rf_ps_model = RandomForestClassifier(random_state=42)
rf_ps_model.fit(X_train_ps, y_train_ps)

# Feature importances
ps_importances = rf_ps_model.feature_importances_
ps_feature_importance_df = pd.DataFrame({
    'feature': phishstorm_features,
    'importance': ps_importances
}).sort_values(by='importance', ascending=False)


ps_feature_importance_df

Unnamed: 0,feature,importance
3,card_rem,0.209673
0,ranking,0.172352
11,jaccard_ARrem,0.120908
4,ratio_Rrem,0.120496
5,ratio_Arem,0.115098
10,jaccard_ARrd,0.089973
1,mld_res,0.085136
9,jaccard_AA,0.025314
6,jaccard_RR,0.01897
8,jaccard_AR,0.017757
