In [1]:
import pandas as pd 

In [107]:
legiurl = pd.read_csv('legiurl.csv')
phishurl = pd.read_csv('phishurl.csv')

In [3]:
legiurl

Unnamed: 0,url
0,www.artima.com/intv/guido.html
1,canada.com/calgaryherald/news/calgarybusiness/...
2,shastacosportshof.org/northstate.htm
3,acapella.harmony-central.com/showthread.php?27...
4,models.com/models/Adriana-Lima/news
...,...
4995,oldorthodoxy.wetpaint.com/
4996,nydailynews.com/topics/IMAX%20Corporation
4997,en.wikipedia.org/wiki/Can%27t_Nobody_Hold_Me_Down
4998,klfy.com/


In [117]:
phishurl

Unnamed: 0,url
0,158.255.226.168/js/
1,www.swifthawkstudios.com/~be/paypal/_webscr1.p...
2,herubaba.wapka.me/index.xhtml
3,elserviciodelvino.mx/tiendaserviciodelvino/che...
4,dwostasms.com/seta.apk
...,...
4995,sykonsult.com/bulletin.php
4996,91.239.25.67:6892
4997,senawhlqiyl.biz/apache_handler.php
4998,reaga.cz/olnojnfo


## Feature Extraction:
In this step, features are extracted from the URLs dataset.

## 1. Number of Dots
The number of dots in the URL. Phishing URLs may have an excessive number of dots in an attempt to mimic legitimate domains.

In [4]:
from urllib.parse import urlparse

def num_dots(url):
    return url.count('.')

## 2. Subdomain Level
The number of subdomains in the URL. Phishing URLs often use subdomains to create a false sense of legitimacy.

In [5]:
def subdomain_level(url):
    parsed_url = urlparse(url)
    if parsed_url.hostname:
        subdomains = parsed_url.hostname.split('.')
        return len(subdomains)
    else:
        subdomains = parsed_url.path.split('.')
        return len(subdomains)

## 3. Path Level
The depth of the URL path. Phishing URLs may have longer paths to hide malicious content.

In [6]:
def path_level(url):
    path = urlparse(url).path
    return path.count('/') + 1  # Add 1 to account for the root path

## 4. URL length¶
The overall length of the URL. Phishing URLs may be longer to obfuscate their true nature.

In [7]:
def url_length(url):
    return len(url)

## 5. Number of Dash
The number of dashes in the URL. Phishing URLs may use dashes to imitate legitimate URLs or create confusion.

In [8]:
def num_dash(url):
    return url.count('-')

## 6. Number of Dash in HostName
The number of dashes in the hostname. Phishing URLs may use dashes to mimic legitimate domain names.

In [9]:
def num_dash_in_hostname(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc.count('-') + parsed_url.path.count('-')

## 7. @ symbol
Presence of the '@' symbol in the URL. Phishing URLs might use this to disguise their true nature or imitate login pages.

In [11]:
def at_symbol(url):
    parsed_url = urlparse(url)
    return url.count('@')
    
# url = "https://exam@ple23@4523@.com/@"
# at_symbol(url)

## 8. ~ symbol
Presence of the '~' symbol in the URL. Phishing URLs may use uncommon symbols to evade detection or mimic legitimate URLs.

In [15]:
def tilde_symbol(url):
    parsed_url = urlparse(url)
    return url.count('~')

# url = "e~xample234~523.com/alkjflkj~"
# tilde_symbol(url)

## 9. Number of underscore _
The number of underscores in the URL. Phishing URLs may use underscores to simulate legitimate URLs.

In [16]:
def num_underscore(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc.count('_') + parsed_url.path.count('_') + parsed_url.query.count('_') + parsed_url.fragment.count('_') 
    

# url = "example_23_45_23_.com"
# num_underscore(url)

## 10. Number of %
The number of percent symbols in the URL. Phishing URLs may use percent encoding to hide malicious content.

In [17]:
def num_percent(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc.count('%') + parsed_url.path.count('%') + parsed_url.query.count('%') + parsed_url.fragment.count('%') 

# url = "example%23%45%23.com"
# num_percent(url)

## 11. Number of Query Components
The number of components in the query string. Phishing URLs might use complex query strings to obfuscate their purpose.

In [18]:
from urllib.parse import urlparse, parse_qs
import re
import ipaddress

def num_query_components(url):
    parsed_url = urlparse(url)
    return len(parse_qs(parsed_url.query))

## 12. Number of '&' symbol
The number of ampersands in the URL. Phishing URLs may use multiple ampersands in the query string to confuse users.

In [20]:
def num_ampersand(url):
    return url.count('&')

## 13. Number of '#' symbols
The number of hash symbols in the URL. Phishing URLs might use hash symbols to hide parameters or fragments.

In [21]:
def num_hash(url):
    return url.count('#')

## 14. Number of Numeric Characters
The number of numeric characters in the URL. Phishing URLs may include random numbers to evade detection.

In [22]:
def num_numeric_chars(url):
    return sum(c.isdigit() for c in url)

## 15. No HTTPS
Indicates if the URL doesn't use HTTPS. Phishing URLs may lack secure connections to impersonate legitimate sites.

In [23]:
def no_https(url):
    parsed_url = urlparse(url)
    if (parsed_url.scheme=="https" or parsed_url.scheme==""):
        return 0
    else:
        return 1

# url = "google.com"
# no_https(url)

## 16. Random String
Presence of a random-looking string in the URL. Phishing URLs might use random strings to disguise their identity.

In [24]:
import re

def random_string(url):
    # Check if the URL contains a random-looking string
    return 1 if bool(re.search(r'[0-9a-f]{8}[-]?([0-9a-f]{4}[-]?){3}[0-9a-f]{12}', url, re.I)) else 0

# url = "googel.com"
# result = random_string(url)
# print(result)

## 17. IP address
Presence of an IP address in the URL. Phishing URLs may use IP addresses to bypass domain-based blacklists.

In [26]:
import re

def ip_address(url):
    
    url_domain = re.search(r'(?:https?://)?([^/]+)', url).group(1)

    # Define regular expressions
    ip_pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    ipv4_pattern = re.compile(r'(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9]?[0-9])(\.|$){4}')
    hex_ipv4_pattern = re.compile(r'(0x([0-9][0-9]|[A-F][A-F]|[A-F][0-9]|[0-9][A-F]))(\.|$){4}')

    # Check if the URL domain matches any pattern
    if ip_pattern.match(url_domain) or ipv4_pattern.match(url_domain) or hex_ipv4_pattern.match(url_domain):
        result = 1
    else:
        result = 0
        
    return result
        
        
# url = "https://192.168.1.1"
# ip_address(url)

## 18. HTTPS in hostname
Presence of 'https' in the hostname. Phishing URLs may include 'https' to deceive users about the security of the site.

In [35]:
from urllib.parse import urlparse

def https_in_hostname(url):
    parsed_url = urlparse(url)
    return 1 if 'https' in parsed_url.netloc or 'https' in parsed_url.path else 0

# url = "google.com/jahsdfh/https"
# result = https_in_hostname(url)
# print(result)

## 19. Hostname Length
The length of the hostname. Phishing URLs may use longer hostnames to imitate legitimate sites.

In [38]:
def hostname_length(url):
    parsed_url = urlparse(url)
    if parsed_url.hostname:
        return len(parsed_url.hostname)
    else: 
        return len(parsed_url.path.split('/')[0])
    
# url = "google.com"
# hostname_length(url)

## 20. Path Length
The length of the URL path. Phishing URLs may have longer paths to hide malicious content.

In [40]:
def path_length(url):
    parsed_url = urlparse(url)
    if parsed_url.hostname:
        return len(parsed_url.path)
    else: 
        return (len(parsed_url.path.split('/', 1)[-1])+1)

url = "https://google.com/hello"
path_length(url)

6

## 21. Query Length
The length of the query string. Phishing URLs may use longer queries to obfuscate their true purpose.

In [42]:
def query_length(url):
    return len(urlparse(url).query)

# url = "https://example.com/?query=books"
# query_length(url)

## 22. Double slash in path
Presence of consecutive slashes in the URL path. Phishing URLs may use double slashes to mimic legitimate paths.

In [44]:
def double_slash_in_path(url):
    return 1 if '//' in urlparse(url).path else 0

# url = "google.com//hello"
# double_slash_in_path(url)

## 23. Using URL Shortening Services “TinyURL”
URL shortening is a method on the “World Wide Web” in which a URL may be made considerably smaller in length and still lead to the required webpage. This is accomplished by means of an “HTTP Redirect” on a domain name that is short, which links to the webpage that has a long URL.

If the URL is using Shortening Services, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).

In [48]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0

## 24. IFrame Redirection

IFrame is an HTML tag used to display an additional webpage into one that is currently shown. Phishers can make use of the “iframe” tag and make it invisible i.e. without frame borders. In this regard, phishers make use of the “frameBorder” attribute which causes the browser to render a visual delineation.

If the iframe is empty or repsonse is not found then, the value assigned to this feature is 1 (Phishing) or else 0 (Legitimate).

In [97]:
def iframe(response):
  if response == "":
      return 1
  else:
      if re.findall(r"[<iframe>|<frameBorder>]", response.text):
          return 0
      else:
          return 1

## 25. Status Bar Customization

Phishers may use JavaScript to show a fake URL in the status bar to users. To extract this feature, we must dig-out the webpage source code, particularly the “onMouseOver” event, and check if it makes any changes on the status bar

If the response is empty or onmouseover is found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).

In [95]:
def mouseOver(response): 
  if response == "" :
    return 1
  else:
    if re.findall("<script>.+onmouseover.+</script>", response.text):
      return 1
    else:
      return 0

# Computing URL Features
Create a list and a function that calls the other functions and stores all the features of the URL in the list. We will extract the features of each URL and append to this list.

In [108]:
#Function to extract features
def featureExtraction(url,label):

  features = []

  
  features.append(num_dots(url))
  features.append(subdomain_level(url))
  features.append(path_level(url))
  features.append(url_length(url))
  features.append(num_dash(url))
  features.append(num_dash_in_hostname(url))
  features.append(at_symbol(url))
  features.append(tilde_symbol(url))
  features.append(num_underscore(url))
  features.append(num_percent(url))
  features.append(num_query_components(url))
  features.append(num_ampersand(url))
  features.append(num_hash(url))
  features.append(num_numeric_chars(url))
  features.append(no_https(url))
  features.append(random_string(url))
  features.append(ip_address(url))
  features.append(https_in_hostname(url))
  features.append(hostname_length(url))
  features.append(path_length(url))
  features.append(query_length(url))
  features.append(double_slash_in_path(url))
  features.append(tinyURL(url))

  try:
    response = requests.get(url)
  except:
    response = ""
    
  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(label)
  
  return features

## Legitimate URLs:
Now, feature extraction is done on legitimate URLs.

In [109]:
#Extracting the feautres & storing them in a list
legi_features = []
label = 0

for i in range(0, 5000):
  url = legiurl["url"][i]
  legi_features.append(featureExtraction(url,label))

In [110]:
#converting the list to dataframe
feature_names = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore', 'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash', 'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress', 'HttpsInHostname', 'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath', 'TinyURL', 'iFrame', 'Mouse_Over', 'Label']

legitimate = pd.DataFrame(legi_features, columns= feature_names)

In [111]:
legitimate.head()

Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IpAddress,HttpsInHostname,HostnameLength,PathLength,QueryLength,DoubleSlashInPath,TinyURL,iFrame,Mouse_Over,Label
0,3,4,3,30,0,0,0,0,0,0,...,0,0,14,16,0,0,0,1,1,0
1,2,3,5,96,4,0,0,0,0,0,...,0,0,10,46,39,0,0,1,1,0
2,2,3,2,36,0,0,0,0,0,0,...,0,0,21,15,0,0,0,1,1,0
3,3,4,2,79,6,1,0,0,0,0,...,0,0,28,15,35,0,0,1,1,0
4,1,2,4,35,1,1,0,0,0,0,...,0,0,10,25,0,0,0,1,1,0


## Phishing URLs:
Now, feature extraction is performed on phishing URLs.

In [112]:
phish_features = []
label = 1
for i in range(0, 5000):
    url = phishurl['url'][i]
    phish_features.append(featureExtraction(url,label))

In [113]:
#converting the list to dataframe
feature_names = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore', 'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash', 'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress', 'HttpsInHostname', 'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath', 'TinyURL', 'iFrame', 'Mouse_Over', 'Label']

phishing = pd.DataFrame(phish_features, columns= feature_names)

In [114]:
phishing.head()

Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IpAddress,HttpsInHostname,HostnameLength,PathLength,QueryLength,DoubleSlashInPath,TinyURL,iFrame,Mouse_Over,Label
0,3,4,3,19,0,0,0,0,0,0,...,1,0,15,4,0,0,0,1,1,1
1,3,4,4,195,1,0,0,1,4,0,...,0,0,24,24,146,0,0,1,1,1
2,3,4,2,29,0,0,0,0,0,0,...,0,0,17,12,0,0,0,1,1,1
3,2,3,5,249,0,0,0,0,0,0,...,0,0,20,73,155,0,0,1,1,1
4,2,3,2,22,0,0,0,0,0,0,...,0,0,13,9,0,0,0,1,1,1


# Final Dataset
In the above section we formed two dataframes of legitimate & phishing URL features. Now, we will combine them to a single dataframe and export the data to csv file for the Machine Learning training done in other notebook.

In [115]:
#Concatenating the dataframes into one 
urldata = pd.concat([legitimate, phishing]).reset_index(drop=True)
urldata.head()

Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,IpAddress,HttpsInHostname,HostnameLength,PathLength,QueryLength,DoubleSlashInPath,TinyURL,iFrame,Mouse_Over,Label
0,3,4,3,30,0,0,0,0,0,0,...,0,0,14,16,0,0,0,1,1,0
1,2,3,5,96,4,0,0,0,0,0,...,0,0,10,46,39,0,0,1,1,0
2,2,3,2,36,0,0,0,0,0,0,...,0,0,21,15,0,0,0,1,1,0
3,3,4,2,79,6,1,0,0,0,0,...,0,0,28,15,35,0,0,1,1,0
4,1,2,4,35,1,1,0,0,0,0,...,0,0,10,25,0,0,0,1,1,0


In [118]:
urldata.shape

(10000, 26)

In [119]:
urldata.to_csv('urldata.csv', index= False)