This notebook extracts the features of the phishing URLs in the phishing.csv file. The extracted features are then stored in the phishing_extracted.csv file. 

In [2]:
import pandas as pd

In [3]:
# importing required packages for this section
from urllib.parse import urlparse, urlencode
import ipaddress
import re
import requests

In [4]:
%pip install python-whois


Collecting python-whois
  Using cached python_whois-0.8.0-py3-none-any.whl
Installing collected packages: python-whois
Successfully installed python-whois-0.8.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [None]:
def getDomain(url):
  """Return the network location of ``url`` without a leading ``www.``.

  Args:
    url: URL string; it needs a scheme (or a leading ``//``) for
      ``urlparse`` to populate ``netloc``.

  Returns:
    The ``netloc`` component with one literal ``www.`` prefix removed.
    An empty string when ``urlparse`` finds no network location.
  """
  domain = urlparse(url).netloc
  # Only strip a literal "www." prefix. The previous regex "^www." also
  # matched e.g. "wwwx", and str.replace removed every "www." occurrence,
  # including ones in the middle of the host name.
  if domain.startswith("www."):
    domain = domain[len("www."):]
  return domain

1.Presence of Special Symbols

In [None]:
def special_symbols(url):
    """Report whether the URL contains an '@' character.

    Returns 1 when '@' occurs anywhere in ``url`` and 0 otherwise.
    """
    return 1 if '@' in url else 0

2.Presence of sub-domains

In [None]:
def check_subdomain(url):
    """Flag URLs with more than three '.' characters.

    The dots are counted over the entire string passed in (path and query
    included), not only the host part.

    Returns 1 when there are more than three dots, otherwise 0.
    """
    dot_total = url.count('.')
    return 1 if dot_total > 3 else 0


3. Looking for '-' or '_' in the domain

In [None]:
def prefixSuffix(url):
    """Check the network location of ``url`` for '-' or '_'.

    Returns 1 (treated as phishing) when either character appears in the
    ``netloc`` produced by ``urlparse``, otherwise 0.
    """
    host = urlparse(url).netloc
    has_separator = any(mark in host for mark in ('-', '_'))
    return int(has_separator)


4. Using an IP address instead of a domain name

In [None]:
def havingIP(url):
    """Report whether the host part of ``url`` is a literal IP address.

    The previous version passed the whole URL (scheme, path, ...) to
    ``ipaddress.ip_address``, which only accepts a bare address, so any
    URL with a scheme always produced 0.  The host is now extracted first.

    Args:
        url: A URL with or without a scheme, or a bare IPv4/IPv6 address.

    Returns:
        1 if the host is a valid IPv4 or IPv6 address, otherwise 0.
    """
    host = url
    if isinstance(url, str):
        # urlparse only fills in the host when the string has a scheme or
        # starts with "//", so add "//" for scheme-less input.
        has_authority = url.startswith("//") or re.match(
            r"[A-Za-z][A-Za-z0-9+.\-]*://", url
        )
        try:
            parsed = urlparse(url if has_authority else "//" + url)
            # hostname drops any port, userinfo and IPv6 brackets; fall
            # back to the raw text (e.g. a bare "::1") when it is empty.
            host = parsed.hostname or url
        except ValueError:
            # urlparse rejects malformed bracketed hosts; test the raw text.
            host = url
    try:
        ipaddress.ip_address(host)
    except ValueError:
        return 0
    return 1

5.Depth of URL

In [None]:
def getDepth(url):
  """Count the non-empty segments of the URL path.

  The path from ``urlparse`` is split on '/', and every segment with at
  least one character adds one to the depth.
  """
  segments = urlparse(url).path.split('/')
  return sum(1 for segment in segments if segment)


6.Redirections in URL

In [None]:
def redirect(url):
    """Detect a '//' that appears after the scheme separator.

    Finds the last occurrence of '//' in ``url`` and returns 1 when it
    starts at an index greater than 7, otherwise 0 (this includes the
    case where '//' is absent).
    """
    last_double_slash = url.rfind("//")
    return int(last_double_slash > 7)


7.URL shortening services

In [None]:
# Regular-expression alternation listing known URL-shortening services.
# Dots in the host names are escaped so they match a literal '.'.
# Several entries (e.g. bit\.ly, goo\.gl, t\.co) appear more than once; the
# duplicates do not change what the pattern matches.
# The pattern itself carries no anchors, and matching is case-sensitive
# unless the caller passes a flag.  Consumed by tinyURL.
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"


In [None]:
def tinyURL(url):
    """Report whether the host of ``url`` is a known URL-shortening service.

    The previous version ran an unanchored ``re.search`` over the whole
    URL, so fragments such as ``t\\.co`` or ``x\\.co`` matched ordinary
    hosts like "microsoft.com" or "linux.com", and also matched text in
    the path or query.  The pattern is now applied to the host name only
    and must cover it completely, optionally after a subdomain prefix.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        1 if the host matches an entry of ``shortening_services``, else 0.
    """
    # urlparse only fills in the host when the string has a scheme or
    # starts with "//", so add "//" for scheme-less input.
    has_authority = url.startswith("//") or re.match(
        r"[A-Za-z][A-Za-z0-9+.\-]*://", url
    )
    try:
        host = urlparse(url if has_authority else "//" + url).hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): no host to test.
        host = ""
    # Allow any subdomain prefix ("www.bit.ly") but require the remainder to
    # be exactly one of the listed services.  Host names are case-insensitive
    # and urlparse lower-cases them, hence IGNORECASE for entries such as
    # "BudURL\.com".
    anchored = r"(?:.*\.)?(?:" + shortening_services + r")"
    return 1 if re.fullmatch(anchored, host, flags=re.IGNORECASE) else 0

8. Number of digits in URL

In [None]:
def DigitCount(url):
  """Return how many characters of ``url`` satisfy ``str.isdigit``."""
  return sum(1 for character in url if character.isdigit())

9.Web Traffic

In [None]:
def web_traffic(url_in):
  """Classify a URL by the site rank scraped from siterankdata.com.

  The registered domain name is obtained with a WHOIS lookup on the URL's
  network location, the page ``https://siterankdata.com/<domain>`` is
  fetched, and the rank is parsed from the text of the fourth ``<meta>`` tag.

  Args:
    url_in: URL string; ``http://`` is prepended when it does not start
      with an http(s) scheme.

  Returns:
    0 when a rank was obtained and it is at most 1,000,000.
    1 when the rank is larger, or when any step (WHOIS, HTTP request,
    parsing) fails.
  """
  try:
    # Test the actual scheme prefix; a substring check would treat
    # "example.com/http" as already having a scheme.
    if not url_in.lower().startswith(('http://', 'https://')):
      url_in = 'http://' + url_in

    domain_name = whois.whois(urlparse(url_in).netloc).domain_name
    if isinstance(domain_name, list):
      # Take the first entry: index 1 raised IndexError for
      # single-element lists, which was then reported as "no traffic".
      domain_name = domain_name[0]
    domain_name = domain_name.lower()
    # Bound the request so one unresponsive lookup cannot stall a long
    # extraction loop indefinitely.
    r = requests.get('https://siterankdata.com/' + domain_name, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    # NOTE(review): the indices below depend on the current markup of the
    # siterankdata.com page; confirm they still select the rank.
    res = str(soup.find_all('meta')[3])

    a = res.split()[7].split('.')[0]
    rank = int(a)
  except Exception:
    # Deliberate best effort: any failure is scored like a low-traffic site.
    return 1
  return 1 if rank > 1000000 else 0


10.Domain age

In [None]:
def domainAge(domain_name):
  """Score a WHOIS record by the span between creation and expiration.

  Each date is normalised on its own, so a record where one field is a
  list and the other a single value (or one is a string and the other a
  datetime) is still evaluated instead of being scored 1 by accident.
  Timezone-aware values are converted to naive UTC so that mixing aware
  and naive datetimes no longer raises ``TypeError``.

  Args:
    domain_name: Object exposing ``creation_date`` and ``expiration_date``
      attributes; each may be ``None``, a ``"%Y-%m-%d"`` string, a datetime,
      or a list of those (the first element is used).

  Returns:
    1 when either date is missing or unusable, or when the span is shorter
    than 6 periods of 30 days; otherwise 0.
  """
  def _normalise(value):
    # Reduce one WHOIS date field to a naive datetime, or None if unusable.
    if isinstance(value, (list, tuple)):
      value = value[0] if value else None
    if isinstance(value, str):
      try:
        value = datetime.strptime(value, "%Y-%m-%d")
      except ValueError:
        return None
    if value is None:
      return None
    offset = value.utcoffset() if hasattr(value, "utcoffset") else None
    if offset is not None:
      value = (value - offset).replace(tzinfo=None)
    return value

  creation_date = _normalise(domain_name.creation_date)
  expiration_date = _normalise(domain_name.expiration_date)
  if (expiration_date is None) or (creation_date is None):
    return 1
  try:
    ageofdomain = abs((expiration_date - creation_date).days)
  except (TypeError, AttributeError):
    # Values that cannot be subtracted are treated as missing data.
    return 1
  return 1 if (ageofdomain / 30) < 6 else 0


Processing the URLs

In [None]:
#Function to extract features
def featureExtraction(url,label):
  """Build the feature row for one URL.

  Args:
    url: URL string; ``http://`` is prepended when it does not start with
      an http(s) scheme.
    label: Class label stored as the last element of the row.

  Returns:
    A list of 12 values in this order: domain, web-traffic flag, '@' flag,
    sub-domain flag, prefix/suffix flag, IP flag, URL depth, redirection
    flag, shortener flag, digit count, domain-age flag, label.
  """
  # Test the real scheme prefix. The earlier substring test skipped the
  # prefix for any URL that merely contained "http" somewhere, leaving
  # urlparse without a network location.
  if not url.lower().startswith(('http://', 'https://')):
    url='http://'+url
  v = web_traffic(url)
  dns = 0
  try:
    domain_name = whois.whois(urlparse(url).netloc)
  except Exception:
    # A failed WHOIS lookup is recorded and scored as 1 for domain age.
    dns = 1

  features = []
  # Domain plus ten URL/domain-based features; the label is appended last
  features.append(getDomain(url))
  features.append(v)
  features.append(special_symbols(url))
  features.append(check_subdomain(url))
  features.append(prefixSuffix(url))
  features.append(havingIP(url))
  features.append(getDepth(url))
  features.append(redirect(url))
  features.append(tinyURL(url))
  features.append(DigitCount(url))
  # domain_name is only bound when the WHOIS lookup succeeded
  features.append(1 if dns == 1 else domainAge(domain_name))
  features.append(label)

  return features

Phishing URLs

In [None]:
# Extracting the features & storing them in a list
phishurl = pd.read_csv('phishing.csv')
phish_features = []
label = 1  # every URL in phishing.csv is labelled as phishing
MAX_URLS = 10000  # upper bound on the number of rows processed
# Iterate positionally over at most MAX_URLS rows. The previous
# range(0, 10000) with label-based indexing raised KeyError when the CSV
# had fewer rows or a non-default index.
for i, url in enumerate(phishurl['url'].head(MAX_URLS)):
  phish_features.append(featureExtraction(url, label))
  print(i)


9999


In [None]:
# Turn the collected feature rows into a DataFrame with named columns
feature_names = [
    'Domain',
    'Web_Traffic',
    'Have_At',
    'SubDomain',
    'Prefix/Suffix',
    'Have_IP',
    'URL_Depth',
    'Redirection',
    'TinyURL',
    'Digit_count',
    'Domain_Age',
    'Label',
]

phishing = pd.DataFrame(data=phish_features, columns=feature_names)
phishing.head()

Unnamed: 0,Domain,Web_Traffic,Have_At,SubDomain,Prefix/Suffix,Have_IP,URL_Depth,Redirection,TinyURL,Digit_count,Domain_Age,Label
0,rodriguezq.cf,1,0,0,0,0,1,0,0,0,1,1
1,dev-juyjnuytrhbrg.pantheonsite.io,1,0,0,1,0,0,0,0,0,1,1
2,dev-juyjnuytrhbrg.pantheonsite.io,1,0,0,1,0,0,0,0,0,1,1
3,reactivaciononlineysegura.seguridadonlin.repl.co,1,0,0,0,0,0,0,0,0,1,1
4,reactivaciononlineysegura--seguridadonlin.repl.co,1,0,0,1,0,0,0,0,0,1,1


In [None]:
# Persist the extracted features without writing the DataFrame index
phishing.to_csv(path_or_buf='phishing_extracted.csv', index=False)