# Phishing Website Detection Feature Extraction

The objective of this notebook is to extract selected features from the URLs.

In [12]:
#importing the required packages
import pandas as pd 

### Phishing URLs

In [8]:
# Load the phishing URL dataset into a DataFrame.
# The preview below shows an "Unnamed: 0" column next to URL and Label
# (presumably an index column saved with the CSV -- TODO confirm).
import pandas as pd
df1 = pd.read_csv(r'../data/phishing_urls_48K.csv')
df1.head()  # preview the first five rows

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [9]:
# Draw a reproducible random sample of 10,000 phishing URLs (seed 22) and
# renumber the rows from 0 so the sample has a clean positional index.
phish_url = (
    df1
    .sample(n=10000, random_state=22)
    .copy()
    .reset_index(drop=True)
)
phish_url.head()

Unnamed: 0,URL,Label
0,center-paypal-resolution-com.cgi-bin.dispatch9...,bad
1,bit.ly/SWeTMu,bad
2,www.ekriswebshop.nl/js/?cgi.ebay.ebay.co.uk?ro...,bad
3,www.itaupersonnakite.com,bad
4,esxcc.com/js/index.htm?us.battle.net/login/en/...,bad


### Legitimate URLs

In [10]:
# Load the legitimate URLs into a DataFrame.
# The preview below shows the columns "Unnamed: 0", "index" and "URL".
df2 = pd.read_csv(r'../data/top-1m.csv')
df2.head()  # preview the first five rows

Unnamed: 0,index,URL
0,1,google.com
1,2,youtube.com
2,3,facebook.com
3,4,baidu.com
4,5,wikipedia.org


In [13]:
# Draw a reproducible random sample of 10,000 legitimate URLs (seed 22) and
# renumber the rows from 0 so the sample has a clean positional index.
legit_url = (
    df2
    .sample(n=10000, random_state=22)
    .copy()
    .reset_index(drop=True)
)
legit_url.head()

Unnamed: 0,index,URL
0,112005,whatismyscreenresolution.com
1,780467,str8guysecrets.tumblr.com
2,731929,amalric.fr
3,914379,keplergrp.com
4,880257,i24.cc


### Feature Extraction:
The extracted features are categorized into

1. Address Bar based Features
2. Domain based Features
3. HTML & Javascript based Features

### Address Bar Based Features:

In [14]:
# importing required packages for this section
from urllib.parse import urlparse, urlencode
import ipaddress
import re

In [15]:
# Domain of the Url (Domain)
def getDomainUrl(url):
    """Return the network location of ``url`` without a leading ``www.``.

    Parameters
    ----------
    url : str
        URL to inspect.

    Returns
    -------
    str
        ``urlparse(url).netloc`` with one leading ``"www."`` removed.
        ``urlparse`` only recognises a netloc that is introduced by ``//``,
        so a URL written without a scheme (e.g. ``"example.com/a"``) yields
        an empty string.
    """
    domain = urlparse(url).netloc
    # Strip only the leading prefix: str.replace would also delete any later
    # "www." inside the host (e.g. "www.awww.com" would become "acom").
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain

In [16]:
# check for having ip add. in url (Have_IP)
def checkIpAdd(url):
    """Return 1 when ``url`` is, or is hosted on, a literal IP address.

    The original check passed the whole URL to ``ipaddress.ip_address``, which
    rejects anything carrying a scheme, port or path, so real URLs such as
    ``http://192.168.1.1/login`` were never flagged. The host component is
    now extracted and tested as well.

    Parameters
    ----------
    url : str
        Full URL, scheme-less URL, or bare address.

    Returns
    -------
    int
        1 if the raw value or its host parses as an IPv4/IPv6 address, else 0.
    """
    # Keep the raw value as a candidate so every input the old code accepted
    # (a bare address) is still accepted.
    candidates = [url]
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # No "//" introducer (bare host or scheme-less URL): re-parse so
            # the leading component is treated as the network location.
            parsed = urlparse("//" + url)
        if parsed.hostname:
            candidates.append(parsed.hostname)
    except (ValueError, TypeError, AttributeError):
        # Malformed or non-string input: fall back to the raw value only.
        pass

    for candidate in candidates:
        try:
            ipaddress.ip_address(candidate)
            return 1
        except ValueError:
            continue
    return 0

In [17]:
# check the presence of @ in url (Have_At)
def haveAtSign(url):
    """Flag URLs that contain an "@" character.

    Returns 1 when "@" occurs anywhere in ``url`` and 0 otherwise.
    """
    return 1 if "@" in url else 0

In [25]:
# check the length of the url; a url longer than 75 characters is considered phishing (Url_Length)
def getLength(url):
    """Flag long URLs.

    Returns 1 when ``url`` has more than 75 characters and 0 otherwise
    (a URL of exactly 75 characters is not flagged).
    """
    is_long = len(url) > 75
    return int(is_long)

In [26]:
# check for url depth (Url_Depth)
def getUrlDepth(url):
    """Count the non-empty segments in the path of ``url``.

    The previous loop iterated over ``len(s)`` (an int), which raises
    ``TypeError`` on every call; the segments are now iterated directly.

    Parameters
    ----------
    url : str
        URL whose path depth is wanted.

    Returns
    -------
    int
        Number of non-empty pieces obtained by splitting
        ``urlparse(url).path`` on "/". Empty pieces produced by leading,
        trailing or doubled slashes are not counted.
    """
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)

In [28]:
# check for url redirection "//" in the url (Redirection)
def redirection(url):
    """Flag a "//" that occurs beyond the scheme separator.

    Looks up the index of the last "//" in ``url`` and returns 1 when that
    index is greater than 7, otherwise 0. The "//" of an "http://" or
    "https://" prefix sits at index 5 or 6 and therefore never triggers
    the flag on its own.
    """
    last_double_slash = url.rfind('//')
    return 1 if last_double_slash > 7 else 0

In [30]:
# check for existence of 'HTTPS' in domain (https_domain)
def httpsDomain(url):
    """Flag URLs whose network location contains the text "https".

    The original called the misspelled ``urlpase`` and raised ``NameError``
    on every call.

    Parameters
    ----------
    url : str
        URL to inspect.

    Returns
    -------
    int
        1 if "https" occurs in ``urlparse(url).netloc``, else 0. The scheme
        itself is not part of the netloc, so a plain "https://" URL is not
        flagged.
    """
    domain = urlparse(url).netloc
    return 1 if 'https' in domain else 0

In [31]:
#listing shortening services
# Regex alternation of URL-shortener host names. Entries are joined with "|"
# in the order listed; repeated entries are kept so the resulting pattern
# string is unchanged.
shortening_services = "|".join((
    r"bit\.ly", r"goo\.gl", r"shorte\.st", r"go2l\.ink", r"x\.co", r"ow\.ly",
    r"t\.co", r"tinyurl", r"tr\.im", r"is\.gd", r"cli\.gs",
    r"yfrog\.com", r"migre\.me", r"ff\.im", r"tiny\.cc", r"url4\.eu",
    r"twit\.ac", r"su\.pr", r"twurl\.nl", r"snipurl\.com",
    r"short\.to", r"BudURL\.com", r"ping\.fm", r"post\.ly", r"Just\.as",
    r"bkite\.com", r"snipr\.com", r"fic\.kr", r"loopt\.us",
    r"doiop\.com", r"short\.ie", r"kl\.am", r"wp\.me", r"rubyurl\.com",
    r"om\.ly", r"to\.ly", r"bit\.do", r"t\.co", r"lnkd\.in", r"db\.tt",
    r"qr\.ae", r"adf\.ly", r"goo\.gl", r"bitly\.com", r"cur\.lv",
    r"tinyurl\.com", r"ow\.ly", r"bit\.ly", r"ity\.im", r"q\.gs", r"is\.gd",
    r"po\.st", r"bc\.vc", r"twitthis\.com", r"u\.to", r"j\.mp",
    r"buzurl\.com", r"cutt\.us", r"u\.bb", r"yourls\.org", r"x\.co",
    r"prettylinkpro\.com", r"scrnch\.me", r"filoops\.info", r"vzturl\.com",
    r"qr\.net", r"1url\.com", r"tweez\.me", r"v\.gd",
    r"tr\.im", r"link\.zip\.net",
))

In [32]:
# check for a URL shortening service (Tiny_Url)
def tinyUrl(url):
    """Return 1 when ``url`` points at a known URL-shortening service.

    The shortener names in ``shortening_services`` were previously searched
    as bare substrings, so ordinary hosts that merely contain one of them
    (``t.co`` inside ``microsoft.com`` or ``reddit.com``) were flagged. Each
    name must now sit on host-label boundaries.

    Parameters
    ----------
    url : str
        URL to inspect, with or without a scheme.

    Returns
    -------
    int
        1 if a shortener name is found on label boundaries, else 0.
    """
    # Left boundary: start of string or one of "/", ".", "@" (scheme slashes,
    # a sub-domain dot, or userinfo). Right boundary: end of string or the
    # start of a path, port, query or fragment.
    bounded_pattern = r"(?:^|[/.@])(?:" + shortening_services + r")(?=[/:?#]|$)"
    # Host names are case-insensitive, so match without regard to case.
    found = re.search(bounded_pattern, url, flags=re.IGNORECASE)
    return 1 if found else 0

In [1]:
# check for prefix and suffix with this '-' symbol (Prefix/Suffix)
def prefixSuffix(url):
    """Flag a hyphen in the network location of ``url``.

    Returns 1 when ``urlparse(url).netloc`` contains "-" and 0 otherwise;
    hyphens elsewhere in the URL (path, query) are ignored.
    """
    host = urlparse(url).netloc
    return int('-' in host)

### Domain Based feature extraction

In [18]:
%pip install python-whois

Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [19]:
# importing required packages for this section
import socket
import whois
import datetime

In [20]:
# check for dns record 
def has_dns_record(domain):
    """Return 1 if ``domain`` resolves via ``socket.gethostbyname``, else 0.

    Parameters
    ----------
    domain : str
        Host name to resolve.

    Returns
    -------
    int
        1 when the lookup succeeds; 0 when it fails for any ordinary reason
        (unresolvable name, malformed or non-string input, ...).
    """
    try:
        socket.gethostbyname(domain)
        return 1
    except Exception:
        # Best-effort: every lookup error counts as "no record". Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # through, so a long batch of lookups can still be interrupted.
        return 0

In [5]:
# check for domain age
def domain_age(domain):
    """Return 0 if ``domain`` was registered more than 365 days ago, else 1.

    Parameters
    ----------
    domain : str
        Domain name passed to ``whois.whois``.

    Returns
    -------
    int
        0 when the WHOIS creation date is more than 365 days in the past;
        1 when the domain is younger, the creation date is missing or
        unusable, or the lookup fails.
    """
    try:
        creation_date = whois.whois(domain).creation_date
        # python-whois may report several creation dates as a list; use the
        # first datetime entry instead of failing on "datetime - list".
        if isinstance(creation_date, (list, tuple)):
            creation_date = next(
                (d for d in creation_date if isinstance(d, datetime.datetime)),
                None,
            )
        if not isinstance(creation_date, datetime.datetime):
            # Missing or unparsed date: treat as suspicious.
            return 1
        # Build "now" with the same tz-awareness as the WHOIS value, since
        # naive and aware datetimes cannot be subtracted from one another.
        today = datetime.datetime.now(creation_date.tzinfo)
        num_days = (today - creation_date).days
        return 0 if num_days > 365 else 1
    except Exception:
        # Best-effort: any lookup/parsing failure counts as suspicious, but
        # KeyboardInterrupt is no longer swallowed.
        return 1

In [8]:
# check for domain expiration date
def domain_exp_date(domain):
    """Return 0 if ``domain`` expires more than 183 days from now, else 1.

    The original compared a ``timedelta`` with the integer 183, which raises
    ``TypeError``; the bare ``except`` hid it, so the function returned 1 for
    every domain. The remaining time is now compared in whole days.

    Parameters
    ----------
    domain : str
        Domain name passed to ``whois.whois``.

    Returns
    -------
    int
        0 when the WHOIS expiration date is more than 183 days away;
        1 when it is closer, missing or unusable, or the lookup fails.
    """
    try:
        exp_date = whois.whois(domain).expiration_date
        # python-whois may report several expiration dates as a list; use
        # the first datetime entry.
        if isinstance(exp_date, (list, tuple)):
            exp_date = next(
                (d for d in exp_date if isinstance(d, datetime.datetime)),
                None,
            )
        if not isinstance(exp_date, datetime.datetime):
            # Missing or unparsed date: treat as suspicious.
            return 1
        # Build "now" with the same tz-awareness as the WHOIS value, since
        # naive and aware datetimes cannot be subtracted from one another.
        today = datetime.datetime.now(exp_date.tzinfo)
        days_left = (exp_date - today).days
        return 0 if days_left > 183 else 1
    except Exception:
        # Best-effort: any lookup/parsing failure counts as suspicious, but
        # KeyboardInterrupt is no longer swallowed.
        return 1

### HTML and javascript based feature extraction

In [1]:
# importing required package for this 
import requests
from bs4 import BeautifulSoup

In [21]:
# check for html iframe
def iframe(response):
    """Flag pages that contain an ``<iframe>`` with no text content.

    ``response`` is either the empty string (which yields 1) or an object
    exposing the page markup as ``.text`` (presumably a requests response --
    TODO confirm against the caller).

    Returns 1 when ``response`` is "" or when at least one ``<iframe>`` tag
    has only whitespace as text; returns 0 when the page has no iframes or
    every iframe carries text.
    """
    if response == "":
        return 1

    document = BeautifulSoup(response.text, 'html.parser')
    has_blank_frame = any(
        not frame.text.strip() for frame in document.find_all("iframe")
    )
    return 1 if has_blank_frame else 0

In [22]:
# check number of time website redirects to another (website forwarding)
def forwarding(response):
    """Flag responses that went through more than two redirects.

    Returns 1 when ``response`` is the empty string or when
    ``response.history`` holds more than two entries; otherwise 0.
    """
    if response == "":
        return 1
    hop_count = len(response.history)
    return int(hop_count > 2)

In [23]:
# check for alert script
def check_alert_script(response):
    """Flag pages with a ``<script>`` whose content contains ``alert(``.

    ``response`` is either the empty string (which yields 1) or an object
    exposing the page markup as ``.text``.

    Returns 1 when ``response`` is "" or when any ``<script>`` tag has a
    non-empty ``.string`` containing the text "alert("; otherwise 0.
    """
    if response == "":
        return 1

    document = BeautifulSoup(response.text, 'html.parser')
    uses_alert = any(
        tag.string and 'alert(' in tag.string
        for tag in document.find_all("script")
    )
    return 1 if uses_alert else 0