In [2]:
import re
from urllib.parse import urlparse
import pandas as pd
import requests
import urllib.request
from bs4 import BeautifulSoup
from urllib.error import HTTPError
import whois
from datetime import datetime

-1 - Malicious
0 - suspicious
1 - legitimate

In [3]:
def get_scheme(url):
    return urlparse(url).scheme

def get_domain(url):
    return urlparse(url).netloc

def get_host_name(url):
    return urlparse(url).hostname

def get_port(url):
    return urlparse(url).port

def get_path(url):
    return urlparse(url).path

def get_domain_details(url):
    try:
        return whois.whois(get_domain(url))
    except Exception:
        return None

### Lexical features

In [4]:
def has_ip(url):
    ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b|\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
    return 1 if re.findall(ip_pattern, url) else 0

In [5]:
def has_at(url):
    return 1 if '@' in url else 0

In [6]:
def has_redirection(url):
    return 1 if '//' in get_path(url) else 0

In [7]:
def has_hypen(url):
    return 1 if '_' in url else 0

In [8]:
def shortining_service(url):
    services = ('bit.ly', 'goo.gl', 'shorte.st', 'go2l.ink', 'x.co', 'ow.ly', 't.co', 'tinyurl', 'tr.im', 'is.gd', 'cli.gs', ''
                    'yfrog.com', 'migre.me', 'ff.im', 'tiny.cc', 'url4.eu', 'twit.ac', 'su.pr', 'twurl.nl', 'snipurl.com', ''
                    'short.to', 'BudURL.com', 'ping.fm', 'post.ly', 'Just.as', 'bkite.com', 'snipr.com', 'fic.kr', 'loopt.us', ''
                    'doiop.com', 'short.ie', 'kl.am', 'wp.me', 'rubyurl.com', 'om.ly', 'to.ly', 'bit.do', 't.co', 'lnkd.in', ''
                    'db.tt', 'qr.ae', 'adf.ly', 'goo.gl', 'bitly.com', 'cur.lv', 'tinyurl.com', 'ow.ly', 'bit.ly', 'ity.im', ''
                    'q.gs', 'is.gd', 'po.st', 'bc.vc', 'twitthis.com', 'u.to', 'j.mp', 'buzurl.com', 'cutt.us', 'u.bb', 'yourls.org', ''
                    'x.co', 'prettylinkpro.com', 'scrnch.me', 'filoops.info', 'vzturl.com', 'qr.net', '1url.com', 'tweez.me', 'v.gd', 'tr.im', 'link.zip.net')
    
    is_service_exsist = [True if service in url else False for service in services]
    return 1 if any(is_service_exsist) else 0

In [9]:
def has_non_standard_ports(url):
    standard_ports = [21, 22, 23, 80, 443, 445, 1433, 1521, 3306, 3389]
    return 1 if get_port(url) not in standard_ports else 0

In [10]:
def has_suffix_prefix(url):
    return 1 if '-' in get_host_name(url) else 0

In [11]:
def has_subdomains(url):
    domain = get_host_name(url)
    if (not has_ip(url) and domain.count('.') > 2):
        return 1
    if domain.count('.') == 2:
        return 0
    return -1

In [12]:
def is_https(url):
    return 1 if 'https' == get_scheme(url) else 0

In [13]:
def url_length_long(url):
    if len(url) > 70:
        return 1
    if len(url) < 70 and len(url) >=51:
        return 0
    return -1

In [14]:
def is_active(url):
    try:
        response = requests.get(url)
        return 1 if response.status_code == 200 else 0
    except Exception as error:
        print(f"While processing {url=} got the {error=}")
        return 0
    # return 1 if state.strip() == 'Active' else 0

#### Dynamic features

In [15]:
def web_traffic(url):
    try:
        rank = BeautifulSoup(urllib.request.urlopen(f"http://data.alexa.com/data?cli=10&dat=s&{url=}").read(), "html.parser").find("REACH")['RANK']
        rank= int(rank)
        if not (rank<100000):
            return -1
        return 0
    except HTTPError:
        return -1
    except Exception:
        return 1

In [16]:
def has_dns_record(url):
    dns_detail = get_domain_details(url)
    return 0 if dns_detail else 1

In [17]:
def is_sixmonth_old_dns(url):
    dns_details = get_domain_details(url)
    if not dns_details:
        return 1
    created_time = dns_details.creation_date
    expiry_date = dns_details.expiration_date

    if not created_time or not expiry_date:
        return 1
    
    if not isinstance(created_time, str) or not isinstance(expiry_date, str) or isinstance(created_time, list) or isinstance(expiry_date, list):
        return -1
    
    created_time = datetime.strptime(created_time, '%Y-%m-%d')
    expiry_date = datetime.strptime(expiry_date, '%Y-%m-%d')
    age = abs(expiry_date - created_time).days / 30
    return 1 if age < 6 else 0

In [32]:
def gen_features(df):
    funcs = [has_ip, has_at, has_hypen, has_redirection, has_non_standard_ports, has_subdomains, has_suffix_prefix, is_https, url_length_long, shortining_service, 
             is_active, web_traffic, has_dns_record, is_sixmonth_old_dns]
    for func in funcs:
        df[func.__name__] = df.url.apply(func)
    return df

### Generate features

In [33]:
phish_df = pd.read_csv('../Data collection/phish_urls.csv')
phish_df['is_phish'] = [1 for i in range(phish_df.shape[0])]
phish_df.drop(['phish_id_url', 'active_state'], axis=1, inplace=True)
phish_df.dropna(inplace=True)
phish_df.head()

Unnamed: 0,url,is_phish
0,https://nghereviewer.com/,1
1,http://ctt.entregapt.com,1
2,https://spk-neu-app.com/de/sp,1
3,https://www.autoatendimentosafra.shop/atendime...,1
4,https://autoatendimentosafra.shop/atendimento....,1


In [34]:
phish_df = gen_features(phish_df.iloc[:20, :])
phish_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

While processing url='http://ctt.entregapt.com' got the error=ConnectionError(MaxRetryError("HTTPConnectionPool(host='ctt.entregapt.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000029F09216FB0>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))"))
While processing url='https://spk-neu-app.com/de/sp' got the error=ConnectionError(MaxRetryError("HTTPSConnectionPool(host='spk-neu-app.com', port=443): Max retries exceeded with url: /de/sp (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000029F09216FB0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))"))
While processing url='https://www.autoatendimentosafra.shop/atendimento.html' got the error=ConnectionError(MaxRetryError("HTTPSConnectionPool(host='www.autoatendimentosafra.shop', port=443): Max retries exceeded with url: /atendimento.html (Caused by NewConnectionError('<urllib

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Unnamed: 0,url,is_phish,has_ip,has_at,has_hypen,has_redirection,has_non_standard_ports,has_subdomains,has_suffix_prefix,is_https,url_length_long,shortining_service,is_active,web_traffic,has_dns_record,is_sixmonth_old_dns
0,https://nghereviewer.com/,1,0,0,0,0,1,-1,0,1,-1,0,1,1,0,-1
1,http://ctt.entregapt.com,1,0,0,0,0,1,0,0,0,-1,1,0,1,0,-1
2,https://spk-neu-app.com/de/sp,1,0,0,0,0,1,-1,1,1,-1,0,0,1,0,-1
3,https://www.autoatendimentosafra.shop/atendime...,1,0,0,0,0,1,0,0,1,0,0,0,1,0,-1
4,https://autoatendimentosafra.shop/atendimento....,1,0,0,0,0,1,-1,0,1,-1,0,0,1,0,-1


In [35]:
legitimate_df = pd.read_csv('../Data collection/legitimate_url.csv')
legitimate_df['is_phish'] = [0 for i in range(legitimate_df.shape[0])]
legitimate_df.head()

Unnamed: 0,url,is_phish
0,https://maps.google.com/maps?q=sports&num=1000...,0
1,https://sports.yahoo.com/&sa=U&ved=2ahUKEwjnla...,0
2,https://sports.yahoo.com/nfl/&sa=U&ved=2ahUKEw...,0
3,https://sports.yahoo.com/mlb/&sa=U&ved=2ahUKEw...,0
4,https://sports.yahoo.com/nba/&sa=U&ved=2ahUKEw...,0


In [40]:
legitimate_df = gen_features(legitimate_df.iloc[:20, :])
legitimate_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[func.__name__] = df.url.apply(func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Unnamed: 0,url,is_phish,has_ip,has_at,has_hypen,has_redirection,has_non_standard_ports,has_subdomains,has_suffix_prefix,is_https,url_length_long,shortining_service,is_active,web_traffic,has_dns_record,is_sixmonth_old_dns
0,https://maps.google.com/maps?q=sports&num=1000...,0,0,0,1,0,1,0,0,1,1,0,1,1,0,-1
1,https://sports.yahoo.com/&sa=U&ved=2ahUKEwjnla...,0,0,0,0,0,1,0,0,1,1,0,1,1,0,-1
2,https://sports.yahoo.com/nfl/&sa=U&ved=2ahUKEw...,0,0,0,0,0,1,0,0,1,1,0,1,1,0,-1
3,https://sports.yahoo.com/mlb/&sa=U&ved=2ahUKEw...,0,0,0,1,0,1,0,0,1,1,0,1,1,0,-1
4,https://sports.yahoo.com/nba/&sa=U&ved=2ahUKEw...,0,0,0,0,0,1,0,0,1,1,0,1,1,0,-1


In [43]:
phish_df.to_csv('phish_urls_with_feature_20.csv', index=False)
legitimate_df.to_csv('legitimate_urls_with_feature_20.csv', index=False)

In [44]:
df = pd.concat([legitimate_df, phish_df])
df.to_csv('full_data_with_feature_40.csv', index=False)