# Malicious URL detection based on lexical features

    A URL can be described by several types of features: blacklist, lexical, host-based, content-based, etc. Among these, lexical features are the most efficient and economical to generate, so lexical features are what we concentrate on in this page.

## Extracting Lexical Features
    Traditional lexical features include statistical properties of the URL string, such as the length of the URL, the length of each component of the URL (hostname, top-level domain, primary domain, etc.), the number of special characters, etc. We can also construct a dictionary from the segments delimited by special characters (e.g. "/", ".", "?", "=", etc.), the so-called bag-of-words model. In order to detect algorithmically generated malicious URLs, character-level features are also required; the most common way is to compare alpha-numeric distributions, using measures like KL-divergence, Jaccard coefficient, and edit distance over unigram and bigram distributions of characters.
    Heuristic features are designed with the objective of being obfuscation resistant, and can generally be classified into five categories: URL-related features (keywords, length, etc.), domain features (length of the domain name, whether an IP address is used as the domain name, etc.), directory-related features (length of the directory, number of subdirectory tokens, etc.), file name features (length of the filename, number of delimiters, etc.), and argument features (length of the arguments, number of variables, etc.).

In [1]:
#!/usr/bin/env python
# encoding: utf-8
import urllib
import string
from urllib.parse import urlparse

################## 辅助URL处理 #################
def normalize(url):
    """Return ``url`` in a form from which ``urlparse`` can extract the host.

    ``urlparse`` only fills in ``netloc`` when the authority part is
    introduced by ``//``.  URLs that already start with ``//`` or with
    ``<scheme>://`` are returned unchanged; everything else gets ``//``
    prepended.

    Only a ``//`` that opens the URL (directly or right after the scheme's
    ``:``) counts as a header.  A ``//`` that appears later, e.g. in
    ``a.com/r?u=http://b.com`` or ``a.com/x//y``, does not stop the prefix
    from being added.
    """
    if url.startswith('//'):
        return url
    first_slash = url.find('/')
    # A scheme header looks like "<scheme>://": the first '/' in the URL is
    # preceded by ':' and immediately followed by a second '/'.
    has_scheme = (
        first_slash > 0
        and url[first_slash - 1] == ':'
        and url.startswith('//', first_slash)
    )
    return url if has_scheme else '//' + url


#获取自//之后的内容
def removeURLHeader(url):
    """Return ``url`` without its leading ``//`` or ``<scheme>://`` header.

    Only a header at the very start of the URL is removed.  A ``//`` that
    occurs later (in the path or query string) is left alone, so the host
    is never cut off from a scheme-less URL such as ``a.com/x//y``.
    URLs without a header are returned unchanged.
    """
    if url.startswith('//'):
        return url[2:]
    first_slash = url.find('/')
    # "<scheme>://": the first '/' follows a ':' and is doubled.
    if (first_slash > 0
            and url[first_slash - 1] == ':'
            and url.startswith('//', first_slash)):
        return url[first_slash + 2:]
    return url

################## URL统计特征 #################

#获取URL长度
def getLength(url):
    """Return the number of characters in ``url``."""
    total_chars = len(url)
    return total_chars

#是否具有@符合
def aite(url):
    """Return 1 if ``url`` contains an ``@`` character, otherwise 0."""
    if '@' in url:
        return 1
    return 0

#含数字个数
def getDigitsCount(url):
    """Return how many characters of ``url`` are ASCII digits (0-9)."""
    return sum(1 for ch in url if ch in string.digits)

#大写字母个数
def getCountUpcase(url):
    """Return how many characters of ``url`` are ASCII upper-case letters."""
    upper_flags = [ch in string.ascii_uppercase for ch in url]
    return upper_flags.count(True)

#前缀个数
def getPrefixCount(url):
    """Return the total number of ``_`` and ``-`` characters in ``url``."""
    underscores = url.count('_')
    hyphens = url.count('-')
    return underscores + hyphens

#URL中数字-字符转换频次
def ZhuanHuanPingci(url):
    """Count digit<->letter switches between adjacent characters.

    The URL header is first stripped with ``removeURLHeader``.  Every pair
    of neighbouring characters in the remainder that is either
    (ASCII digit, ASCII letter) or (ASCII letter, ASCII digit) adds one to
    the result.
    """
    body = removeURLHeader(url)
    transitions = 0
    # Walk over each character together with its right-hand neighbour.
    for current, following in zip(body, body[1:]):
        digit_then_letter = (current in string.digits
                             and following in string.ascii_letters)
        letter_then_digit = (current in string.ascii_letters
                             and following in string.digits)
        if digit_then_letter or letter_then_digit:
            transitions += 1
    return transitions



################## URL启发式特征 #################

##### 辅助解析 #####
#文件名
def getFilename(url):
    """Return the part of ``url`` that follows its first ``/``.

    When ``url`` contains no ``/`` the whole string is returned unchanged.
    No header stripping is done here, so for a URL with a scheme the first
    ``/`` is the one right after the scheme's ``:``.
    """
    _, slash, remainder = url.partition('/')
    return remainder if slash else url

#二级域名
def getUrlSubDomain(url):
    """Return the last two dot-separated labels of the URL's host.

    The host is whatever ``hostname`` returns; if it has fewer than two
    labels it is returned as is.
    """
    labels = hostname(url).rsplit('.', 2)
    return '.'.join(labels[-2:])

#提取主机名
def hostname(url):
    """Return the ``netloc`` component that ``urlparse`` finds in ``url``.

    The URL is passed through ``normalize`` first so that scheme-less URLs
    still yield a network location.
    """
    return urlparse(normalize(url)).netloc


#获取目录路径
def getDirectory(url):
    """Return the text between the first and the last ``/`` of the URL.

    The header is removed with ``removeURLHeader`` before searching.  An
    empty string is returned when the remainder contains no ``/`` (and
    also when it contains exactly one, because the slice is then empty).
    """
    stripped = removeURLHeader(url)
    first_slash = stripped.find('/')
    if first_slash == -1:
        return ""
    last_slash = stripped.rfind('/')
    return stripped[first_slash + 1:last_slash]


############# 启发式特征提取 #############


### URL-related Features ###

#是否含有敏感词
def sensitiveword(url):
    """Return 1 if ``url`` contains any of the sensitive keywords, else 0.

    The check is a case-sensitive substring match against a fixed list of
    words.
    """
    keywords = ('secure', 'account', 'webscr', 'login', 'ebayiaphi', 'signin', 'banking', 'confirm')
    return 1 if any(word in url for word in keywords) else 0

#是否含有关键词
def targetword(url):
    """Return 1 if the URL's two-label domain is in the known-domain list.

    The domain is obtained from ``getUrlSubDomain`` and compared for exact
    equality with each entry of a fixed list; 0 is returned when none
    matches.
    """
    known_domains = {
        'paypal.com', 'aol.com', 'qq.com', 'made-in-china.com', 'google.com',
        'facebook.com', 'yahoo.com', 'live.com', 'dropbox.com',
        'wellsfargo.com', 'cmr.no', 'academia.edu', 'regions.com',
        'shrinkthislink.com', 'maximumasp.com', 'popularenlinea.com',
        'readydecks.com', 'meezanbank.com', 'vencorex.com', 'ketthealth.com',
        'obhrmanager.com', 'bluehost.com', 'msubillings.edu', 'genxgame.com',
        'gripeezoffer.com', 'bek-intern.de', 'ebay.com', 'chase.com',
        'revoluza.com', 'dhl.com', 'flexispy.com', 'att.com', 'uwsp.edu',
        'match.com', 'alnoorhospital.com', 'ourtime.com',
    }
    domain = getUrlSubDomain(url)
    return int(domain in known_domains)



### Domain-related Features ###

#是否含有错误端口
def mistakePort(url):
    """Return 1 if the host returned by ``hostname`` contains ``:``, else 0."""
    host = hostname(url)
    if ':' in host:
        return 1
    return 0

#主机名项数
def termcout(url):
    """Return the number of dot-separated terms in the URL's host.

    A host without any ``.`` (including an empty host) counts as one term.
    """
    host = hostname(url)
    # n dots separate n + 1 terms.
    return host.count('.') + 1



### Path-related Features ###

#是否路径中含有域名
def pathhasdomin(url):
    """Return 1 if the URL's host also appears inside its path, else 0.

    The URL is normalised with ``normalize`` and split with ``urlparse``;
    the feature is set when the ``netloc`` string occurs as a substring of
    the ``path`` string.

    An empty ``netloc`` yields 0.  Without this guard the substring test
    would always succeed, because the empty string is contained in every
    string, and the feature would be reported for URLs that have no host
    at all.
    """
    parsed = urlparse(normalize(url))
    host = parsed.netloc
    if not host:
        return 0
    return 1 if host in parsed.path else 0

### Directory-related Features ###

#是(1)否(0)
def brandname(url):
    """Return 1 if the URL's directory part contains a brand name, else 0.

    The directory part comes from ``getDirectory``.  Each entry of the
    fixed brand list is tested as a case-sensitive substring of it.
    """
    directory = getDirectory(url)
    brands = (
        '53.com', 'Chase', 'Microsoft', 'ANZ', 'Citibank', 'Paypal', 'AOL',
        'eBay', 'USBank', 'Banamex', 'E-Gold', 'Visa', 'Bankofamerica',
        'Google', 'Warcraft', 'Barclays', 'HSBC', 'Westpac', 'battle.net',
        'LIoyds', 'Yahoo',
    )
    return int(any(brand in directory for brand in brands))



## Load Data And Extract Features

In [2]:
import numpy as np
import pandas as pd

def characterize(url):
    """Return the 13-element lexical feature vector for ``url``.

    The position of each feature in the list is fixed; the classifier is
    trained on, and queried with, vectors in exactly this order.
    """
    return [
        getLength(url),          # total URL length
        aite(url),               # contains '@'
        pathhasdomin(url),       # host repeated inside the path
        len(hostname(url)),      # length of the host string
        sensitiveword(url),      # contains a sensitive keyword
        mistakePort(url),        # host contains ':'
        brandname(url),          # brand name inside the directory part
        getDigitsCount(url),     # number of digits
        getCountUpcase(url),     # number of upper-case letters
        getPrefixCount(url),     # number of '_' and '-'
        termcout(url),           # number of dot-separated host terms
        ZhuanHuanPingci(url),    # digit<->letter switches
        targetword(url),         # domain is in the known-domain list
    ]


# Read the CSV file, treating its first row as the header; the result is a
# DataFrame.
raw_data = pd.read_csv('data.csv', header=0)
data = raw_data.values
# First column: the URLs (as a plain list); second column: the labels.
urls = data[:, 0].tolist()
labels = data[:, 1]


In [3]:
# Encode the labels numerically: 'bad' becomes 1, every other value 0.
labels = np.where(labels == 'bad', 1, 0)
# One feature vector per URL, in the same order as ``urls``.
features = [characterize(url) for url in urls]

## Training And Testing

In [5]:
try:
    # train_test_split lives in sklearn.model_selection in current
    # scikit-learn releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only provide the deprecated
    # sklearn.cross_validation module.
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import time

# Hold out 33% of the data as the test set; the rest is the training set.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=0)

time_2=time.time()
print('Start training...')
clf = RandomForestClassifier()  # random forest with default parameters
clf.fit(train_features, train_labels)  # train the random forest model
time_3 = time.time()
print('training cost %f seconds' % (time_3 - time_2))

print('Start predicting...')
test_predict=clf.predict(test_features)
time_4 = time.time()
print('predicting cost %f seconds' % (time_4 - time_3))

# Evaluate the predictions on the held-out test set.
score = accuracy_score(test_labels, test_predict)
score_rec = recall_score(test_labels, test_predict)
print("The accruacy score is %f" % score)
print("The recall score is %f" % score_rec)

Start training...
training cost 3.815389 seconds
Start predicting...
predicting cost 0.681565 seconds
The accruacy score is 0.906518
The recall score is 0.627397


## Predicting Test

In [6]:
def pop_up_box():
    """Open a small tkinter window that classifies a typed-in URL.

    The window holds a text entry, a result label and an "Input" button.
    Pressing the button turns the entered URL into a feature vector with
    ``characterize``, feeds it to the module-level classifier ``clf`` and
    shows "malicious" or "benign" in the label.  Blank input is ignored
    instead of being sent to the classifier.  The call returns when the
    window is closed.
    """

    import tkinter

    def inputint():
        """Classify the URL currently in the entry box and show the verdict."""
        test_url = var.get().strip()
        if not test_url:
            # Nothing to classify: reset the widgets and wait for input.
            result_label.config(text='')
            var.set('')
            return
        # predict() expects a sequence of feature vectors, hence the list.
        if clf.predict([characterize(test_url)])[0]:
            result_label.config(text="malicious")
        else:
            result_label.config(text="benign")
        var.set('')  # clear the entry for the next URL

    root = tkinter.Tk(className='Here is some interpretation')  # window title
    root.geometry('270x60')     # window size, width x height

    var = tkinter.StringVar()   # holds the content of the entry box
    var.set('')                 # read/write it with var.get()/var.set()
    entry1 = tkinter.Entry(root, textvariable=var)  # bind the entry to var
    entry1.pack()
    result_label = tkinter.Label(root, bg = 'yellow', width = 20, text = '')
    result_label.pack()
    # Pressing the "Input" button triggers inputint.
    btn1 = tkinter.Button(root, text='Input', command=inputint)

    # Place the button.
    btn1.pack(side='right')

    # Everything is set up; show the window and run the event loop.
    root.mainloop()

In [7]:
# Launch the interactive URL checker; this runs the tkinter main loop.
pop_up_box()