In [1]:
import pandas as pd
import re
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def get_length_of_url(url):
    return len(str(url))

def is_IP(url):
    p = re.compile('^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$')
    if p.match(url):
        return 1
    else:
        return 0
    
def get_token_list(url):
    return re.split('\W+', url)

def count_token(token_list):
    return len(token_list)

def average_token_length(token_list):
    sum = 0
    for token in token_list:
        sum += len(token)
    return sum / len(token_list)

def count_sensitive_words(tokens_list):
    sen_words = ['confirm', 'account', 'banking', 'secure', 'ebayisapi', 'webscr', 'login', 'signin', 'exe']
    count = 0
    for word in sen_words:
        if word in tokens_list:
            count += 1;
    return count

def count_dots(url):
    return str(url).count('.')

In [2]:
# Train & Predict
data = pd.read_csv('data.csv')

data_urls = data['url']
data_labels = data['label']

labels = []
for i in data_labels:
    if i == "good":
        labels.append(1)
    elif i == "bad":
        labels.append(0)
    else:
        print("label error.")

features = []
for url in data_urls:
    token_list = get_token_list(url)
    features.append([get_length_of_url(url), is_IP(url), 
                     count_token(token_list), average_token_length(token_list), 
                     count_sensitive_words(token_list), count_dots(url)])

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=2019)

# 数据预处理
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
    
# 使用决策树
dtree = tree.DecisionTreeClassifier()
dtree.fit(X_train_std, y_train)
print('Done fit')

Done fit


In [3]:
# 计算准确率、召回率
from sklearn.metrics import precision_score, recall_score
y_pred = dtree.predict(X_test_std, y_test)
ACC = precision_score(y_test, y_pred)
REC = recall_score(y_test, y_pred) 
print('Accuracy = %.2f    Recall = %.2f' %(ACC, REC))

Accuracy = 0.86    Recall = 0.99


In [20]:
# 单个测试
# 注：由于数据集中good url里包含的'www.'比较少但是bad url里比较多
# 所以可能会把一些正常URL识别错，例如'www.baidu.com'会识别成bad
# 测试时尝试去掉'www.'

test_URL = 'www.baidu.com'

test_features = []
test_token_list = get_token_list(test_URL)
test_features.append([get_length_of_url(test_URL), 
                      is_IP(test_URL), 
                      count_token(test_token_list), 
                      average_token_length(test_token_list), 
                      count_sensitive_words(test_token_list), 
                      count_dots(test_URL)])
prediction = dtree.predict(test_features)[0]

print('Good URL') if prediction == 1 else print('Bad URL')

Bad URL
