In [8]:
import math
from collections import Counter
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# ML 기반 의심스러운 URL 탐지

# entropy를 이용해 데이터의 불확실성 측정
def entropy(s):
    p, lns = Counter(s), float(len(s))  
    return -sum(count/lns * math.log(count/lns, 2) for count in p.values())

In [3]:

# url을 . , - , / 기준으로 tokenization 수행 (com은 너무 많아서 제거) => vector로 변환

def getTokens(input):
    tokensBySlash = str(input.encode('utf-8')).split('/')  
    allTokens = []
    for i in tokensBySlash:
        tokens = str(i).split('-')  
        tokensByDot = []
        for j in range(0, len(tokens)):
            tempTokens = str(tokens[j]).split('.')  
            tokensByDot = tokensByDot + tempTokens
        allTokens = allTokens + tokens + tokensByDot
    allTokens = list(set(allTokens))  
    if 'com' in allTokens:
        allTokens.remove('com')  
    return allTokens


In [4]:
import numpy as np
import pandas as pd

urlCsv = pd.read_csv('./dataset/url_dataset.csv')
urlData = pd.DataFrame(urlCsv)

print(urlData.head())

urlData = np.array(urlData)
random.shuffle(urlData)



                      url label
0  diaryofagameaddict.com   bad
1        espdesign.com.au   bad
2      iamagameaddict.com   bad
3           kalantzis.net   bad
4   slightlyoffcenter.net   bad


In [10]:
y = [d[1] for d in urlData]
corpus = [d[0] for d in urlData]
vectorizer = TfidfVectorizer(tokenizer=getTokens, token_pattern=None)
X = vectorizer.fit_transform(corpus)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
lgs = LogisticRegression()
lgs.fit(X_train, y_train)

print(lgs.score(X_test, y_test))

0.9795345629243813


In [13]:
X_predict = [
    'example.com', 
    'secure-bank.com/login',
    'malware-site.ru/download/trojan.exe',
    'shopping-site.com/product?id=12345',
    'phishing-login.net/account_verification',
    'safe-site.org',
    'unknown-source.biz/update.exe',
    'techsupport-fake.com/install-helper.exe',
    'secure-payment.io/checkout',
    'random-site.net/home'
]

X_predict = vectorizer.transform(X_predict)
y_Predict = lgs.predict(X_predict)
print(y_Predict)

['bad' 'bad' 'bad' 'good' 'bad' 'bad' 'bad' 'bad' 'bad' 'bad']
