# Malicious URL Detection - TF_IDF + SVM

## 导入所需包

In [15]:
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## 编写数据清洗方法

In [16]:
def csv_data_read(csv_file_path):
    # 为减少训练时间，可只取头部10W条，但一定需要先打乱样本）
    df_csv = pd.read_csv(csv_file_path).head(10000)
    # df_csv = pd.read_csv(csv_file_path)
    urls = []
    labels = []
    for index, row in df_csv.iterrows():
        urls.append(row["url"])
        labels.append(row["label"])
    return urls, labels


def url_tokenize(url):
    """
    对URL进行清洗，删除斜线、点、和com，进行分词
    :param url:
    :return:
    """
    web_url = url.lower()
    dot_slash = []
    slash = str(web_url).split('/')
    for i in slash:
        r1 = str(i).split('-')
        token_slash = []
        for j in range(0,len(r1)):
            r2 = str(r1[j]).split('.')
            token_slash = token_slash + r2
        dot_slash = dot_slash + r1 + token_slash
    urltoken_list = list(set(dot_slash))
    white_words = ["com", "http:", "https:", ""]
    for white_word in white_words:
        if white_word in urltoken_list:
            urltoken_list.remove(white_word)
    return urltoken_list

## 编写特征提取方法

In [17]:
def feature_extract(grey_urls, y):
    """
    使用TF-IDF算法提取关键词特征，并将数据帧划分为训练集和测试集
    """
    url_vectorizer = TfidfVectorizer(tokenizer=url_tokenize)
    x = url_vectorizer.fit_transform(grey_urls)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    return x_train, x_test, y_train, y_test


## 编写模型引入方法

In [18]:
def dump_model_object(file_path, model_object):
    """
    使用pickle将内存中的对象转换为文本流保存为本地文件
    :param file_path:
    :return:
    """
    with open(file_path, "wb") as f:
        pickle.dump(model_object, f)
    f.close()


def practice_svm(x_train, x_test, y_train, y_test):
    """
    实践SVM算法识别恶意URL
    :param x_train:
    :param x_test:
    :param y_train:
    :param y_test:
    :return:
    """
    model_svm = SVC()
    # 注意：SVM训练可能较慢，注意样本的数量
    model_svm.fit(x_train, y_train)
    svm_score = model_svm.score(x_test, y_test)
    print("测试拟合分数为：{0}".format(svm_score))
    model_svm_save = model_svm

    """
    保存训练好的模型和向量
    """
    file_mode = "../../model/model-0x3/model_svm.pkl"
    dump_model_object(file_mode, model_svm_save)

## 开始模型训练（主函数）

In [19]:
"""
加载数据集
"""
grep_csv_file_path = "../../data/data-0x3/grey-url.csv"
black_csv_file_path = "../../data/data-0x3/black-url.csv"
grey_urls, y = csv_data_read(grep_csv_file_path)

"""
提取特征
"""
x_train, x_test, y_train, y_test = feature_extract(grey_urls, y)

"""
使用支持向量机（SVM）算法识别恶意URL
"""
practice_svm(x_train, x_test, y_train, y_test)

测试拟合分数为：0.897


## 模型评估

从上个Cell的模型训练结果中，我们得出拟合分数为0.897，即对这部分数据使用SVM模型