In [17]:
# Split the phishing CSV dataset into two plain-text URL lists (one per label)
# so the URLs can be fed to rule matching.
import csv

# Input CSV path; replace with the actual location of the dataset.
# Forward slashes are used on purpose: the previous '.\csv\phishing...' literal
# relied on the invalid escape sequences '\c' and '\p' and only worked with
# Windows path separators.
csv_file_path = './csv/phishing_site_urls.csv'

# Output TXT paths
bad_txt_path = 'bad_urls.txt'
good_txt_path = 'good_urls.txt'

# The input is opened first so that a missing CSV does not truncate existing
# output files. newline='' is what the csv module expects for files it reads,
# so quoted fields containing line breaks are parsed correctly.
with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csv_file, \
        open(bad_txt_path, 'w', encoding='utf-8') as bad_file, \
        open(good_txt_path, 'w', encoding='utf-8') as good_file:
    csv_reader = csv.DictReader(csv_file)

    for row in csv_reader:
        url = row['URL']
        label = row['Label']

        # Route the URL by label; rows with any other label are skipped.
        if label == 'bad':
            bad_file.write(url + '\n')
        elif label == 'good':
            good_file.write(url + '\n')

print("分离完成，已生成'bad_urls.txt'和'good_urls.txt'")

分离完成，已生成'bad_urls.txt'和'good_urls.txt'


In [4]:
def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        s1: First sequence (e.g. a string).
        s2: Second sequence.

    Returns:
        The edit distance as a non-negative integer.
    """
    # Iterate over the longer sequence and keep the shorter one as the row width.
    longer, shorter = (s2, s1) if len(s1) < len(s2) else (s1, s2)

    # Against an empty sequence every element of the other one must be inserted.
    if not len(shorter):
        return len(longer)

    prev = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        curr = [row_no]
        for col, ch_short in enumerate(shorter):
            cost_insert = prev[col + 1] + 1
            cost_delete = curr[col] + 1
            # A mismatch adds 1 (True), a match adds 0 (False).
            cost_replace = prev[col] + (ch_long != ch_short)
            curr.append(min(cost_insert, cost_delete, cost_replace))
        prev = curr

    return prev[-1]

# Similarity ratio derived from the edit distance
def similarity_by_levenshtein(s1, s2):
    """Return a similarity ratio for two sequences based on edit distance.

    The ratio is ``(max_len - distance) / max_len`` where ``max_len`` is the
    length of the longer input and ``distance`` comes from
    ``levenshtein_distance``.

    Args:
        s1: First sequence (e.g. a string).
        s2: Second sequence.

    Returns:
        A float; 1.0 when the inputs are identical. Two empty inputs are
        treated as identical and also give 1.0.
    """
    max_len = max(len(s1), len(s2))
    # Both inputs empty: return early instead of dividing by zero below.
    if max_len == 0:
        return 1.0
    distance = levenshtein_distance(s1, s2)
    return (max_len - distance) / max_len

# Example: a full URL compared against its bare host name.
str1 = "pub-ea806534a1b74cd39c327bf44e5f76f4.r2.dev/Dw7jxsWCu3bn9wkDwlm3ntV0jGD4vtzwk93mXiejjS8o0wjG75n93keChru7mI3XwuO92jn5I.htm"
str2 = "pub-ea806534a1b74cd39c327bf44e5f76f4.r2.dev"
print(
    similarity_by_levenshtein(str1, str2)
)

0.35537190082644626


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_str(s1, s2):
    """Return the cosine similarity of two strings' token-count vectors.

    Both strings are vectorised together with a default ``CountVectorizer``
    so that they share one vocabulary.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The cosine similarity between the count vector of ``s1`` and that
        of ``s2``.
    """
    count_matrix = CountVectorizer().fit_transform([s1, s2])
    pairwise = cosine_similarity(count_matrix.toarray())
    # pairwise is a 2x2 matrix; entry [0][1] compares s1 with s2.
    return pairwise[0][1]

# Example: two unrelated URLs that only share the "php" token.
str1 = "kimki.ru/emeka/panelnew/gate.php"
str2 = "www.alfalima.it/transactions.php"
print(
    cosine_similarity_str(str1, str2)
)

0.18257418583505539


In [10]:
 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
# Toy corpus: a single URL with a placeholder label
X = ['kimki.ru/emeka/panelnew/gate.php']
y = [1]
# Fit a default TfidfVectorizer and turn the text into a TF-IDF matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
# Feature-matrix shape (not displayed: this is not the cell's last expression)
X.shape

# Sum each term's TF-IDF weight over all documents and rank terms by that total
data1 = dict(
    word=vectorizer.get_feature_names_out(),
    tfidf=X.toarray().sum(axis=0).tolist(),
)
df1 = pd.DataFrame(data1).sort_values(by="tfidf", ascending=False, ignore_index=True)
df1.head(10)

Unnamed: 0,word,tfidf
0,emeka,0.408248
1,gate,0.408248
2,kimki,0.408248
3,panelnew,0.408248
4,php,0.408248
5,ru,0.408248


In [1]:
#训练用于识别的机器学习模型
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({
    # KaiTi is set as the sans-serif font; the plots in this notebook use Chinese labels.
    'font.sans-serif': ['KaiTi'],
    # Keeps the minus sign renderable with the font above (original note:
    # fixes the minus sign in saved figures).
    'axes.unicode_minus': False,
})

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import os

# Load the dataset. Forward slashes keep the path portable; the previous
# '.\csv\phishing...' literal relied on the invalid escapes '\c' and '\p'
# and only worked with Windows path separators.
data = pd.read_csv('./csv/phishing_site_urls.csv')
X = data['URL'].values
# Encode labels as integers: bad -> 0, good -> 1
labels = data['Label'].map({'bad': 0, 'good': 1})
# .map() turns any label outside the mapping into NaN; fail here with a clear
# message instead of passing NaN targets on to the split and the models.
if labels.isna().any():
    raise ValueError("Label column contains values other than 'bad' / 'good'")
y = labels.values
# Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer()
# Turn the URL text into TF-IDF vectors
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so vocabulary and IDF statistics include the test rows.
X = vectorizer.fit_transform(X)
# Feature-matrix shape (not displayed: this is not the cell's last expression)
X.shape

# Persist the fitted vectorizer so prediction code can reuse the same
# vocabulary; make sure the target directory exists first.
os.makedirs('./model_save', exist_ok=True)
with open('./model_save/tfidf_vectorizer.pickle', 'wb') as file:
    pickle.dump(vectorizer, file)

In [None]:
import numpy as np

# Total TF-IDF weight of every term over all documents, highest first.
# The column sums are taken on the sparse matrix itself: X.toarray() would
# materialise the full dense document-term matrix (about 549k rows by 528k
# columns for this dataset), which cannot fit in memory.
term_totals = np.asarray(X.sum(axis=0)).ravel()
data1 = {'word': vectorizer.get_feature_names_out(),
        'tfidf': term_totals.tolist()}
df1 = pd.DataFrame(data1).sort_values(by="tfidf" ,ascending=False,ignore_index=True) 
df1.head(10)

In [12]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same label proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)
# Shapes of the four pieces (shown as the cell output)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((439476, 528563), (109870, 528563), (439476,), (109870,))

In [None]:
# Compare test-set accuracy across eight classifiers
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [18]:

import os

# Logistic regression
model1 =  LogisticRegression(C=1e10,max_iter=10000)

# Naive Bayes
model2 = MultinomialNB()

# K-nearest neighbours
model3 = KNeighborsClassifier(n_neighbors=50)

# Decision tree
model4 = DecisionTreeClassifier(random_state=77)

# Random forest
model5= RandomForestClassifier(n_estimators=500,  max_features='sqrt',random_state=10)

# Gradient boosting
model6 = GradientBoostingClassifier(random_state=123)

# Support vector machine
model9 = SVC(kernel="rbf", random_state=77)

# Neural network (multi-layer perceptron)
model10 = MLPClassifier(hidden_layer_sizes=(16,8), random_state=77, max_iter=10000)

model_list=[model1,model2,model3,model4,model5,model6,model9,model10]
model_name=['逻辑回归','朴素贝叶斯','K近邻','决策树','随机森林','梯度提升','支持向量机','神经网络']

# Make sure the directory the models are pickled into exists.
os.makedirs("./model_save", exist_ok=True)

# scores stays index-aligned with model_name: a model that fails to train or
# score gets NaN instead of being skipped, so plotting scores against
# model_name still lines up.
scores=[]
for name, model_C in zip(model_name, model_list):
    try:
        model_C.fit(X_train, y_train)
        s=model_C.score(X_test, y_test)
    except Exception as e:
        # Deliberately broad: one failing model (e.g. running out of memory)
        # must not stop the comparison of the remaining ones.
        scores.append(float('nan'))
        print(f"{name}模型训练出错：{e}")
        continue
    scores.append(s)
    print(f'{name}方法在测试集的准确率为{round(s,3)}')
    # Save the fitted model; a failure here is reported as a save error and
    # does not discard the score recorded above.
    model_path = f"./model_save/{name}.pkl"
    try:
        with open(model_path, 'wb') as file:
            pickle.dump(model_C, file)
    except Exception as e:
        print(f"{name}模型保存出错：{e}")
    else:
        print(f"{name}模型已保存至{model_path}")


逻辑回归方法在测试集的准确率为0.98
逻辑回归模型已保存至逻辑回归.pkl
朴素贝叶斯方法在测试集的准确率为0.973
朴素贝叶斯模型已保存至朴素贝叶斯.pkl
K近邻模型训练出错：Unable to allocate 266. MiB for an array with shape (69771299,) and data type int32


In [None]:
# Horizontal bar chart: one bar per model, bar length = test-set accuracy
fig, ax = plt.subplots(figsize=(7, 3), dpi=128)
sns.barplot(x=scores, y=model_name, orient="h", ax=ax)
ax.set_xlabel('模型准确率')
ax.set_ylabel('模型名称')
plt.xticks(fontsize=10,rotation=45)
ax.set_title("不同模型文本分类准确率对比")
plt.show()

In [1]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def predict_with_pickle(model_path, input_vector):
    """Load a pickled model from ``model_path`` and predict on ``input_vector``.

    Args:
        model_path: Path of the pickle file holding an object with a
            ``predict`` method.
        input_vector: Value passed unchanged to the model's ``predict``.

    Returns:
        Whatever the loaded model's ``predict`` returns.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(model_path, 'rb') as model_file:
        estimator = pickle.load(model_file)
    return estimator.predict(input_vector)



# Text preprocessing
def preprocess_text_to_vector(text):
    """Vectorise one text with the vectorizer saved under ``./model_save``.

    The vectorizer is unpickled from ``./model_save/tfidf_vectorizer.pickle``
    on every call.

    Args:
        text: The text (e.g. a URL string) to transform.

    Returns:
        The result of the loaded vectorizer's ``transform`` applied to a
        one-element list containing ``text``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open('./model_save/tfidf_vectorizer.pickle', 'rb') as vec_file:
        fitted_vectorizer = pickle.load(vec_file)
    # transform receives a list of documents, so wrap the single text.
    return fitted_vectorizer.transform([text])


# 示例：使用第一个模型（逻辑回归）进行预测
str1 = "haldforsamlingshus.dk/spiritual/DHL/index.php?l=_JeHFUq_VJOXK0QWHtoGYDw_Product-UserID&amp;;;userid"
str2 = "associations2.html"
predicted_labels = predict_with_pickle('./model_save/逻辑回归.pkl', preprocess_text_to_vector(str1))
predicted_labels2 = predict_with_pickle('./model_save/逻辑回归.pkl', preprocess_text_to_vector(str2))
print(f"预测结果1:{predicted_labels}")
print(f"预测结果2:{predicted_labels2}")

预测结果1:[0]
预测结果2:[1]
