In [3]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# 加载数据集
phishing_file_path = '/Users/yilu/Downloads/pythonProject/CaptstoneProjectData_2024.csv'
phishing_data = pd.read_csv(phishing_file_path)

normal_file_path = '/Users/yilu/Downloads/pythonProject/emails.csv'
normal_data = pd.read_csv(normal_file_path)

# 填充缺失值
phishing_data['Subject'] = phishing_data['Subject'].fillna('')
phishing_data['Body'] = phishing_data['Body'].fillna('')
normal_data['file'] = normal_data['file'].fillna('')
normal_data['message'] = normal_data['message'].fillna('')

# 简单预处理函数
def simple_preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # 移除HTML标签
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # 移除URL
    text = re.sub(r'\W', ' ', text)  # 移除特殊字符
    text = re.sub(r'\d', ' ', text)  # 移除数字
    text = text.lower()  # 转换为小写
    text = text.replace('________________________________', '')  # 移除连续的下划线
    words = text.split()
    stop_words = {
        'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
        'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could',
        "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for',
        'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's",
        'her', 'here', "here's", 'hers', 'herself', 'him', "himself", 'his', 'how', "how's", 'I', "I'd", "I'll", "I'm",
        "I've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', 'let', "let's", 'me', 'more', 'most',
        "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our',
        'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should',
        "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
        'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to',
        'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were',
        "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom',
        'why', "why's", 'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours',
        'yourself', 'yourselves'
    }
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# 应用预处理函数
phishing_data['Cleaned_Subject'] = phishing_data['Subject'].apply(simple_preprocess_text)
phishing_data['Cleaned_Body'] = phishing_data['Body'].apply(simple_preprocess_text)
normal_data['Cleaned_Subject'] = normal_data['file'].apply(simple_preprocess_text)
normal_data['Cleaned_Body'] = normal_data['message'].apply(simple_preprocess_text)

# 合并清理后的文本
phishing_data['Cleaned_Text'] = phishing_data['Cleaned_Subject'] + " " + phishing_data['Cleaned_Body']
normal_data['Cleaned_Text'] = normal_data['Cleaned_Subject'] + " " + normal_data['Cleaned_Body']

# 添加标签
phishing_data['Label'] = 1
normal_data['Label'] = 0

# 合并两个数据集
all_emails = pd.concat([phishing_data, normal_data], ignore_index=True)

# 初始化Bag of Words向量化器
bow_vectorizer = CountVectorizer(max_features=1000)

# 拟合并转换清理后的文本数据
bow_matrix = bow_vectorizer.fit_transform(all_emails['Cleaned_Text'])

# 将Bag of Words矩阵转换为DataFrame
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())

# 添加标签到DataFrame
bow_df['Label'] = all_emails['Label'].values

# 划分训练集和测试集
X = bow_df.drop('Label', axis=1)
y = bow_df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练Logistic Regression分类器
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# 进行预测
y_pred = model.predict(X_test)

# 评估模型
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# 保存Bag of Words特征到CSV文件
output_file_path = '/Users/yilu/Downloads/pythonProject/bow_features.csv'
bow_df.to_csv(output_file_path, index=False)

print(f"Bag of Words features saved to file: {output_file_path}")

Accuracy: 0.9999903842455479
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    103447
           1       1.00      1.00      1.00       549

    accuracy                           1.00    103996
   macro avg       1.00      1.00      1.00    103996
weighted avg       1.00      1.00      1.00    103996

Bag of Words features saved to file: /Users/yilu/Downloads/pythonProject/bow_features.csv
