import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Read the two source corpora: phishing emails and normal (non-phishing) emails
phishing_file_path = '/Users/zhangguoyu/Downloads/CaptstoneProjectData_2024.csv'
normal_file_path = '/Users/zhangguoyu/Downloads/emails.csv'

phishing_data = pd.read_csv(phishing_file_path)
normal_data = pd.read_csv(normal_file_path)

# Replace missing cells in the text columns with empty strings so that the
# string-based processing applied to these columns never receives NaN
phishing_data[['Subject', 'Body']] = phishing_data[['Subject', 'Body']].fillna('')
normal_data[['file', 'message']] = normal_data[['file', 'message']].fillna('')

# Simple text preprocessing function

# Stop words, all lower-case: the text is lower-cased before filtering, so an
# upper-case entry (e.g. 'I') could never match. Built once at import time
# instead of on every call.
_STOP_WORDS = frozenset({
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could',
    "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for',
    'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's",
    'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm",
    "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', 'let', "let's", 'me', 'more', 'most',
    "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our',
    'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should',
    "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to',
    'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were',
    "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom',
    'why', "why's", 'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours',
    'yourself', 'yourselves',
})

_HTML_TAG_RE = re.compile(r'<.*?>')
# 'https...' is already covered by the 'http\S+' alternative.
_URL_RE = re.compile(r'http\S+|www\S+')
# Apostrophes are kept at this stage so contractions survive as single tokens
# until the stop-word filter has seen them.
_NON_WORD_RE = re.compile(r"[^\w']")
_DIGIT_RE = re.compile(r'\d')
_SEPARATOR_LINE = '________________________________'


def simple_preprocess_text(text):
    """Return a lower-cased, stop-word-free, space-separated version of *text*.

    Steps, in order: strip ``<...>`` tags, strip tokens starting with
    ``http``/``www``, turn punctuation and digits into spaces, lower-case,
    drop the 32-underscore separator line, then remove stop words.

    Contractions such as ``don't`` or ``I'm`` are matched against the
    stop-word list as whole tokens. Any other token containing an apostrophe
    is split on it afterwards (``john's`` -> ``john s``) and the pieces are
    filtered again.

    Args:
        text: The raw string to clean. Must be a ``str``.

    Returns:
        The remaining tokens joined by single spaces; ``''`` if none remain.
    """
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    # Treat the typographic apostrophe like the ASCII one so "don’t" is
    # recognised as the stop word "don't".
    text = text.replace('\u2019', "'")
    text = _NON_WORD_RE.sub(' ', text)
    text = _DIGIT_RE.sub(' ', text)
    text = text.lower()
    text = text.replace(_SEPARATOR_LINE, '')

    kept = []
    for token in text.split():
        # Quote marks hugging a word ('the') must not hide a stop word.
        token = token.strip("'")
        if not token or token in _STOP_WORDS:
            continue
        # Not a stop word as a whole: split on inner apostrophes and filter
        # the pieces too.
        kept.extend(
            piece for piece in token.split("'")
            if piece and piece not in _STOP_WORDS
        )
    return ' '.join(kept)

# Apply preprocessing function
phishing_data['Cleaned_Subject'] = phishing_data['Subject'].apply(simple_preprocess_text)
phishing_data['Cleaned_Body'] = phishing_data['Body'].apply(simple_preprocess_text)
# NOTE(review): 'file' is used here in place of a subject line and 'message' in
# place of a body. Presumably 'file' is a path/identifier and 'message' may carry
# raw headers -- if so, the classifier can separate the classes on source
# artifacts rather than content. Confirm against the CSV schema.
normal_data['Cleaned_Subject'] = normal_data['file'].apply(simple_preprocess_text)
normal_data['Cleaned_Body'] = normal_data['message'].apply(simple_preprocess_text)

# Combine cleaned subject and body
phishing_data['Cleaned_Text'] = phishing_data['Cleaned_Subject'] + " " + phishing_data['Cleaned_Body']
normal_data['Cleaned_Text'] = normal_data['Cleaned_Subject'] + " " + normal_data['Cleaned_Body']

# Add labels (1 = phishing, 0 = normal)
phishing_data['Label'] = 1
normal_data['Label'] = 0

# Combine datasets; ignore_index=True gives a fresh 0..n-1 index, so row labels
# and row positions coincide below
all_emails = pd.concat([phishing_data, normal_data], ignore_index=True)

# Choose the train/test rows BEFORE fitting TF-IDF so that the vocabulary and
# IDF weights are learned from training text only (no leakage from the test
# set). Stratify on the label so both splits keep the same class ratio.
train_positions, test_positions = train_test_split(
    all_emails.index.to_numpy(),
    test_size=0.2,
    random_state=42,
    stratify=all_emails['Label'],
)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Fit on the training rows only, then extract TF-IDF features for every row
tfidf_vectorizer.fit(all_emails['Cleaned_Text'].iloc[train_positions])
tfidf_matrix = tfidf_vectorizer.transform(all_emails['Cleaned_Text'])

# Convert to DataFrame (dense: one column per TF-IDF term, plus the label)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df['Label'] = all_emails['Label'].values

# Split the dataset using the row positions chosen above
X = tfidf_df.drop('Label', axis=1)
y = tfidf_df['Label']
X_train, X_test = X.iloc[train_positions], X.iloc[test_positions]
y_train, y_test = y.iloc[train_positions], y.iloc[test_positions]

# Train Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# Save TF-IDF features (all rows, train-fitted vocabulary) to CSV file
output_file_path = '/Users/zhangguoyu/Downloads/tfidf_features.csv'
tfidf_df.to_csv(output_file_path, index=False)

print(f"TF-IDF features saved to file: {output_file_path}")

In [None]:
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    103447
           1       1.00      1.00      1.00       549

    accuracy                           1.00    103996
   macro avg       1.00      1.00      1.00    103996
weighted avg       1.00      1.00      1.00    103996