In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
spambase_path = "spambase.csv"  # Correct path
spambase_cols = [f"feature_{i}" for i in range(57)] + ["label"]

spambase_df = pd.read_csv(spambase_path, header=None)
spambase_df.columns = spambase_cols  # Assign column names


In [3]:
import gensim.downloader as api

# Load pre-trained GloVe model
glove = api.load("glove-wiki-gigaword-100")  # This loads a 100-dimensional GloVe model



In [4]:
pip install pybloom-live

Collecting pybloom-live
  Downloading pybloom_live-4.0.0.tar.gz (10 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting bitarray>=0.3.4 (from pybloom-live)
  Downloading bitarray-3.0.0-cp312-cp312-win_amd64.whl.metadata (33 kB)
Collecting xxhash>=3.0.0 (from pybloom-live)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading bitarray-3.0.0-cp312-cp312-win_amd64.whl (121 kB)
Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl (30 kB)
Building wheels for collected packages: pybloom-live
  Building wheel for pybloom-live (setup.py): started
  Building wheel for pybloom-live (setup.py): finished with status 'done'
  Created wheel for pybloom-live: filename=pybloom_live-4.0.0-py3-none-any.whl size=9304 sha256=9abdf768b84a92b6275dbb9112d007f6f2a334675156ea5b2451c805424cf673
  Stored in directory: c:\users\lenovo\appdata\local\pip\cache\wheels\1d\5e\b8\19c04c108b0acd2ca53b85fcc09eeed323ecdfea0e36bd134e


[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from pybloom_live import BloomFilter

# Define the Bloom Filter (size and error rate)
bloom = BloomFilter(capacity=1000, error_rate=0.1)

# Example: Adding spam words/features to the Bloom Filter
for feature in spambase_df["feature_0"]:  # You can iterate over relevant features
    bloom.add(str(feature))  # Add features to the Bloom filter


In [6]:
print(spambase_df.columns)


Index(['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14',
       'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
       'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24',
       'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
       'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34',
       'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39',
       'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44',
       'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49',
       'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54',
       'feature_55', 'feature_56', 'label'],
      dtype='object')


In [7]:
# Initialize counters for true positives and false positives
true_positive = 0
false_positive = 0
false_negative = 0
true_negative = 0

# Iterate over the dataset and apply the Bloom filter
for _, row in spambase_df.iterrows():
    feature = str(row["feature_0"])  # Assuming feature_0 as an example
    
    # If the Bloom filter indicates the feature is spam
    if bloom.__contains__(feature):
        # Check if the email is actually spam (label == 1)
        if row["label"] == 1:
            true_positive += 1  # Correctly identified spam
        else:
            false_positive += 1  # Legitimate email incorrectly identified as spam
    else:
        # If the Bloom filter indicates the feature is not spam
        if row["label"] == 0:
            true_negative += 1  # Correctly identified legitimate email
        else:
            false_negative += 1  # Spam email missed by the Bloom filter

# Calculate True Positive Rate and False Positive Rate
TPR = true_positive / (true_positive + false_negative) if (true_positive + false_negative) != 0 else 0
FPR = false_positive / (false_positive + true_negative) if (false_positive + true_negative) != 0 else 0

print(f"True Positive Rate: {TPR:.2f}")
print(f"False Positive Rate: {FPR:.2f}")


True Positive Rate: 0.00
False Positive Rate: 1.00
