In [1]:
# Load the data from Kaggle
# NOTE(review): the install is unpinned, so a later re-run may pick up a different kagglehub release.
%pip install kagglehub

import os
import kagglehub
import pandas as pd

# Download the dataset; the returned value is used below as the directory that contains the CSV file.
download_path = kagglehub.dataset_download("gokulraja84/emails-dataset-for-spam-detection")
csv_file_path = os.path.join(download_path, 'Emails.csv')
# Raw, unmodified table; the following cells read its 'text' and 'spam' columns.
raw_df = pd.read_csv(csv_file_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/gokulraja84/emails-dataset-for-spam-detection?dataset_version_number=1...


100%|██████████| 2.86M/2.86M [00:00<00:00, 23.0MB/s]

Extracting files...





In [2]:
# Remove duplicated e-mails: every row whose 'text' repeats an earlier row is
# masked out, so only the first occurrence of each text is kept.
raw_unique_df = raw_df.loc[~raw_df.duplicated(subset=['text'], keep='first')].copy()

# Row counts before and after the de-duplication
print(
    f"Größe raw_df: {len(raw_df)} Zeilen",
    f"Größe raw_unique_df: {len(raw_unique_df)} Zeilen",
    sep="\n",
)

Größe raw_df: 5728 Zeilen
Größe raw_unique_df: 5695 Zeilen


In [3]:
# Convert the 'spam' flag into an integer column named 'label' (0 and 1)
# and drop the original 'spam' column. assign() returns a new frame, so
# raw_unique_df itself is left unchanged.
all_df = (
    raw_unique_df
    .assign(label=raw_unique_df['spam'].astype(int))
    .drop(columns='spam')
)
display(all_df)

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [4]:
# Split all_df into train_df and validation_df
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42   # fixed seed so the split is reproducible
SHARE_TEST = 0.2   # share of the rows that goes into validation_df (20%)

# Stratifying on 'label' keeps the same class distribution in both parts.
# Each part then gets a fresh index that starts at 0 again; the shuffled
# original index is discarded.
train_df, validation_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        all_df,
        test_size=SHARE_TEST,
        random_state=RANDOM_SEED,
        stratify=all_df['label'],
    )
)

print(f"Größe von train_df: {train_df.shape[0]} Zeilen")
print(f"Größe von validation_df: {validation_df.shape[0]} Zeilen")

Größe von train_df: 4556 Zeilen
Größe von validation_df: 1139 Zeilen


In [5]:
# Show the training data and the validation data, each under its own heading
for title, frame in (
    ('Trainingsdaten', train_df),
    ('Validationdaten', validation_df),
):
    print(title)
    display(frame)

Trainingsdaten


Unnamed: 0,text,label
0,"Subject: energy book hi grant , hope all is ...",0
1,"Subject: re : enron contact info christie , ...",0
2,Subject: improved process for engaging tempora...,0
3,Subject: . jif .,1
4,"Subject: parameter estimation vince , i have...",0
...,...,...
4551,"Subject: a paper of mine vince , i have writ...",0
4552,Subject: re : aiesec polska - eurolds 2000 dr...,0
4553,"Subject: localized software , all languages av...",1
4554,"Subject: happy thanksgiving ! hello , vince !...",0


Validationdaten


Unnamed: 0,text,label
0,"Subject: bachelier finance society congress , ...",0
1,"Subject: congratulations vince , congratulat...",0
2,"Subject: re : resume , thanks a lot , and ju...",0
3,"Subject: re : recommendation letter vincent ,...",0
4,Subject: pac enrollment last year the enron p...,0
...,...,...
1134,Subject: re : managing energy price risk - 2 n...,0
1135,Subject: re : meeting on the 20 th of march f...,0
1136,Subject: faculty information sheet mr . kamin...,0
1137,Subject: selling travel in today ' s economy ...,1


In [6]:
# Approach 1
# BAG OF WORDS WITH NAIVE BAYES
# The texts are encoded with the unweighted bag-of-words method,
# then a multinomial naive Bayes model is trained on this encoding,
# and finally the MCC is computed on the training and the validation data.

# Imports
import sklearn

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix

In [7]:
# Prepare the data for the bag-of-words model: work on copies of both splits
train_df_copy, validation_df_copy = train_df.copy(), validation_df.copy()

# Training split: texts as str, labels as int
train_texts, train_labels = (
    train_df_copy["text"].astype(str),
    train_df_copy["label"].astype(int),
)

# Validation split: texts as str, labels as int
validation_texts, validation_labels = (
    validation_df_copy["text"].astype(str),
    validation_df_copy["label"].astype(int),
)

In [8]:
# Token pattern for the bag-of-words encoding:
# a token is a run of at least two ASCII letters between word boundaries,
# so numbers and single letters are left out.
token_pattern = token_pattern_ohne_zahlen = r"(?u)\b[a-zA-Z]{2,}\b"

# Vectorizer for the bag-of-words encoding: lowercased text, English stop
# words, unigrams and bigrams, tokens selected by the pattern above.
vectorizer = CountVectorizer(
    token_pattern=token_pattern,
    ngram_range=(1, 2),
    stop_words="english",
    lowercase=True,
)

In [9]:
# Convert the training and the validation set with the vectorizer.
# The vectorizer is fitted on the training texts only; the validation texts
# are only transformed with the already fitted vectorizer.
train_text_bow = vectorizer.fit_transform(train_texts)
validation_text_bow = vectorizer.transform(validation_texts)


In [10]:
# Multinomial naive Bayes model (alpha = 1.0), created and trained on the
# bag-of-words encoding of the training texts in one step
model_naive_bayes = MultinomialNB(alpha=1.0).fit(train_text_bow, train_labels)

# Predictions of the trained model for the training and the validation data
prediction_train_text_bow, prediction_validation_text_bow = map(
    model_naive_bayes.predict,
    (train_text_bow, validation_text_bow),
)

In [13]:
# MCC of the bag-of-words / naive Bayes model on the training and validation data
mcc_train_1 = matthews_corrcoef(y_true=train_labels, y_pred=prediction_train_text_bow)
mcc_validation_1 = matthews_corrcoef(y_true=validation_labels, y_pred=prediction_validation_text_bow)

# One print call, one line per argument
print(
    "Bag of Words mit Naive Bayers: ",
    f"Matthew's Correlation Coefficient (MCC) für train: {mcc_train_1:.4f}",
    f"Matthew's Correlation Coefficient (MCC) für validation: {mcc_validation_1:.4f}",
    sep="\n",
)

Bag of Words mit Naive Bayers: 
Matthew's Correlation Coefficient (MCC) für train: 0.9994
Matthew's Correlation Coefficient (MCC) für validation: 0.9735


In [14]:
# Approach 2
# SENTENCE TRANSFORMER WITH LOGISTIC REGRESSION
# The training data are converted into semantic embeddings
# by a pretrained SentenceTransformer.
# Logistic regression, used within the one-vs-rest scheme,
# performs the binary classification.

# imports
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier



In [15]:
# Prepare the data for the sentence-transformer model.
# NOTE: train_texts, train_labels, validation_texts and validation_labels are
# rebound here; from this cell on they are lists of str and NumPy arrays
# instead of the pandas Series created in the bag-of-words section.
train_df_copy = train_df.copy()
validation_df_copy = validation_df.copy()

# Texts as plain lists of str (the SentenceTransformer is given lists)
train_texts = train_df_copy["text"].astype(str).tolist()
validation_texts = validation_df_copy["text"].astype(str).tolist()

# Labels as integer NumPy arrays
train_labels = train_df_copy["label"].astype(int).values
validation_labels = validation_df_copy["label"].astype(int).values

In [16]:
# Create the sentence-transformer model; a pretrained model is used
MODEL_NAME = 'all-MiniLM-L6-v2'
model_sentence_transformer = SentenceTransformer(MODEL_NAME)

# Convert the texts into semantic vectors, one encode() call per split
# NOTE(review): long e-mails may be truncated to the model's maximum input length — confirm against the model card.
train_texts_sem = model_sentence_transformer.encode(train_texts, show_progress_bar=True)
validation_texts_sem = model_sentence_transformer.encode(validation_texts, show_progress_bar=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/143 [00:00<?, ?it/s]

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

In [17]:
# Classification with logistic regression
# MAX_ITERATIONS is handed to LogisticRegression as max_iter.
MAX_ITERATIONS = 1000
# NOTE(review): the labels are binary (0/1), so the one-vs-rest wrapper presumably
# adds nothing over a plain LogisticRegression — confirm before simplifying.
classifier = OneVsRestClassifier(LogisticRegression(max_iter=MAX_ITERATIONS))
classifier.fit(train_texts_sem, train_labels)

# Predictions of the model on the training and the validation data
prediction_train_text_sem = classifier.predict(train_texts_sem)
prediction_validation_text_sem = classifier.predict(validation_texts_sem)

In [18]:
# MCC of the sentence-transformer / logistic-regression model on both splits
mcc_train_2 = matthews_corrcoef(train_labels, prediction_train_text_sem)
mcc_validation_2 = matthews_corrcoef(validation_labels, prediction_validation_text_sem)

# Build the three report lines first, then emit them with a single print
print("\n".join([
    "Sentence transformer mit logitischer Regression",
    f"Matthew's Correlation Coefficient (MCC) für train: {mcc_train_2:.4f}",
    f"Matthew's Correlation Coefficient (MCC) für validation: {mcc_validation_2:.4f}",
]))

Sentence transformer mit logitischer Regression
Matthew's Correlation Coefficient (MCC) für train: 0.9632
Matthew's Correlation Coefficient (MCC) für validation: 0.9106


In [19]:
# Print the MCC values of both approaches again, side by side for comparison
for heading, train_score, validation_score in (
    ("Bag of Words mit Naive Bayers: ", mcc_train_1, mcc_validation_1),
    ("Sentence transformer mit logitischer Regression", mcc_train_2, mcc_validation_2),
):
    print(heading)
    print(f"Matthew's Correlation Coefficient (MCC) für train: {train_score:.4f}")
    print(f"Matthew's Correlation Coefficient (MCC) für validation: {validation_score:.4f}")

Bag of Words mit Naive Bayers: 
Matthew's Correlation Coefficient (MCC) für train: 0.9994
Matthew's Correlation Coefficient (MCC) für validation: 0.9735
Sentence transformer mit logitischer Regression
Matthew's Correlation Coefficient (MCC) für train: 0.9632
Matthew's Correlation Coefficient (MCC) für validation: 0.9106
