Zbiór danych `spam.csv` zawiera przykłady wiadomości e-mail oznaczonych jako `spam` i `ham`. Zastosuj [`CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html), aby wygenerować wektory częstości słów dla każdej wiadomości. Wytrenuj i porównaj kilka modeli klasyfikacyjnych, takich jak `MultinomialNB`, `KNeighborsClassifier`, `LogisticRegression` itp.

In [9]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [10]:
# Read the labelled messages (columns: Category, Message) and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../data/spam.csv")
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Encode the target as integers: 'ham' -> 0 (negative class), 'spam' -> 1 (positive class).
label_to_int = {'ham': 0, 'spam': 1}
y = df['Category'].map(label_to_int)

# Series.map turns every value that is missing from the mapping into NaN.
# Fail here with a clear message instead of letting NaN targets reach the
# stratified split or the classifiers, where the error would be harder to trace.
unmapped_labels = df.loc[y.isna(), 'Category'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected Category values: {unmapped_labels.tolist()}")

# Class balance check: the data set is expected to be imbalanced.
print(y.value_counts())

Category
0    4825
1     747
Name: count, dtype: int64


In [12]:
# Bag-of-words features: one row per message, one column per vocabulary token,
# each cell holding the number of occurrences. English stop words are dropped.
# Note: the vocabulary here is learned from every message in df.
messages = df['Message']
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(messages)

# Sanity check: X must have one row per message and one column per vocabulary entry.
for caption, value in (
    ("df shape:", df.shape),
    ("X shape:", X.shape),
    ("Vocabulary size:", len(cv.vocabulary_)),
):
    print(caption, value)

df shape: (5572, 2)
X shape: (5572, 8440)
Vocabulary size: 8440


In [13]:
# Densify only the first five rows for a quick look; the full matrix stays sparse.
first_rows_dense = X[:5].toarray()
print(first_rows_dense)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [14]:
# Split the raw messages (not the already-vectorised matrix) so that the
# vocabulary can be learned from the training part only. Fitting the vectorizer
# on all messages lets information about the test set leak into the features.
# The 80/20 split is stratified on y to keep the ham/spam ratio in both parts,
# and seeded for reproducibility.
msg_train, msg_test, y_train, y_test = train_test_split(
    df['Message'], y, test_size=0.2, random_state=17, stratify=y
)

# `cv` is re-bound to the training-only vectorizer so that it matches the
# feature space the models below are trained on.
cv = CountVectorizer(stop_words="english")
X_train = cv.fit_transform(msg_train)
# Tokens that occur only in the test messages are ignored by transform().
X_test = cv.transform(msg_test)

In [None]:
# Class index -> display name, shared by the confusion matrices and reports.
# 'ham' is the negative class (index 0), 'spam' the positive class (index 1).
# Both lists are derived from one ordered mapping so they stay aligned.
class_names_by_index = {0: 'ham', 1: 'spam'}
labels_for_cm = list(class_names_by_index.keys())
target_names = list(class_names_by_index.values())

### Multinomial Naive Bayes (MultinomialNB)

In [16]:
# Multinomial Naive Bayes trained on the token-count features.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

y_pred = mnb.predict(X_test)

# Rows of the matrix are true classes and columns predicted classes, in the
# order given by labels_for_cm (0 = ham, 1 = spam), so ravel() yields
# tn, fp, fn, tp with spam as the positive class.
cm = confusion_matrix(y_test, y_pred, labels=labels_for_cm)
tn, fp, fn, tp = cm.ravel()

print("Confusion Matrix:")
print(cm)
print("\nTrue Negatives (ham as ham):", tn)
print("False Positives (ham as spam):", fp)
print("False Negatives (spam as ham):", fn)
# Colon moved outside the parenthesis to match the three labels above.
print("True Positives (spam as spam):", tp)

print("#" * 20)
# Per-class precision / recall / F1, reported with readable class names.
print(classification_report(y_test, y_pred, labels=labels_for_cm, target_names=target_names))

Confusion Matrix:
[[955  11]
 [ 10 139]]

True Negatives (ham as ham): 955
False Positives (ham as spam): 11
False Negatives (spam as ham): 10
True Positives (spam as spam:) 139
####################
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       966
        spam       0.93      0.93      0.93       149

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### K-Nearest Neighbors (KNN)

In [17]:
# k-nearest neighbours (k = 5) trained on the token-count features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Rows of the matrix are true classes and columns predicted classes, in the
# order given by labels_for_cm (0 = ham, 1 = spam), so ravel() yields
# tn, fp, fn, tp with spam as the positive class.
cm = confusion_matrix(y_test, y_pred, labels=labels_for_cm)
tn, fp, fn, tp = cm.ravel()

print("Confusion Matrix:")
print(cm)
print("\nTrue Negatives (ham as ham):", tn)
print("False Positives (ham as spam):", fp)
print("False Negatives (spam as ham):", fn)
# Colon moved outside the parenthesis to match the three labels above.
print("True Positives (spam as spam):", tp)

print("#" * 20)
# Per-class precision / recall / F1, reported with readable class names.
print(classification_report(y_test, y_pred, labels=labels_for_cm, target_names=target_names))

Confusion Matrix:
[[966   0]
 [101  48]]

True Negatives (ham as ham): 966
False Positives (ham as spam): 0
False Negatives (spam as ham): 101
True Positives (spam as spam:) 48
####################
              precision    recall  f1-score   support

         ham       0.91      1.00      0.95       966
        spam       1.00      0.32      0.49       149

    accuracy                           0.91      1115
   macro avg       0.95      0.66      0.72      1115
weighted avg       0.92      0.91      0.89      1115



### Logistic Regression

In [18]:
# Logistic regression with the liblinear solver, trained on the token-count
# features; max_iter is raised to 1000 iterations.
lr = LogisticRegression(max_iter=1000, solver='liblinear')
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

# Rows of the matrix are true classes and columns predicted classes, in the
# order given by labels_for_cm (0 = ham, 1 = spam), so ravel() yields
# tn, fp, fn, tp with spam as the positive class.
cm = confusion_matrix(y_test, y_pred, labels=labels_for_cm)
tn, fp, fn, tp = cm.ravel()

print("Confusion Matrix:")
print(cm)
print("\nTrue Negatives (ham as ham):", tn)
print("False Positives (ham as spam):", fp)
print("False Negatives (spam as ham):", fn)
# Colon moved outside the parenthesis to match the three labels above.
print("True Positives (spam as spam):", tp)

print("#" * 20)
# Per-class precision / recall / F1, reported with readable class names.
print(classification_report(y_test, y_pred, labels=labels_for_cm, target_names=target_names))

Confusion Matrix:
[[966   0]
 [ 21 128]]

True Negatives (ham as ham): 966
False Positives (ham as spam): 0
False Negatives (spam as ham): 21
True Positives (spam as spam:) 128
####################
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.86      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

