# Simple Spam-Filter with MultinomialNB (>98% acc.)

![Spam](https://i.imgur.com/TCU4qIa.png)



# 1. Load the Dataset & Preprocessing

In [None]:
import numpy as np
import pandas as pd 

# Read the labelled messages; later cells use the "Message" and "Category" columns.
df = pd.read_csv(
    "../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv"
)
# Preview the first five rows.
df.head()

### Convert the messages into a matrix of token counts with CountVectorizer
More information about [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and build the sparse document-term count matrix in one step.
# NOTE(review): the vocabulary is fitted on every message, i.e. before the
# train/test split performed further down, so held-out messages contribute
# to the feature space. Consider fitting on the training messages only.
cv = CountVectorizer()
X = cv.fit_transform(df["Message"])

In [None]:
# Display the count vector of the first message.
# Slice the row *before* densifying: X.toarray() would materialise the whole
# (n_messages x vocabulary) matrix in memory just to look at a single row.
X[0].toarray()[0]

In [None]:
# Target labels: the "Category" column of the loaded frame.
y = df["Category"]
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore every downstream score) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# 2. Hyperparameter selection

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Candidate values tried for each MultinomialNB hyperparameter.
parameters = {
    "alpha": [0.2, 1, 2, 5, 10],
    "fit_prior": [True, False],
}

# Cross-validated search over every combination, using the training split only.
grid = GridSearchCV(MultinomialNB(), param_grid=parameters)
grid.fit(X_train, y_train)

# Rank the tried combinations by their mean cross-validation score, best first.
(
    pd.DataFrame(grid.cv_results_)
    .loc[:, ["params", "mean_test_score"]]
    .sort_values(by="mean_test_score", ascending=False)
)

In [None]:
# Display the best hyperparameters
grid.best_params_

# 3. The MultinomialNB Model

In [None]:
# Create the model with the best hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Both tuned values must be handed to the estimator. Previously only `alpha`
# was passed, so `fit_prior` silently fell back to the library default even
# when the grid search had selected a different value.
alpha, fit_prior = grid.best_params_['alpha'], grid.best_params_['fit_prior']
model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)

# Train on the training split and score on the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'Accuracy: {round(accuracy_score(y_test,y_pred),3)}\n')
print(classification_report(y_test,y_pred))

# 4. Examples

In [None]:
# Partition the labelled messages by category for the examples below.
is_spam = df["Category"].eq("spam")
is_ham = df["Category"].eq("ham")
df_spam = df[is_spam]
df_ham = df[is_ham]

In [None]:
def display_result(df, number=1):
    """Print the true and predicted category for the first rows of ``df``.

    Relies on two notebook-level objects: ``cv`` (its ``transform`` turns raw
    messages into feature vectors) and ``model`` (its ``predict`` labels those
    vectors).

    Parameters
    ----------
    df : DataFrame
        Must contain a "Message" column (raw text) and a "Category" column
        (true label).
    number : int, default 1
        How many leading rows to show. Values larger than ``len(df)`` are
        clamped to the available rows instead of raising ``IndexError``;
        zero or negative values print nothing.
    """
    sample = df.iloc[:max(number, 0)]
    if sample.empty:
        # Nothing to show; also avoids asking the model to predict on zero rows.
        return

    messages = sample["Message"].tolist()
    # Vectorise and classify the whole selection in one call rather than per row.
    predictions = model.predict(cv.transform(messages))

    for msg, label, pred_label in zip(messages, sample["Category"], predictions):
        print(f"E-Mail: {msg}\nReal category: {label}\nPredicted category:{pred_label}")
        print("\n")
    
# df_spam and df_ham were already derived from df in the preceding cell, so
# they are reused here rather than rebuilt.
display_result(df_spam)
display_result(df_ham)