# Spam Classifier

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing Dataset

In [None]:
dataset = pd.read_csv("spam.csv", encoding = "ISO-8859-1")
y = np.array(dataset.iloc[:, 0].map({'ham':0, 'spam':1}))

## Converting the text data to a matrix of token counts.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(dataset.iloc[:, 1])

In [None]:
X = X.toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y 

array([0, 0, 1, ..., 0, 0, 0])

## Spliting the dataset into train and test set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y_train

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
y_test

array([0, 0, 0, ..., 0, 0, 1])

## Training the Naive Bayes Model on the Training Set

In [None]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

## Predicting a single result

In [None]:
classifier.predict([X_test[0]])

array([0])

## Predicting the test set result

In [None]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [1 0]
 [0 0]
 [1 1]]


## Making the Confusion Matrix and Calculating the Accuracy.

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
matrix

array([[1062,  134],
       [  20,  177]])

In [None]:
accuracy

0.8894472361809045