## Spam Classifier from scratch (with Python)

In [13]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Read Data

In [14]:
# Load the tab-separated SMS corpus. Column names are supplied explicitly via
# `names`, yielding a `label` column and a `message` column.
data = pd.read_csv(
    'datasets/SMSSpamCollection.csv',
    sep="\t",
    names=["label", "message"],
)
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Clean Data

In [25]:
# clean data
# For every message: keep letters only, lower-case, split on whitespace,
# drop English stop words, lemmatize the remaining tokens, and re-join them
# into a single space-separated string. Results are collected in `corpus`,
# one cleaned string per row of `data`, in row order.
lemmatizer = WordNetLemmatizer()

# The stop-word set does not change between tokens or messages, so build it
# once here instead of inside the comprehension's filter (where it would be
# rebuilt for every single token).
english_stopwords = set(stopwords.words('english'))

# Anything that is not an ASCII letter is replaced by a space.
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly rather than looking rows up by
# integer label, so the loop does not depend on `data` having a 0..n-1 index.
for message in data['message']:
    tokens = non_letters.sub(' ', message).lower().split()
    # Stop words are filtered on the raw lower-cased token, before lemmatizing.
    kept = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in english_stopwords]
    corpus.append(' '.join(kept))

corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

### Create Datasets

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features (the model input X): fit a count vectorizer on the
# cleaned corpus, capped at 5000 vocabulary entries, then densify the result.
cv = CountVectorizer(max_features=5000)
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()
X[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [51]:
# Target vector: one-hot encode the label column and keep only the second
# indicator column as a NumPy array.
# NOTE(review): with the ham/spam labels shown above this is presumably the
# `spam` indicator (1 = spam, 0 = ham) — confirm the column order.
y = pd.get_dummies(data['label']).iloc[:, 1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

### Create Model

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [60]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training split.
spam_classifier_model = MultinomialNB()
spam_classifier_model.fit(X_train, y_train)


In [61]:
# Predict a label for every row of the held-out split.
y_predict = spam_classifier_model.predict(X=x_test)
y_predict

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [68]:
# Display the true labels of the held-out split for a visual comparison with
# the predictions in y_predict.
y_test

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

### Evaluate Model

In [69]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels versus predicted labels on the held-out split.
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)
cm

array([[944,  11],
       [  9, 151]])

|Spam       | Predicted No | Predicted Yes  |
| ---       | ---          |  ---           |
| Actual No |944           | 11             |
| Actual Yes|9             | 151            |

In [67]:
from sklearn.metrics import accuracy_score

# Fraction of held-out messages whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
accuracy

0.9820627802690582