In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the SMS spam CSV, discard the three unnamed extra columns and give the
# remaining two descriptive names.
# The explicit ISO-8859-1 encoding is required: per the original author's note,
# reading this file with the default UTF-8 decoding did not work.
df = (
    pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding="ISO-8859-1")
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "target", "v2": "message"})
)
df

In [None]:
# Quick sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

# Build a simple Bag of Words from Scratch

In [None]:
# Small hand-written corpus used to demonstrate a bag of words built by hand.
docs = [
    "Here at the Wall",
    "What are the main reasons for.....",
    "There are 700 possiblities that Alex will meet Alex Prime",
    "Alpha prime is the member of Prime Groups",
    "Is that all you got ?",
]


# Preprocess the text 


def preprocess(text):
    """Turn a raw string into a list of lowercase alphanumeric tokens.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance. Never contains empty strings;
        an input with no alphanumeric characters yields an empty list.
    """
    text = text.lower()
    # Every non-alphanumeric character is replaced by a space, so punctuation
    # (including runs such as "....." or ", ") turns into runs of spaces.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # split() with no separator collapses whitespace runs and ignores leading /
    # trailing whitespace. split(" ") would instead emit an empty-string token
    # for every extra space, which would then be counted as a "word".
    return text.split()


# Tokenize every document in the corpus, then print the token lists.
preprocessed_docs = list(map(preprocess, docs))
print(preprocessed_docs)
    

In [None]:
def create_bow(docs):
    """Build a bag-of-words representation for a collection of tokenized documents.

    Parameters
    ----------
    docs : iterable of iterable
        Each element is one document, given as a sequence of tokens.

    Returns
    -------
    list of dict
        One dict per document, in input order, mapping each token to the
        number of times it occurs in that document.
    """

    def tally(tokens):
        # Count occurrences of each token within a single document.
        freq = {}
        for token in tokens:
            if token in freq:
                freq[token] += 1
            else:
                freq[token] = 1
        return freq

    return [tally(tokens) for tokens in docs]

# Display the per-document token counts for the tokenized documents.
create_bow(preprocessed_docs)

# Dealing with the Dataset at Hand

In [None]:
def preprocess_text(text):
    """Normalize a message to lowercase alphanumeric words separated by single spaces.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The cleaned message; an empty string if the input contains no
        alphanumeric characters.
    """
    text = text.lower()
    # Replace every non-alphanumeric character with a space.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # split() already discards leading/trailing whitespace and never yields
    # empty or blank tokens, so no extra strip() or filtering is needed
    # before re-joining with single spaces.
    return " ".join(text.split())

In [None]:
# Run every message through preprocess_text, overwriting the message column
# with the cleaned text, then show the frame.
df["message"] = df["message"].apply(preprocess_text)
df

In [None]:
# Split the data into Train/Test (10% held out for testing).
# stratify=df.target keeps the class proportions of the target column the same
# in both partitions. random_state is fixed so the split, and therefore every
# metric reported below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df.message.values,
    df.target.values,
    test_size=0.1,
    stratify=df.target,
    random_state=42,
)

In [None]:
# In sklearn you can create a BOW using the CountVectorizer() class.
# stop_words='english' tells it to use its built-in English stop-word list.

bow = CountVectorizer(stop_words='english')

In [None]:
# Fit the bag of words on the training docs only, so the vocabulary is
# built without looking at the held-out test messages.
bow.fit(X_train)

In [None]:
# Convert both splits to count matrices using the fitted vectorizer.
# NOTE(review): X_train / X_test are rebound from raw text to the transformed
# matrices, so re-running this cell without re-running the split cell would
# feed already-transformed data back into transform().
X_train, X_test = [bow.transform(split) for split in (X_train, X_test)]

In [None]:
# Train a multinomial Naive Bayes classifier on X_train (the count matrix
# produced by bow.transform in the previous cell) and the training labels.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split. Predict once and reuse the predicted labels
# for every metric instead of re-running the classifier per metric.
# "spam" is treated as the positive class for precision / recall / F1.
y_pred = naive_bayes.predict(X_test)

print(f'Accuracy : {accuracy_score(y_test, y_pred):.3f}')
print(f'Precision : {precision_score(y_test, y_pred, pos_label="spam"):.3f}')
print(f'Recall : {recall_score(y_test, y_pred, pos_label="spam"):.3f}')
print(f'F1-Score : {f1_score(y_test, y_pred, pos_label="spam"):.3f}')

# Testing the Model

In [None]:
# Test sample input

text = "You've Won! Winning an unexpected prize sounds great in theory. ..."
# Clean the sample with the same function that was applied to the dataset messages.
p_text = preprocess_text(text)
print(p_text)
# The single cleaned string is wrapped in a list before being passed to transform().
# NOTE(review): p_text is rebound here from the cleaned string to the
# vectorized result, so the name refers to two different kinds of object.
p_text = bow.transform([p_text])
naive_bayes.predict(p_text)