In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Step 1: Read data
# Load the SMS spam CSV into a DataFrame; the file is decoded as ISO-8859-1.
filename = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
df = pd.read_csv(filename, encoding = "ISO-8859-1")
# NOTE(review): this is not the last expression of the cell, so a notebook
# will not render the preview; move it to the end of the cell to see it.
df.head()

data = df.to_numpy() # Convert dataframe into array
# NOTE(review): `le` is created here but never used in the cells visible in
# this notebook; presumably intended for encoding the target values (y).
le = LabelEncoder() # Used to encode target values (y)

In [None]:
# Split the array by column position: column 0 becomes the target vector (y),
# column 1 becomes the input vector (X).
y, X = data[:, 0], data[:, 1]

# Echo both vectors as a quick sanity check.
print(X)
print(y)

In [None]:
# Step 2, 3, & 4: Tokenization, Stopword Removal and Stemming
# Shared helpers used by getStemmedText below.
# Raw string: '\w' in a normal string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on newer Pythons); r'\w+' is the same pattern.
tokenizer = RegexpTokenizer(r'\w+')
# Set for O(1) membership tests while filtering tokens.
sw = set(stopwords.words('english'))
ps = PorterStemmer()

def getStemmedText(text):
    """Clean a single message and return it as one space-separated string.

    The text is lowercased, split into tokens with the module-level
    `tokenizer`, filtered against the stopword set `sw`, and every
    surviving token is stemmed with `ps`.
    """
    tokens = tokenizer.tokenize(text.lower())
    # Drop stopwords and stem the remaining tokens in a single pass.
    stems = (ps.stem(token) for token in tokens if token not in sw)
    return ' '.join(stems)

def getStemmedDocument(document):
    """Run getStemmedText over every item of `document`.

    Returns a list of cleaned strings in the same order as the input.
    """
    return [getStemmedText(entry) for entry in document]

# Clean every raw message in X (lowercase, tokenize, remove stopwords, stem).
stemmed_document = getStemmedDocument(X)
# View sample of stemmed document (first 10 cleaned messages)
stemmed_document[:10]

In [None]:
# Step 5: Vectorization
# Learn the vocabulary from the stemmed corpus and count term occurrences
# per message (one row per message, one column per vocabulary term).
cv = CountVectorizer()
vectorized_corpus = cv.fit_transform(stemmed_document)
# Use toarray() rather than todense(): todense() returns an np.matrix, which
# scikit-learn estimators deprecated in 1.0 and reject with a TypeError from 1.2.
# toarray() gives a plain ndarray with the same shape and values.
# NOTE: this rebinds X from the raw messages to the count matrix, so the
# stemming cell cannot be re-run without first re-running the cell that
# extracts X from `data`.
X = vectorized_corpus.toarray()

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Step 6: Train Naive Bayes model based on 2/3 of the data set
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, y_train)

# Step 7: Test the model's accuracy!
# Mean accuracy on the held-out rows; shown as the cell's output.
model.score(X_test, y_test)

In [None]:
# Applying model to external data
# A one-element list holding a hand-written message that is not taken from
# the dataset file loaded above.
sample_spam = ["""
    Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/123456 to claim it now.
"""]

def prepare_message(message):
    """Turn raw message text into count vectors the trained model can score.

    Parameters
    ----------
    message : str or iterable of str
        One message, or a collection of messages. A single string is
        treated as one message.

    Returns
    -------
    The result of `cv.transform` on the cleaned messages: one row per
    message, using the vocabulary `cv` learned earlier.
    """
    # A bare string would otherwise be iterated character by character,
    # producing one (meaningless) row per character.
    if isinstance(message, str):
        message = [message]
    d = getStemmedDocument(message)
    # Do not run fit_transform here: that would re-learn the vocabulary and
    # the columns would no longer line up with what the model was trained on.
    return cv.transform(d)

# Vectorize the sample with the already-fitted vectorizer, then classify it
# with the trained model and print the predicted label(s).
input_message = prepare_message(sample_spam)
y_pred = model.predict(input_message)
print(y_pred)