In [2]:
# Import necessary libraries

import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to C:\Users\Mini
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Mini
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the labelled SMS dataset; later cells read its 'Message' and 'Category' columns.
df = pd.read_csv(filepath_or_buffer='2cls_spam_text_cls.csv')

# Rich-render the frame for a quick visual check.
display(df)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Separate the raw inputs (message texts) from the targets (category labels),
# both as plain Python lists.
messages = df['Message'].tolist()
labels = df['Category'].tolist()

# Report the dataset size and preview the first few raw messages.
print("n_samples:", len(messages))
display(messages[:3])



n_samples: 5572


['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

In [5]:
# Demo: str.maketrans(x, y) maps the i-th character of x to the i-th character of y.
txt = 'Hi Sam!'
x, y = "mSA", "eJo"
mytable = str.maketrans(x, y)

# Apply the character-for-character substitution table to the sample text.
print(txt.translate(mytable))

Hi Jae!


In [6]:
# Demo: the optional third argument of str.maketrans lists characters to delete.
txt = 'Good night Sam!'
x, y = "mSA", 'eJo'
z = "odnght"

# Characters in x are replaced by their counterparts in y; characters in z are removed.
translator = str.maketrans(x, y, z)
print(txt.translate(translator))

G i Jae!


In [7]:
# Translation tables are keyed by Unicode code point; show the code points of 'm' and 'e'.
print(*map(ord, ('m', 'e')))

109 101


In [8]:
# Preprocessing

# String to vector with length v => technique: bag of words
def lowercase(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # In a translation table, mapping a code point to None deletes that character.
    deletion_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletion_table)

def tokenize(text):
    """Split ``text`` into a list of tokens with ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Drop English stop words from a sequence of tokens.

    Matching is exact (case-sensitive) against NLTK's English stop-word list,
    so callers should lowercase tokens first if case-insensitive removal is
    wanted.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the tokens that are not stop words, in their
        original order.
    """
    # A frozenset gives constant-time membership tests; testing against the
    # raw list would rescan the whole stop-word list for every token.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    """Return the Porter stem of every token, preserving order."""
    stem = nltk.PorterStemmer().stem
    return list(map(stem, tokens))





In [9]:
def preprocess_text(text):
    """Turn a raw message into a list of normalized tokens.

    Pipeline, in order: lowercase -> strip punctuation -> tokenize ->
    remove stop words -> stem.
    """
    cleaned = remove_punctuations(lowercase(text))
    return stemming(remove_stopwords(tokenize(cleaned)))


# Sanity check: run the full preprocessing pipeline on one sample sentence.
input_text = "The cat's toys are scattered everywhere!"

# Prints the token list returned by preprocess_text for the sample.
print(preprocess_text(input_text))


['cat', 'toy', 'scatter', 'everywher']


In [10]:
# Run every raw message through the preprocessing pipeline -> one token list per message.
message_lst = list(map(preprocess_text, messages))


In [None]:
# The raw messages are left untouched by preprocessing: show their count and the first one.
print(len(messages), messages[0], sep="\n")

5572
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [None]:
def create_dictionary(messages):
    """Build the vocabulary: every distinct token, in first-seen order.

    Args:
        messages: iterable of token sequences (one sequence per message).
            Tokens must be hashable (strings in this notebook).

    Returns:
        A list of the unique tokens, ordered by first occurrence.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # one linear pass instead of rescanning a growing list for every token.
    unique_tokens = dict.fromkeys(
        token for message in messages for token in message
    )
    return list(unique_tokens)


# Build the vocabulary (list of unique tokens) from the preprocessed messages
dictionary = create_dictionary(message_lst)

# Preview the first ten vocabulary entries and report the vocabulary size.
display(dictionary[:10])
print(len(dictionary))



['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la']

8166


In [None]:
# Create feature from string to vector

def create_feature(sentence, dictionary):
    """Encode a raw message as a bag-of-words count vector.

    Args:
        sentence: raw message text; it is passed through ``preprocess_text``.
        dictionary: sequence of vocabulary tokens. Element ``i`` of the
            result counts how often ``dictionary[i]`` occurs in the message.

    Returns:
        1-D ``numpy`` array of length ``len(dictionary)`` holding the token
        counts. Tokens that are not in the vocabulary are ignored.
    """
    feature = np.zeros(len(dictionary))

    # Token -> column lookup built once per call, replacing an `in` scan plus
    # an `.index` scan of the whole vocabulary for every token. setdefault
    # keeps the first position of a repeated entry, matching list.index.
    token_to_index = {}
    for position, vocab_token in enumerate(dictionary):
        token_to_index.setdefault(vocab_token, position)

    for token in preprocess_text(sentence):
        position = token_to_index.get(token)
        if position is not None:
            feature[position] += 1

    return feature

# Vectorize the first raw message as a quick check of create_feature.
input_message = messages[0]
feature = create_feature(input_message, dictionary)

# Show the message, then the vector's shape and its contents.
print(input_message, feature.shape, feature, sep="\n")

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
(8166,)
[1. 1. 1. ... 0. 0. 0.]


In [None]:
# Encode class labels from strings to integers

# `le` is kept around: predict() uses it later to map predictions back to class names.
le = LabelEncoder()
# NOTE: this rebinds `y`, which the earlier str.maketrans demo cells used for a string.
y = le.fit_transform(labels)
print(f"Classes: {le.classes_}")
print(f"Encoded labels: {y}")

# Confirm the container type of the encoded labels.
print(type(y))

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]
<class 'numpy.ndarray'>


In [None]:
# Three-way hold-out split: train / validation / test.

VAL_SIZE = 0.2      # share of the full dataset held out for validation
TEST_SIZE = 0.125   # share of the remaining 80%, i.e. 0.125 * 0.8 = 10% of the full dataset
SEED = 0
IS_SHUFFLE = True

# Feature matrix: one bag-of-words row per raw message.
X = np.array([create_feature(raw_message, dictionary) for raw_message in messages])

# Stage 1: carve the validation set out of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, random_state=SEED, shuffle=IS_SHUFFLE
)

# Stage 2: carve the test set out of what is left; the remainder is the final training set.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=TEST_SIZE, random_state=SEED, shuffle=IS_SHUFFLE
)

In [None]:
# Report each split's row count and its share of the full dataset (len(y) rows in total).
print(f"Number of training samples: {X_train.shape[0]} ~ {round(X_train.shape[0] / len(y), 2)}")
print(f"Number of val samples: {X_val.shape[0]} ~ {round(X_val.shape[0] / len(y), 2)}")
print(f"Number of test samples: {X_test.shape[0]} ~ {round(X_test.shape[0] / len(y), 2)}")



Number of training samples: 3899 ~ 0.7
Number of val samples: 1115 ~ 0.2
Number of test samples: 558 ~ 0.1


In [None]:
%%time
model = GaussianNB()
print('Start training...')
model = model.fit(X_train, y_train)
print('Training completed!')

Start training...
Training completed!
CPU times: total: 78.1 ms
Wall time: 513 ms


In [None]:
# Score the fitted model on the two held-out splits.

# Validation split
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Test split
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Val accuracy: {val_accuracy}")
print(f"Test accuracy: {test_accuracy}")

Val accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


In [None]:
# Implement a function to predict whether a message is spam or not

def predict(message, model=model, dictionary=dictionary, verbose=True):
    """Classify a single raw message and return its class name.

    Note: the ``model`` and ``dictionary`` defaults are bound when this cell
    is executed, so re-run the cell after retraining or rebuilding the
    vocabulary. The label encoder ``le`` is looked up at call time.

    Args:
        message: raw message text.
        model: fitted classifier exposing ``predict``.
        dictionary: vocabulary used to vectorize the message.
        verbose: when True (the default, matching the previous behavior),
            print the feature vector before and after reshaping.

    Returns:
        The predicted class label decoded with ``le.inverse_transform``.
    """
    # create_feature preprocesses the message itself, so no separate
    # preprocess_text pass is needed here.
    feature = create_feature(message, dictionary)
    if verbose:
        print(feature)

    # The model expects a 2-D (n_samples, n_features) input: one row here.
    feature = feature.reshape(1, -1)
    if verbose:
        print(feature)

    prediction = model.predict(feature)
    prediction_cls = le.inverse_transform(prediction)[0]

    return prediction_cls


# Smoke-test predict() on a single hand-written sentence.
test_input = 'I am actually thinking a way of doing something useful'
prediction_cls = predict(test_input)
print(f"Prediction: {prediction_cls}")

[0. 0. 0. ... 0. 0. 0.]
[[0. 0. 0. ... 0. 0. 0.]]
Prediction: ham


In [None]:
#######
# Multiple choice
#######

# Question 8: which tokens survive stop-word removal?
# Matching is exact, so the capitalised tokens and "noodle " (trailing space) are compared as written.
result = remove_stopwords(
    ["Pho", "is", "a", "popular", "Vietnamese", "noodle ", "soup"]
)

print(result)

['Pho', 'popular', 'Vietnamese', 'noodle ', 'soup']
