# Prepare libraries: install all needed packages

In [1]:
# %pip (unlike !pip) always installs into the environment of the running kernel.
%pip install -qq faiss-cpu
%pip install -qq transformers
%pip install -qq pandas
%pip install -qq numpy
%pip install -qq scikit-learn
%pip install -qq tqdm
# Used further down (imports / the gdown download) but missing from the list above.
%pip install -qq nltk matplotlib gdown

In [4]:
# %pip targets the running kernel's environment.
# NOTE(review): no visible cell imports torch - confirm this install is still needed.
%pip install torch torchvision torchaudio


Collecting torch
  Downloading torch-2.7.1-cp312-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.7.1-cp312-none-macosx_11_0_arm64.whl (68.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading torchvision-0.22.1-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchaudio-2.7.1-cp312-cp312-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m

In [14]:
# Standard library
import string

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources once all imports are in place.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/questionminded/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/questionminded/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/questionminded/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## Download and read the dataset

In [2]:
# Download the spam/ham CSV from Google Drive by file id (saved as 2cls_spam_text_cls.csv).
!gdown 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /Users/questionminded/tài liệu học tập/AIVN/2cls_spam_text_cls.csv
100%|████████████████████████████████████████| 486k/486k [00:00<00:00, 3.63MB/s]


In [36]:
# Relative path: the gdown cell saves the CSV into the notebook's working
# directory, so the notebook no longer depends on one user's home folder.
DATASET_PATH = '2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [37]:
# Pull the text and label columns out of the DataFrame as plain Python lists.
messages = df['Message'].tolist()
labels = df['Category'].tolist()

In [8]:
# Spot-check the raw message at index 1.
messages[1]

'Ok lar... Joking wif u oni...'

In [9]:
# Spot-check the label at index 1.
labels[1]

'ham'

# Preprocess

## label encoder

In [12]:
# Map the string labels to integer class ids with scikit-learn's LabelEncoder
# (for this dataset: 'ham' -> 0, 'spam' -> 1).
label_encoder = LabelEncoder().fit(labels)
transformed_label = label_encoder.transform(labels)

print(f'Classes: {label_encoder.classes_}')
print(f'Encoded labels: {transformed_label}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


In [59]:
# One encoded label per entry of `labels`.
transformed_label.shape

(5572,)

## preprocess String-typed data

In [30]:
def lowercase(text: str) -> str:
    """Return ``text`` converted to lower case."""
    return text.lower()

def punctuation_removal(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    any other character is left untouched.
    """
    # A translation table that maps a character to None deletes that character.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(strip_table)

def tokenize(text: str) -> list[str]:
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    return text.split() # nltk.word_tokenize(text) could be used here instead

def remove_stopwords(tokens, stop_words=None):
    """Drop stop words from a list of tokens, keeping the original order.

    Args:
        tokens: iterable of (already lower-cased) string tokens.
        stop_words: optional iterable of words to drop. When omitted, NLTK's
            English stop-word list is used.

    Returns:
        A new list with every token that is not a stop word.
    """
    if stop_words is None:
        # Load the NLTK list once and memoise it on the function object, so the
        # corpus is not re-read for every message.
        if not hasattr(remove_stopwords, '_english_stop_words'):
            remove_stopwords._english_stop_words = frozenset(
                nltk.corpus.stopwords.words('english')
            )
        stop_words = remove_stopwords._english_stop_words
    else:
        # Set membership is O(1); a list would be scanned once per token.
        stop_words = frozenset(stop_words)
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    """Return the Porter stem of every token, in the original order."""
    porter = nltk.PorterStemmer()
    return list(map(porter.stem, tokens))

def preprocess_text(text):
    """Turn one raw message into a list of normalised tokens.

    Pipeline, in order: lower-case, strip punctuation, whitespace-tokenise,
    drop stop words, stem.

    NOTE(review): punctuation is stripped before stop words are filtered, so a
    contraction such as "don't" reaches the filter as "dont" - check whether
    that still matches the stop-word list.
    """
    cleaned = punctuation_removal(lowercase(text))
    kept_tokens = remove_stopwords(tokenize(cleaned))
    return stemming(kept_tokens)

In [38]:
# Read the raw text from the DataFrame rather than from `messages` itself, so
# re-running this cell does not feed already-tokenised lists back into
# preprocess_text (which would fail when calling .lower() on a list).
messages = [preprocess_text(message) for message in df['Message'].tolist()]

## create dictionary of words

In [49]:
def create_dictionary(messages):
    """Collect the vocabulary of a tokenised corpus.

    Args:
        messages: iterable of token lists (one list per message).

    Returns:
        List of the distinct tokens, ordered by first appearance.
    """
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication without scanning a list for every token.
    return list(dict.fromkeys(token for tokens in messages for token in tokens))

# Vocabulary built from every preprocessed message (before any train/test split).
dictionary = create_dictionary(messages)


In [51]:
# Vocabulary size, i.e. the length of each feature vector.
len(dictionary)

8190

## create features (for each sentence)

In [52]:
def create_features(tokens, dictionary):
    """Build a bag-of-words count vector for one tokenised message.

    Args:
        tokens: iterable of tokens for a single message.
        dictionary: the vocabulary, either as a sequence of tokens (position =
            feature index) or as a precomputed ``{token: index}`` dict.

    Returns:
        1-D numpy array of length ``len(dictionary)`` where entry ``i`` counts
        the occurrences of vocabulary token ``i``; tokens that are not in the
        vocabulary are ignored.
    """
    if isinstance(dictionary, dict):
        token_to_index = dictionary
    else:
        # One pass to build a hash lookup instead of two linear scans
        # (`in` + `.index`) of the vocabulary for every token.
        token_to_index = {}
        for index, word in enumerate(dictionary):
            # First occurrence wins, matching list.index().
            token_to_index.setdefault(word, index)

    features = np.zeros(len(dictionary))
    for token in tokens:
        index = token_to_index.get(token)
        if index is not None:
            features[index] += 1
    return features

In [53]:
# One count vector per message, stacked into a 2-D feature matrix.
X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [57]:
# (number of messages, vocabulary size)
X.shape

(5572, 8190)

# TRAIN and TEST model

In [60]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle reproducible.
test_size = 0.2
seed = 0

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    transformed_label,
    test_size=test_size,
    random_state=seed,
    shuffle=True,
)

In [62]:
%%time
# MultinomialNB on the token-count features; GaussianNB() is imported as an alternative.
print('Start training...')
model = MultinomialNB().fit(X_train, Y_train)
print('Training completed!')

Start training...
Training completed!
CPU times: user 145 ms, sys: 57.1 ms, total: 202 ms
Wall time: 177 ms


## model assessment

In [64]:
# Accuracy on the held-out test split.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=y_test_pred)
print(f'Test accuracy: {test_accuracy}')

Test accuracy: 0.9820627802690582


# prediction

In [None]:
def predict(text, model, dictionary):
    """Classify a single raw message.

    Args:
        text: the raw message string.
        model: fitted classifier exposing ``predict``.
        dictionary: the vocabulary used to build the training features.

    Returns:
        The predicted class label, decoded back to its original string via the
        notebook-level ``label_encoder``.
    """
    processed_text = preprocess_text(text)
    # Featurise the preprocessed tokens, not the raw string: iterating the raw
    # string would count single characters instead of stemmed words.
    features = create_features(processed_text, dictionary)
    # 1-D count vector -> (1, n_features) matrix, i.e. a batch with one sample.
    features = np.array(features).reshape(1, -1)
    print(features.shape)
    prediction = model.predict(features)
    print(prediction)
    prediction_cls = label_encoder.inverse_transform(prediction)[0]

    return prediction_cls

In [66]:
# Try the prediction helper on a hand-written message.
test_input = 'I am actually thinking a way of doing something useful'
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

(1, 8190)
[0]
Prediction: ham
