<a href="https://colab.research.google.com/github/qhung23125005/AIO/blob/main/AIO24/Module2/Project/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This project aims to classify whether a text is a spam or not

In [8]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


#Preprocess data

##Load data

In [1]:
! gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 31.0MB/s]


In [4]:
DATASET_PATH = '/content/2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
messages = df['Message']. values.tolist ()
labels = df['Category']. values.tolist ()

##Process data

Step 1: Convert all text to lower case

Step 2: Eliminate all punctuation marks

Step 3: Split the text into individual words (tokens)

Step 4: Filter out common words that don't have meaning (the, a,...)

Step 5: Reduce words to their root form, grouping similar words together

In [5]:
def lower_case(text):
  return text.lower()

def punctuation_removal(text):
  translator = str.maketrans('', '', string.punctuation)
  return text.translate(translator)

def tokenization(text):
  return nltk.word_tokenize(text)

def remove_stopwords(tokens):
  stopwords = nltk.corpus.stopwords.words('english')
  return [word for word in tokens if word not in stopwords]

def stemming(tokens):
  stemmer = nltk.PorterStemmer()
  return [stemmer.stem(word) for word in tokens]

In [13]:
def preprocess_text(text):
  text = lower_case(text)
  text = punctuation_removal(text)
  tokens = tokenization(text)
  tokens = remove_stopwords(tokens)
  tokens = stemming(tokens)
  return tokens

messages = [ preprocess_text ( message ) for message in messages ]

##Create a dictionary and extract features

In [15]:
def create_dictionary(messages):
  dictionary = []
  for message in messages:
    for word in message:
      if word not in dictionary:
        dictionary.append(word)

  return dictionary

In [16]:
def create_features(tokens, dictionary):
  features = np.zeros(len(dictionary))
  for word in tokens:
    if word in dictionary:
        features[dictionary.index(word)] += 1
  return features

In [17]:
dictionary = create_dictionary(messages)
X = np.array([create_features(message, dictionary) for message in messages])

##Label and Divide the dataset

In [31]:
le = LabelEncoder()
y = le.fit_transform(labels)

print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

#count spam
print(f'Spam: {sum(y)}')
print(f'Ham: {len(y) - sum(y)}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]
Spam: 747
Ham: 4825


In [20]:
VAL_SIZE = 0.2
TEST_SIZE = 0.125
SEED = 0

X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                    test_size=TEST_SIZE,
                                                    shuffle = True,
                                                    random_state=SEED)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                    test_size=VAL_SIZE,
                                                    shuffle = True,
                                                    random_state=SEED)

#Training and Valididate

In [23]:
model = GaussianNB()
print('Starting training')
model.fit(X_train, y_train)
print('Training finished')

Starting training
Training finished


In [24]:
y_val_pred = model . predict ( X_val )
y_test_pred = model . predict ( X_test )

print('Validation accuracy: ', accuracy_score(y_val, y_val_pred))
print('Test accuracy: ', accuracy_score(y_test, y_test_pred))

Validation accuracy:  0.8708751793400287
Test accuracy:  0.882051282051282


In [32]:
def predict(text, model, dictionary):
  processed_text = preprocess_text ( text )
  features = create_features(text, dictionary)
  features = np. array(features).reshape(1 , -1)
  prediction = model.predict(features)
  prediction_cls = le.inverse_transform(prediction)[0]

  return prediction_cls

test_input = 'aaaaaaaaaaaaaaaaaaaaaa'
prediction_cls = predict ( test_input , model , dictionary )
print (f'Prediction : { prediction_cls }')

Prediction : spam
