In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Read the spam/ham dataset from the working directory and preview it.
# Later cells read the 'label', 'label_num' and 'text' columns of this frame.
df = pd.read_csv(filepath_or_buffer='spam_ham_dataset.csv')
df

In [None]:
# Print column names, non-null counts and dtypes.
# `info` is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the summary.
df.info()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics; with default arguments pandas reports numeric columns only.
df.describe()

In [None]:
# Element-wise missing-value mask (True where a cell is null); same shape as df.
df.isnull()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

In [None]:
# Class balance: relative frequency (density) of each value in the 'label' column.
sns.histplot(data=df, x='label', stat='density', discrete=True)

In [None]:
# Pairwise relationships between the frame's numeric columns, coloured by 'label_num'.
sns.pairplot(data=df, hue='label_num')

In [None]:
import nltk

# Fetch the NLTK resources this notebook relies on.
# 'stopwords' feeds the STOPWORDS list; 'wordnet'/'omw-1.4' presumably back the
# WordNetLemmatizer; 'punkt' is not referenced by the visible code -- TODO confirm
# whether it is still needed.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [None]:

import re
import unicodedata

from string import punctuation
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
 
# Shared text-processing resources, created once at module level and read by
# process_text. To inspect the stopword list, evaluate stopwords.words('english').
LEMMANTIZER = WordNetLemmatizer()   # used when use_stem_lemmantize == 'lemmantize'
STEMMER = PorterStemmer()           # used when use_stem_lemmantize == 'stem'
STOPWORDS = stopwords.words('english')

def process_text(
    text, tokenized=False, lowercase=True, remove_punctuation=True,
    remove_digit=True, remove_stopwords=True, add_stopwords=(),
    ngrams=None, use_stem_lemmantize='stem',
):
    """Clean a document and return its tokens (optionally expanded to n-grams).

    Processing order: Unicode NFKC normalisation -> optional lowercasing ->
    tokenisation -> digit stripping -> stopword removal -> dropping empty
    tokens -> stemming / lemmatising -> n-gram construction.

    Parameters
    ----------
    text : str or iterable of str
        Raw document, or an already tokenised sequence when ``tokenized=True``.
    tokenized : bool
        If True, ``text`` is treated as a sequence of tokens: each token is
        normalised/lowercased individually and no splitting or punctuation
        handling is applied.
    lowercase : bool
        Lowercase the text before tokenising.
    remove_punctuation : bool
        If True, split on runs of punctuation and whitespace (punctuation is
        discarded). If False, runs of punctuation are kept as their own tokens.
        Only applies when ``tokenized=False``.
    remove_digit : bool
        Strip every run of digits from each token (tokens left empty are dropped).
    remove_stopwords : bool
        Drop tokens whose lowercase form is in ``STOPWORDS`` or ``add_stopwords``.
    add_stopwords : iterable of str
        Extra stopwords, matched case-insensitively.
    ngrams : tuple(int, int) or None
        ``(min_n, max_n)`` inclusive. When given, the result is the list of all
        space-joined n-grams for n = min_n..max_n, grouped by increasing n.
    use_stem_lemmantize : str or None
        ``'stem'`` applies ``STEMMER``; ``'lemmantize'`` (or the correctly
        spelled ``'lemmatize'``) applies ``LEMMANTIZER``; any other value leaves
        tokens unchanged.

    Returns
    -------
    list of str

    Raises
    ------
    TypeError
        If ``text`` (or one of its tokens) is not a string.
    ValueError / TypeError
        If ``ngrams`` is not None and cannot be unpacked into two values.
    """
    if tokenized:
        # Pre-tokenised input: normalise token by token instead of treating
        # the whole sequence as one string.
        tokens = [unicodedata.normalize('NFKC', t) for t in text]
        if lowercase:
            tokens = [t.lower() for t in tokens]
    else:
        normalized = unicodedata.normalize('NFKC', text)
        if lowercase:
            normalized = normalized.lower()
        # Escape punctuation so characters such as ']', '\\', '^' and '-' are
        # always literals inside the character class.
        punct_class = re.escape(punctuation)
        if remove_punctuation:
            split_pattern = r'[{}\s]+'.format(punct_class)
        else:
            # Pad punctuation runs with spaces so they survive as tokens.
            normalized = re.sub(
                r'([{}]+)'.format(punct_class), r' \1 ', normalized
            )
            split_pattern = r'\s+'
        tokens = re.split(split_pattern, normalized)

    if remove_digit:
        tokens = [re.sub(r'\d+', '', t) for t in tokens]

    if remove_stopwords:
        # Built only when needed; comparison is case-insensitive.
        stopwords_ = {s.lower() for s in STOPWORDS}
        stopwords_.update(s.lower() for s in add_stopwords)
        tokens = [t for t in tokens if t.lower() not in stopwords_]

    # re.split yields '' at the edges and digit stripping can empty a token.
    tokens = [t for t in tokens if t]

    if use_stem_lemmantize == 'stem':
        tokens = [STEMMER.stem(t) for t in tokens]
    elif use_stem_lemmantize in ('lemmantize', 'lemmatize'):
        tokens = [LEMMANTIZER.lemmatize(t) for t in tokens]

    if ngrams is not None:
        min_ngram, max_ngram = ngrams
        base_tokens = tokens
        # A sequence of length L has L - n + 1 n-grams; the "+ 1" keeps the
        # final n-gram (and, for n == 1, the final token).
        tokens = [
            ' '.join(base_tokens[i:i + n])
            for n in range(min_ngram, max_ngram + 1)
            for i in range(len(base_tokens) - n + 1)
        ]
    return tokens

In [None]:
# Smoke test: run the preprocessing on one sample row and print the
# resulting unigrams and bigrams.
i = 605
text = df.loc[i, 'text']

print(process_text(text=text, ngrams=(1, 2), remove_punctuation=True))

In [None]:
# Features are the message text and the target is the numeric label code --
# the same columns the train/test split cell passes to train_test_split.
# (Previously the two were swapped: X held the label and y held the text.)
X = df['text']
y = df['label_num']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label_num'],
    random_state=2023,
    test_size=0.3,
)


In [None]:
def _analyze_document(doc):
    """Tokenise one document into stemmed unigrams and bigrams."""
    return process_text(doc, ngrams=(1, 2), use_stem_lemmantize='stem')


# Bag-of-words counts over the custom analyzer's output. Terms present in
# more than 85% of documents are dropped; min_df=1 keeps every other term.
vectorizer = CountVectorizer(
    analyzer=_analyze_document,
    min_df=1,
    max_df=0.85,
)

In [None]:
# Learn the vocabulary on the training split only, then reuse it for the test
# split so no test-set terms leak into the features.
# NOTE: X_train / X_test are rebound from raw text to the vectorizer's output,
# so this cell is not idempotent -- re-run the split cell before re-running it.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Number of distinct terms (unigrams and bigrams) kept in the fitted vocabulary.
len(vectorizer.vocabulary_)

In [None]:
# Multinomial naive Bayes on the term-count features; alpha is the additive
# smoothing parameter.
nb_model = MultinomialNB(alpha=0.01)
nb_model.fit(X=X_train, y=y_train)


In [None]:
# Human-readable class names ordered by their numeric code in 'label_num'.
# classification_report and confusion_matrix order classes by the sorted values
# of the target (label_num), so the names must follow that order rather than
# alphabetical order, which would only match by coincidence.
label_names = (
    df[['label_num', 'label']]
    .drop_duplicates()
    .sort_values('label_num')['label']
    .tolist()
)
label_names

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Predict on the held-out split and print per-class precision / recall / F1.
y_pred = nb_model.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=label_names,
    )
)

In [None]:
# Confusion matrix on the test split: rows are actual classes, columns are
# predicted classes, both labelled with the class names.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(
    cm,
    index=pd.Index(label_names, name='Actual'),
    columns=pd.Index(label_names, name='Predicted'),
)
# Trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(df_cm, annot=True, fmt='.0f', cmap="Blues");