In [146]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Specify the URL of the CSV file
url = "https://storm.cis.fordham.edu/~gweiss/classes/cisc5660/data/sms-spam-dataset.csv"

# Read CSV data
data = pd.read_csv(url, encoding = "latin-1")

# Extract Text Data
X_data = data['Text']
y = data['Class']

ngrams = (1, 1)

# Create CountVectorizer
vectorizer = CountVectorizer(min_df=2, lowercase=True, ngram_range=ngrams, stop_words='english', max_features=500)

X_data_counts = vectorizer.fit_transform(X_data.values.astype('U'))

In [147]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(X_data_counts)
X_data_tf = tf_transformer.transform(X_data_counts)

In [148]:
print(X_data_counts[0:1])
print(vectorizer.vocabulary_)

  (0, 39)	1
  (0, 173)	1
  (0, 482)	1
  (0, 171)	1
  (0, 458)	1
{'available': 39, 'great': 173, 'world': 482, 'got': 171, 'wat': 458, 'ok': 297, 'lar': 223, 'wif': 470, 'free': 153, 'entry': 139, 'win': 473, 'final': 146, 'text': 401, 'receive': 341, 'question': 334, 'txt': 438, 'rate': 336, 'cs': 101, 'apply': 33, 'dun': 131, 'say': 356, 'early': 133, 'dont': 122, 'think': 410, 'goes': 166, 'hey': 195, 'weeks': 465, 'word': 478, 'id': 208, 'like': 236, 'fun': 158, 'xxx': 488, 'send': 363, '50': 13, 'brother': 62, 'speak': 387, 'set': 367, 'friends': 155, 'network': 284, 'customer': 102, 'selected': 362, 'prize': 330, 'claim': 82, 'code': 86, 'valid': 445, 'hours': 202, 'mobile': 274, 'update': 441, 'latest': 226, 'colour': 89, 'camera': 71, 'im': 210, 'gonna': 169, 'home': 198, 'soon': 385, 'want': 455, 'talk': 398, 'stuff': 394, 'tonight': 426, 'ive': 213, 'today': 419, 'cash': 74, '100': 1, 'pounds': 326, 'cost': 99, '150p': 5, 'day': 107, '16': 7, 'reply': 343, 'urgent': 443, 'won'

In [149]:
names_ = vectorizer.get_feature_names_out()
print(names_)

['10' '100' '1000' '10p' '150' '150p' '150ppm' '16' '18' '1st' '2000'
 '250' '2nd' '50' '500' '5000' '750' '800' '8007' '86688' 'able' 'abt'
 'account' 'actually' 'address' 'aft' 'afternoon' 'ah' 'aight' 'alright'
 'amp' 'angry' 'answer' 'apply' 'ard' 'ask' 'asked' 'attempt' 'auction'
 'available' 'await' 'award' 'awarded' 'away' 'awesome' 'babe' 'baby'
 'bad' 'beautiful' 'bed' 'believe' 'best' 'better' 'big' 'birthday' 'bit'
 'bonus' 'book' 'bored' 'box' 'boy' 'bring' 'brother' 'bt' 'bus' 'busy'
 'buy' 'called' 'calling' 'calls' 'came' 'camera' 'car' 'care' 'cash'
 'cause' 'chance' 'change' 'charge' 'chat' 'check' 'chikku' 'claim'
 'class' 'close' 'club' 'code' 'collect' 'collection' 'colour' 'com'
 'come' 'comes' 'coming' 'company' 'congrats' 'contact' 'cool' 'cos'
 'cost' 'coz' 'cs' 'customer' 'da' 'dad' 'dat' 'date' 'day' 'days' 'dear'
 'decimal' 'delivery' 'den' 'details' 'did' 'didnt' 'dinner' 'dis' 'does'
 'doesnt' 'doing' 'don' 'dont' 'double' 'draw' 'dreams' 'drink' 'drive'
 '

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X_data_tf, y, test_size=0.2, random_state=42)
classifier = MultinomialNB()   ###
classifier.fit(X_train, y_train)

In [151]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score
print("Multinomial NB with ngrams (1,1), unbalanced.")
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred, pos_label='spam')
print("Precision:", precision)

recall = recall_score(y_test, y_pred, pos_label='spam')
print("Recall:", recall)

confusion_matrix_result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_matrix_result)

# Calculate F1-measure (harmonic mean of precision and recall)
f1 = f1_score(y_test, y_pred, pos_label='spam')
print("F1-measure:", f1)

Multinomial NB with ngrams (1,1), unbalanced.
Accuracy: 0.9713004484304932
Precision: 0.9538461538461539
Recall: 0.8266666666666667
Confusion Matrix:
 [[959   6]
 [ 26 124]]
F1-measure: 0.8857142857142857


In [152]:
classifier.feature_log_prob_[0, :].argsort()[::-1]

array([210, 297, 217, 250,  91, 174, 122, 171, 170, 220, 209, 236, 442,
       416, 168, 198, 386, 103, 249, 455, 245, 282, 107, 225, 196, 419,
       410, 363, 498, 195, 114, 461, 296, 399, 318, 406, 493, 289, 344,
       423, 241, 109, 490, 313,  35, 120, 356, 173,  45, 199, 311, 480,
       254, 213, 458, 353, 261, 185, 395, 277, 263, 340,  66, 147,  30,
        83, 169, 489, 401, 115, 404, 467, 232, 275, 462, 426, 288, 449,
       181,  93, 408, 421, 235, 315, 364, 293, 499,  98, 227, 270, 262,
       385, 497, 279, 496, 224, 162, 453, 144, 153, 286, 379, 474, 413,
       203, 477, 495, 187,  97, 192, 204, 131,  73, 444, 328, 309, 450,
       191, 214, 251, 394, 155, 434, 398, 468,  55,  28, 188, 412,  72,
       247, 338, 332, 376, 148, 342,  52, 116, 331, 257,  27, 176, 460,
        80, 348, 135, 409, 223, 260, 403, 371, 152, 316, 405, 459,  64,
       234, 240,  46, 435,  53, 242, 491, 108, 295, 310, 369, 306, 266,
       354, 388,  23, 430, 105, 150, 485, 392, 343,  70, 358, 26

In [153]:
def topTenWords(vocabulary):
    from collections import defaultdict
    log_probs = classifier.feature_log_prob_[1, :]
  
    word_probs = defaultdict(float)
    for i, word in enumerate(vocabulary):
        word_probs[word] = log_probs[i]


    sorted_words = sorted(word_probs.items(), key=lambda item: item[1], reverse=True)


    num_to_print = 10
    print("Top", num_to_print, "words most likely to appear in spam:")
    for i in range(num_to_print):
        word, prob = sorted_words[i]
        print(f"{word}: {prob:.4f}")
        
topTenWords(vectorizer.vocabulary_)

Top 10 words most likely to appear in spam:
need: -3.8589
bring: -4.1194
contact: -4.2767
baby: -4.3095
tones: -4.3422
message: -4.3609
eve: -4.4253
guy: -4.4420
sleeping: -4.4422
working: -4.5471


In [154]:
from sklearn.svm import SVC    ####

print("SVC with ngrams (1,1), unbalanced.")

classifier = SVC()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
precision = precision_score(y_test, y_pred, pos_label='spam')
print("Precision:", precision)
recall = recall_score(y_test, y_pred, pos_label='spam')
print("Recall:", recall)
confusion_matrix_result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_matrix_result)
f1 = f1_score(y_test, y_pred, pos_label='spam')
print("F1-measure:", f1)

SVC with ngrams (1,1), unbalanced.
Accuracy: 0.97847533632287
Precision: 0.9701492537313433
Recall: 0.8666666666666667
Confusion Matrix:
 [[961   4]
 [ 20 130]]
F1-measure: 0.915492957746479


In [155]:
ngrams = (1, 3) ####
classifier = MultinomialNB()
vectorizer = CountVectorizer(min_df=2, lowercase=True, ngram_range=ngrams, stop_words='english', max_features=500)
X_data_counts = vectorizer.fit_transform(X_data.values.astype('U'))
tf_transformer = TfidfTransformer(use_idf=True).fit(X_data_counts)
X_data_tf = tf_transformer.transform(X_data_counts)


def runExperiment(ngrams):    
    X_train, X_test, y_train, y_test = train_test_split(X_data_tf, y, test_size=0.2, random_state=42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    precision = precision_score(y_test, y_pred, pos_label='spam')
    print("Precision:", precision)
    recall = recall_score(y_test, y_pred, pos_label='spam')
    print("Recall:", recall)
    confusion_matrix_result = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", confusion_matrix_result)
    f1 = f1_score(y_test, y_pred, pos_label='spam')
    print("F1-measure:", f1)
    
    
print("Multinomial NB with ngrams (1,3), unbalanced.")
runExperiment(ngrams)
topTenWords(vectorizer.vocabulary_)

Multinomial NB with ngrams (1,3), unbalanced.
Accuracy: 0.968609865470852
Precision: 0.952755905511811
Recall: 0.8066666666666666
Confusion Matrix:
 [[959   6]
 [ 29 121]]
F1-measure: 0.8736462093862815
Top 10 words most likely to appear in spam:
trying: -3.8405
den: -4.0941
draw: -4.2701
hurt: -4.2957
weekly: -4.3262
date: -4.3617
guy: -4.4161
heart: -4.4170
oso: -4.4325
xmas: -4.5618


In [156]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

ngrams = (1, 3) ####
classifier = MultinomialNB()
vectorizer = CountVectorizer(min_df=2, lowercase=True, ngram_range=ngrams, stop_words='english', max_features=500)
X_data_counts = vectorizer.fit_transform(X_data.values.astype('U'))
tf_transformer = TfidfTransformer(use_idf=True).fit(X_data_counts)
X_data_tf = tf_transformer.transform(X_data_counts)
X_data_tf, y = smote.fit_resample(X_data_tf, y)  # Reshape for SMOTE

print("Multinomial NB with ngrams (1,3), SMOTE balanced.")
runExperiment(ngrams)
topTenWords(vectorizer.vocabulary_)

Multinomial NB with ngrams (1,3), SMOTE balanced.
Accuracy: 0.9310880829015544
Precision: 0.9043824701195219
Recall: 0.9608465608465608
Confusion Matrix:
 [[889  96]
 [ 37 908]]
F1-measure: 0.9317598768599281
Top 10 words most likely to appear in spam:
trying: -3.6754
den: -3.9259
draw: -4.0641
weekly: -4.1022
date: -4.1769
hurt: -4.1923
oso: -4.2674
heart: -4.2716
xmas: -4.2730
guy: -4.2955
