In [1]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


# Download the SMS spam corpus and pull out the message text and its label.
dataset = load_dataset("sms_spam")
texts, labels = dataset['train']['sms'], dataset['train']['label']

# Hold out 20% of the messages for evaluation. Stratifying on the label keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
df = pd.DataFrame({'text': texts, 'label': labels})
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=labels,
)


In [2]:
# Extra filler tokens to discard on top of scikit-learn's built-in English
# stop-word list.
my_stopwords = {'ok', 'ur', 'll', 'gt', 'lt', 'just', 'know', 'www', 'com'}
stop_words = list(text.ENGLISH_STOP_WORDS | my_stopwords)

# Bag-of-words features: lower-cased, purely alphabetic tokens of two or more
# letters, with the vocabulary restricted by document frequency and size.
vectorizer = CountVectorizer(
    token_pattern=r'(?u)\b[a-zA-Z]{2,}\b',
    lowercase=True,
    stop_words=stop_words,
    min_df=2,
    max_df=0.9,
    max_features=1000,
)

# The vocabulary is learned from the training messages only; the test messages
# are encoded with that same vocabulary.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)
y_train = train_labels.to_numpy()

In [3]:
# Estimate P(word | spam) with add-one smoothing and show the ten words with
# the highest probability.
cls = 1
spam_rows = y_train == cls
X_spam = X_train[spam_rows]

# Adding 1 to every vocabulary entry keeps words never seen in this class from
# getting probability zero.
total_word_count_spam = X_spam.sum(axis=0) + 1
spam_mass = total_word_count_spam.sum()

# P(word | spam)
prob_word_given_spam = np.asarray(total_word_count_spam / spam_mass).flatten()
palabras_vocab = np.array(vectorizer.get_feature_names_out())

df_prob_spam = (
    pd.DataFrame({
        'Palabra': palabras_vocab,
        'P(palabra | spam)': prob_word_given_spam,
    })
    .sort_values(by='P(palabra | spam)', ascending=False)
    .head(10)
)
df_prob_spam

Unnamed: 0,Palabra,P(palabra | spam)
291,free,0.02908
890,txt,0.020817
840,text,0.015732
805,stop,0.015414
538,mobile,0.015096
124,claim,0.014778
696,reply,0.013666
661,prize,0.011441
101,cash,0.010011
895,uk,0.009534


In [4]:
# Hand-written Naive Bayes classifier
def train_naive_bayes(X, y, alpha=1.0, classes=(0, 1)):
    """Estimate the parameters of a multinomial Naive Bayes model.

    Parameters
    ----------
    X : matrix of shape (n_samples, n_features)
        Per-document word counts. Must support boolean row indexing and
        ``.sum(axis=0)`` (e.g. a SciPy sparse matrix or a NumPy array).
    y : array-like of length n_samples
        Class label of each row of ``X``. Lists and other sequences are
        accepted; they are converted to a NumPy array first.
    alpha : float, default 1.0
        Additive smoothing added to every per-class word count, so a word
        never seen in a class still gets a non-zero probability. The default
        reproduces add-one (Laplace) smoothing.
    classes : iterable, default (0, 1)
        Class labels to fit, in the order they should appear in the returned
        dictionaries.

    Returns
    -------
    class_priors : dict
        Maps each class label to the fraction of rows carrying that label.
    cond_probs : dict
        Maps each class label to a 1-D array of length n_features holding the
        smoothed P(word | class); each array sums to 1.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # With a plain list, `y == cls` would be a single bool rather than a row
    # mask, silently producing zero priors and wrong counts.
    y = np.asarray(y)

    class_priors = {}
    cond_probs = {}
    for cls in classes:
        in_class = y == cls
        class_priors[cls] = np.mean(in_class)
        # Smoothed total count of each word over the documents of this class.
        total_word_count = X[in_class].sum(axis=0) + alpha
        # Sparse input yields a 1 x n_features matrix here; flatten to 1-D.
        cond_probs[cls] = np.asarray(total_word_count / total_word_count.sum()).flatten()
    return class_priors, cond_probs

def predict_naive_bayes(X, class_priors, cond_probs):
    """Predict the most probable class for every row of ``X``.

    For each row the score of a class is
    ``log P(class) + sum_w count(w) * log P(w | class)``; the class with the
    highest score wins. All rows are scored with a single matrix product
    instead of densifying and looping over rows one by one.

    Parameters
    ----------
    X : matrix of shape (n_samples, n_features)
        Per-document word counts; a SciPy sparse matrix or a dense NumPy array.
    class_priors : dict
        Maps each class label to its prior probability.
    cond_probs : dict
        Maps each class label to a 1-D array of P(word | class) of length
        n_features.

    Returns
    -------
    list
        One predicted class label (a key of ``class_priors``) per row of
        ``X``. On an exact tie the class that comes first in ``class_priors``
        is chosen.
    """
    classes = list(class_priors)

    # The logarithms do not depend on the row being scored, so take them once.
    log_priors = np.log([class_priors[cls] for cls in classes])
    log_likelihoods = np.log(np.vstack([cond_probs[cls] for cls in classes]))

    # (n_samples, n_features) @ (n_features, n_classes) -> per-class scores.
    scores = np.asarray(X @ log_likelihoods.T) + log_priors

    # argmax returns the first maximum, so ties resolve in dictionary order.
    best = np.argmax(scores, axis=1)
    return [classes[j] for j in best]

# Fit the hand-written model on the training split, then label the held-out
# messages with it.
priors, conds = train_naive_bayes(X_train, y_train)
manual_preds = predict_naive_bayes(X_test, class_priors=priors, cond_probs=conds)


In [5]:
# Show the class priors returned by train_naive_bayes: a dict mapping each
# class label to the fraction of training rows with that label.
print("A priori, probabildiad de ser spam o no")
priors

A priori, probabildiad de ser spam o no


{0: np.float64(0.8658892128279884), 1: np.float64(0.13411078717201166)}

In [6]:
def fit_and_predict(model):
    """Fit ``model`` on the training features and return its test predictions."""
    model.fit(X_train, train_labels)
    return model.predict(X_test)


# Every baseline is trained on the same bag-of-words features so that the
# accuracies below are directly comparable.
lr_model = LogisticRegression(max_iter=1000)
lr_preds = fit_and_predict(lr_model)

# Seeded like the random forest: LinearSVC shuffles the data when it uses the
# dual solver, so without a seed repeated runs may differ.
svm_model = LinearSVC(random_state=42)
svm_preds = fit_and_predict(svm_model)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_preds = fit_and_predict(rf_model)

knn_model = KNeighborsClassifier(n_neighbors=3)
knn_preds = fit_and_predict(knn_model)

sk_model = MultinomialNB()
sk_preds = fit_and_predict(sk_model)

# Accuracy of each model on the held-out messages.
manual_acc = accuracy_score(test_labels, manual_preds)
sk_acc = accuracy_score(test_labels, sk_preds)
lr_acc = accuracy_score(test_labels, lr_preds)
svm_acc = accuracy_score(test_labels, svm_preds)
rf_acc = accuracy_score(test_labels, rf_preds)
knn_acc = accuracy_score(test_labels, knn_preds)

print("Resultados de Evaluación")
for nombre, acc in [
    ('Naive Bayes manual:', manual_acc),
    ('MultinomialNB sklearn:', sk_acc),
    ('Logistic Regression:', lr_acc),
    ('SVM (LinearSVC):', svm_acc),
    ('Random Forest:', rf_acc),
    ('K-Nearest Neighbors:', knn_acc),
]:
    # Left-align the label in a 30-character column; four decimals for accuracy.
    print(f"{nombre:30} {acc:.4f}")

# Side-by-side table: one row per test message, one column per model.
resultados = pd.DataFrame({
    "Mensaje": test_texts.values,
    "Etiqueta Real": test_labels.values,
    "Predicción Manual": manual_preds,
    "MultinomialNB sklearn": sk_preds,
    "Logistic Regression": lr_preds,
    "SVM (LinearSVC)": svm_preds,
    "Random Forest": rf_preds,
    "K-Nearest Neighbors": knn_preds
})

print("\n10 resultados aleatorios comparativos:")
resultados.sample(10, random_state=42)


Resultados de Evaluación
Naive Bayes manual:            0.9785
MultinomialNB sklearn:         0.9785
Logistic Regression:           0.9812
SVM (LinearSVC):               0.9785
Random Forest:                 0.9758
K-Nearest Neighbors:           0.9489

10 resultados aleatorios comparativos:


Unnamed: 0,Mensaje,Etiqueta Real,Predicción Manual,MultinomialNB sklearn,Logistic Regression,SVM (LinearSVC),Random Forest,K-Nearest Neighbors
265,";-( oh well, c u later\n",0,0,0,0,0,0,0
101,From next month get upto 50% More Calls 4 Ur s...,1,1,1,1,1,0,0
1045,You are now unsubscribed all services. Get ton...,1,1,1,1,1,1,1
792,Love it! Daddy will make you scream with pleas...,0,0,0,0,0,0,0
902,Yes i have. So that's why u texted. Pshew...mi...,0,0,0,0,0,0,0
467,"Ha ha nan yalrigu heltini..Iyo kothi chikku, u...",0,0,0,0,0,0,0
781,I am not sure about night menu. . . I know onl...,0,0,0,0,0,0,0
306,"GOD ASKED, ""What is forgiveness?"" A little chi...",0,0,0,0,0,0,0
128,Hmmm:)how many players selected?\n,0,0,0,0,0,0,0
332,If anyone calls for a treadmill say you'll buy...,0,0,0,0,0,0,0
