In [116]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# 1. Load the SMS spam dataset from the Hugging Face Hub.
dataset = load_dataset("sms_spam")
# Materialize both columns as plain Python lists so pandas and scikit-learn
# receive ordinary sequences regardless of the container type that
# `datasets` returns for a column lookup.
texts = list(dataset['train']['sms'])
labels = list(dataset['train']['label'])

# 2. Build a DataFrame and split it into training and test partitions.
df = pd.DataFrame({'text': texts, 'label': labels})
# Stratify on the very column that is being split (not on a separate copy of
# the labels) so the arrays passed to train_test_split are guaranteed to be
# aligned; random_state pins the shuffle so the split is reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label'])


AttributeError: 'Series' object has no attribute 'type'

In [117]:
# Extra tokens filtered out on top of scikit-learn's built-in English stop-word list.
my_stopwords = {'com', 'gt', 'just', 'know', 'll', 'lt', 'ok', 'ur', 'www'}
stop_words = list(text.ENGLISH_STOP_WORDS | my_stopwords)

# Bag-of-words model over lower-cased, purely alphabetic tokens.
vectorizer = CountVectorizer(
    lowercase=True,
    token_pattern=r'(?u)\b[a-zA-Z]{2,}\b',  # letters only, at least two of them
    stop_words=stop_words,
    min_df=2,            # drop terms that appear in fewer than 2 messages
    max_df=0.9,          # drop terms that appear in more than 90% of messages
    max_features=1000,   # cap the vocabulary size
)

# Labels as a NumPy array so they can be used as a boolean row mask later on.
y_train = train_labels.to_numpy()
# The vocabulary is learned from the training messages only and then reused,
# unchanged, to encode the test messages.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)



In [118]:
# Smoothed word distribution for the spam class (label 1), shown as a top-10 table.
cls = 1
X_spam = X_train[y_train == cls]
# Add-one smoothing: every vocabulary term gets one extra count.
total_word_count_spam = X_spam.sum(axis=0) + 1
# P(word | spam)
prob_word_given_spam = np.asarray(
    total_word_count_spam / total_word_count_spam.sum()
).flatten()
palabras_vocab = np.array(vectorizer.get_feature_names_out())
df_prob_spam = (
    pd.DataFrame({
        'Palabra': palabras_vocab,
        'P(palabra | spam)': prob_word_given_spam,
    })
    .sort_values(by='P(palabra | spam)', ascending=False)
    .head(10)
)
df_prob_spam

Unnamed: 0,Palabra,P(palabra | spam)
291,free,0.02908
890,txt,0.020817
840,text,0.015732
805,stop,0.015414
538,mobile,0.015096
124,claim,0.014778
696,reply,0.013666
661,prize,0.011441
101,cash,0.010011
895,uk,0.009534


In [119]:
# 4. Hand-written multinomial Naive Bayes classifier
def train_naive_bayes(X, y, alpha=1.0, classes=(0, 1)):
    """Estimate class priors and smoothed per-class word distributions.

    Parameters
    ----------
    X : 2-D count matrix (scipy sparse or NumPy array), one row per document
        and one column per vocabulary term.
    y : sequence of class labels, one per row of ``X``. Lists, NumPy arrays
        and pandas Series are accepted.
    alpha : additive smoothing added to every term count (1.0 = Laplace).
    classes : labels to fit, in the order they should appear in the
        returned dictionaries.

    Returns
    -------
    (class_priors, cond_probs)
        ``class_priors[c]`` is the fraction of rows labelled ``c``;
        ``cond_probs[c]`` is a 1-D array with P(term | c) that sums to 1.

    Raises
    ------
    ValueError
        If ``y`` is empty, since no prior can be estimated.
    """
    # Coerce to an array: comparing a plain list with a scalar yields a single
    # bool instead of an element-wise mask, which would silently zero the priors.
    y = np.asarray(y)
    if y.size == 0:
        raise ValueError("train_naive_bayes requires at least one training sample")

    class_priors = {}
    cond_probs = {}
    for cls in classes:
        mask = y == cls
        class_priors[cls] = np.mean(mask)
        # Sparse matrices return a (1, n_terms) np.matrix from sum(); flatten
        # it to a 1-D float array before smoothing and normalizing.
        term_counts = np.asarray(X[mask].sum(axis=0), dtype=float).ravel() + alpha
        cond_probs[cls] = term_counts / term_counts.sum()
    return class_priors, cond_probs

def predict_naive_bayes(X, class_priors, cond_probs):
    """Predict the most probable class for every row of a count matrix.

    Parameters
    ----------
    X : 2-D count matrix (scipy sparse or NumPy array), one row per document.
    class_priors : dict mapping class label -> prior probability.
    cond_probs : dict mapping class label -> 1-D array of P(term | class),
        aligned with the columns of ``X``.

    Returns
    -------
    list
        One predicted label per row of ``X``. When two classes score exactly
        the same, the one that comes first in ``class_priors`` wins.
    """
    classes = list(class_priors)
    log_prior = np.log(np.array([class_priors[c] for c in classes], dtype=float))
    # One row of log P(term | class) per class: shape (n_classes, n_terms).
    log_cond = np.log(
        np.vstack([np.asarray(cond_probs[c], dtype=float).ravel() for c in classes])
    )
    # log P(class) + sum_t count_t * log P(t | class) for all rows at once.
    # The matrix product keeps X sparse instead of densifying it row by row.
    scores = np.asarray(X @ log_cond.T) + log_prior
    # argmax returns the first maximum, matching the dict-order tie-breaking.
    best = np.argmax(scores, axis=1)
    return [classes[j] for j in best]

# Fit the hand-written model on the training split, then label the test split.
priors, conds = train_naive_bayes(X=X_train, y=y_train)
manual_preds = predict_naive_bayes(
    X=X_test,
    class_priors=priors,
    cond_probs=conds,
)


In [120]:
# Class priors returned by train_naive_bayes: the fraction of training rows carrying each label.
priors

{0: np.float64(0.8658892128279884), 1: np.float64(0.13411078717201166)}

In [None]:
# 5. Compare against scikit-learn's MultinomialNB, trained on the same features.

sk_model = MultinomialNB()
sk_model.fit(X_train, train_labels)
sk_preds = sk_model.predict(X_test)

# Evaluation: accuracy of both models on the held-out test split.
manual_acc = accuracy_score(test_labels, manual_preds)
sk_acc = accuracy_score(test_labels, sk_preds)

# Side-by-side table: message, true label and both predictions.
resultados = pd.DataFrame({
    "Mensaje": test_texts.values,
    "Etiqueta Real": test_labels.values,
    "Predicción Manual": manual_preds,
    "Predicción sklearn": sk_preds
})

# One value drives both the caption and the preview so they cannot disagree
# (the caption used to say 10 while the code requested 15 rows).
n_preview = 10

print("=== Resultados de Evaluación ===")
print(f"Precisión modelo Naive Bayes manual:    {manual_acc:.4f}")
print(f"Precisión modelo MultinomialNB sklearn: {sk_acc:.4f}")
print(f"\nPrimeros {n_preview} resultados comparativos:")
resultados.head(n_preview)


=== Resultados de Evaluación ===
Precisión modelo Naive Bayes manual:    0.9785
Precisión modelo MultinomialNB sklearn: 0.9785

Primeros 10 resultados comparativos:


Unnamed: 0,Mensaje,Etiqueta Real,Predicción Manual,Predicción sklearn
0,No message..no responce..what happend?\n,0,0,0
1,At WHAT TIME should i come tomorrow\n,0,0,0
2,Come to my home for one last time i wont do an...,0,0,0
3,Get ur 1st RINGTONE FREE NOW! Reply to this ms...,1,1,1
4,See you there! \n,0,0,0
5,Great. So should i send you my account number.\n,0,0,0
6,WIN: We have a winner! Mr. T. Foley won an iPo...,1,1,1
7,Do we have any spare power supplies\n,0,0,0
8,Ok good then i later come find ü... C lucky i ...,0,0,0
9,Ok try to do week end course in coimbatore.\n,0,0,0
