In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix
)
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import norm

# Apply the seaborn dark-grid look to every figure in the notebook.
# Matplotlib 3.6 renamed its bundled seaborn styles with a "seaborn-v0_8-"
# prefix, so try the current name first and the legacy one second; this keeps
# the first cell from failing with an OSError on older installations.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither bundled style sheet exists: let seaborn apply its own dark grid.
    sns.set_style("darkgrid")
sns.set_palette("husl")

print("✅ Librerías importadas correctamente")

---
# TEMA 6.III - NAIVE BAYES
## Clasificación Probabilística Ingenua

---
## SECCIÓN 1: Fundamentos de Probabilidad

### 1.1 Concepto de Probabilidad Condicional

$$P(A|B) = \frac{P(A \cap B)}{P(B)}$$

Es la probabilidad de que ocurra A sabiendo que B ya ocurrió (definida solo cuando $P(B) > 0$).

In [None]:
# Example: disease screening test.
# How likely is it that a person is sick given that the test came back positive?
P_enfermo = 0.01                      # prior: P(sick)
P_sano = 1 - P_enfermo                # complement of the prior, so both always sum to 1
P_test_pos_dado_enfermo = 0.99        # P(Test+ | sick)
P_test_pos_dado_sano = 0.05           # P(Test+ | healthy)

# Evidence P(Test+), via the law of total probability over sick / healthy
P_test_pos = (P_test_pos_dado_enfermo * P_enfermo) + (P_test_pos_dado_sano * P_sano)
# Posterior P(sick | Test+), via Bayes' theorem
P_enfermo_dado_test_pos = (P_test_pos_dado_enfermo * P_enfermo) / P_test_pos

print("=" * 60)
print("TEST DE ENFERMEDAD - TEOREMA DE BAYES")
print("=" * 60)
print(f"\nP(Enfermo | Test+) = {P_enfermo_dado_test_pos:.1%}")
print(f"\nInterpretación: Incluso dando positivo, solo hay {P_enfermo_dado_test_pos:.1%} probabilidad de estar enfermo")

### 1.2 El Teorema de Bayes

$$P(\text{Clase}|\text{Datos}) = \frac{P(\text{Datos}|\text{Clase}) \cdot P(\text{Clase})}{P(\text{Datos})}$$

En clasificación:
- **P(Clase | Datos)**: Probabilidad posterior (lo que queremos)
- **P(Datos | Clase)**: Verosimilitud
- **P(Clase)**: Prior
- **P(Datos)**: Evidencia

---
## SECCIÓN 2: Naive Bayes - La Suposición Ingenua

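**Asume que todas las características son condicionalmente independientes entre sí, dada la clase.** Gracias a ello, la verosimilitud se factoriza como un producto:

$$P(x_1, \dots, x_n|\text{Clase}) = \prod_{i=1}^{n} P(x_i|\text{Clase})$$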

¿Es realista? ❌ NO

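¿Por qué funciona? ✅
- Simplifica los cálculos: basta con estimar una distribución por característica y clase
- Da buenos resultados en la práctica: para clasificar solo importa qué clase obtiene la mayor probabilidad, no su valor exacto
- Es rápido y necesita pocos datos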

---
## SECCIÓN 3: Naive Bayes Gaussiano

### 3.1 Distribución Normal por clase

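Asumimos que, dentro de cada clase $c$, cada característica $x_i$ sigue una distribución normal con su propia media $\mu_{i,c}$ y varianza $\sigma_{i,c}^2$:

$$P(x_i|\text{Clase}=c) = \frac{1}{\sqrt{2\pi\sigma_{i,c}^2}} \exp\left(-\frac{(x_i - \mu_{i,c})^2}{2\sigma_{i,c}^2}\right)$$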

In [None]:
# Load Iris into a feature DataFrame and a label Series
iris = load_iris()
X_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
y_iris = pd.Series(iris.target, name="species")

# Two-panel figure for a single feature:
#   left  -> per-class histograms of the observed values
#   right -> normal density built from each class's mean and standard deviation
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, pdf_ax = axes
feature_idx = 0
feature_name = iris.feature_names[feature_idx]

feature_values = X_iris[feature_name]
x_range = np.linspace(feature_values.min(), feature_values.max(), 200)

for class_idx in np.unique(y_iris):
    data = feature_values[y_iris == class_idx]
    class_label = iris.target_names[class_idx]

    # Left panel: raw counts for this class
    hist_ax.hist(data, alpha=0.6, label=class_label, bins=15)

    # Right panel: normal curve with this class's mean / std, shaded underneath
    mu, sigma = data.mean(), data.std()
    y_dist = norm.pdf(x_range, mu, sigma)
    pdf_ax.plot(x_range, y_dist, lw=2, label=class_label)
    pdf_ax.fill_between(x_range, y_dist, alpha=0.2)

# Both panels share the same decoration apart from the y-label and the title
panel_text = (
    (hist_ax, 'Frecuencia', 'Distribución por Clase'),
    (pdf_ax, 'Densidad', 'Distribuciones Normales'),
)
for panel, y_label, panel_title in panel_text:
    panel.set_xlabel(feature_name)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 3.2 Entrenamiento

In [None]:
# Hold out 30% of the samples for testing; stratify so that every species keeps
# the same proportion in both splits, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_iris, y_iris, test_size=0.3, random_state=42, stratify=y_iris
)

nb_gaussian = GaussianNB()
nb_gaussian.fit(X_train, y_train)

print("=" * 60)
print("PARÁMETROS APRENDIDOS")
print("=" * 60)

# For each class, report the prior together with the fitted Gaussian parameters:
# one mean and one standard deviation per feature.
for class_idx, class_name in enumerate(iris.target_names):
    print(f"\n{class_name}:")
    print(f"  Prior: {nb_gaussian.class_prior_[class_idx]:.3f}")

    class_means = nb_gaussian.theta_[class_idx]
    # var_ stores variances; take the square root to report standard deviations
    class_stds = np.sqrt(nb_gaussian.var_[class_idx])
    for column, mean, std in zip(X_train.columns, class_means, class_stds):
        print(f"  {column}: μ = {mean:.3f}, σ = {std:.3f}")

### 3.3 Predicción

In [None]:
# Predict the held-out samples with the fitted Gaussian model
y_pred = nb_gaussian.predict(X_test)

banner = "=" * 70
print(banner)
print("EVALUACIÓN")
print(banner)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {test_accuracy:.3f}")
print("\nReporte:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Confusion matrix as an annotated heatmap (rows: true class, columns: predicted)
cm = confusion_matrix(y_test, y_pred)
class_names = iris.target_names
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set_xlabel('Predicción')
ax_cm.set_ylabel('Real')
ax_cm.set_title('Matriz de Confusión')
fig_cm.tight_layout()
plt.show()

---
## SECCIÓN 4: Naive Bayes Multinomial (Textos)

Para clasificar textos usando conteos de palabras.

In [None]:
# Toy sentiment corpus as (review text, label) pairs; label 1 is shown as POS,
# anything else as NEG.
labeled_reviews = [
    ("This movie is amazing and fantastic", 1),
    ("I loved this film absolutely wonderful", 1),
    ("Terrible movie waste of time", 0),
    ("This is a bad and boring film", 0),
    ("Great acting and excellent story", 1),
    ("Horrible and disappointing", 0),
    ("Best movie ever made", 1),
    ("Worst film I have ever seen", 0),
]

# Parallel lists consumed by the vectorizer / classifier cells
reviews = [text for text, _ in labeled_reviews]
labels = [polarity for _, polarity in labeled_reviews]

print("RESEÑAS DE PELÍCULAS")
print("=" * 60)
# Numbered listing, starting at 1
for position, (review, label) in enumerate(zip(reviews, labels), start=1):
    if label == 1:
        sentiment = "POS"
    else:
        sentiment = "NEG"
    print(f"{position}. [{sentiment}] {review}")

In [None]:
# Bag-of-words representation: lowercase the text and drop English stop words
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
X_text = vectorizer.fit_transform(reviews)

vocabulary = vectorizer.get_feature_names_out()
print("\nVocabulario:", ', '.join(vocabulary))

# Fit the multinomial model on the word counts
nb_multi = MultinomialNB()
nb_multi.fit(X_text, labels)

# Unseen reviews must go through the already-fitted vectorizer (transform only)
new_reviews = [
    "This movie is amazing",
    "Terrible and boring film",
]
X_new = vectorizer.transform(new_reviews)
predictions = nb_multi.predict(X_new)
probabilities = nb_multi.predict_proba(X_new)

separator = "=" * 60
print("\n" + separator)
print("PREDICCIONES")
print(separator)

for idx, review in enumerate(new_reviews):
    pred = predictions[idx]
    probs = probabilities[idx]
    if pred == 1:
        sentiment = "POSITIVO"
    else:
        sentiment = "NEGATIVO"
    print(f"\n{review}")
    print(f"  Predicción: {sentiment}")
    # The largest class probability is reported as the confidence
    print(f"  Confianza: {max(probs):.1%}")

---
## SECCIÓN 5: Detección de Spam

Caso real: Clasificar emails como spam o legítimos.

In [None]:
# Short example messages for each category
spam_words = [
    "Click here now",
    "Limited time offer",
    "Free money today",
    "Act now immediately",
    "Call now",
]

ham_words = [
    "Hi how are you",
    "Meeting tomorrow",
    "Thanks for your help",
    "See you soon",
    "Have a great day",
]

# Training corpus: spam first (label 1), then legitimate messages (label 0)
all_texts = [*spam_words, *ham_words]
all_labels = [1 for _ in spam_words] + [0 for _ in ham_words]

# Word counts -> multinomial Naive Bayes
vectorizer_spam = CountVectorizer(lowercase=True, stop_words='english')
X_spam = vectorizer_spam.fit_transform(all_texts)

nb_spam = MultinomialNB()
nb_spam.fit(X_spam, all_labels)

# Accuracy measured on the same data the model was fitted on
train_accuracy = nb_spam.score(X_spam, all_labels)
print(f"Accuracy en training: {train_accuracy:.1%}")

new_emails = [
    "Click here for amazing offer",
    "Hi let's meet tomorrow",
]

# Reuse the fitted vectorizer so new emails map onto the training vocabulary
X_new_emails = vectorizer_spam.transform(new_emails)
spam_pred = nb_spam.predict(X_new_emails)
spam_proba = nb_spam.predict_proba(X_new_emails)

rule = "=" * 60
print("\n" + rule)
print("PREDICCIONES DE SPAM")
print(rule)

for email, pred, proba in zip(new_emails, spam_pred, spam_proba):
    if pred == 1:
        status = "⚠️ SPAM"
    else:
        status = "✅ LEGÍTIMO"
    print(f"\n{email}")
    print(f"  {status}")
    # The largest class probability is reported as the confidence
    print(f"  Confianza: {max(proba):.1%}")

---
## CONCLUSIONES

✅ **Ventajas:**
- Rápido de entrenar
- Funciona con pocos datos
- Interpretable
- Excelente para textos

❌ **Desventajas:**
- Suposición de independencia no siempre válida
- No captura relaciones complejas

📊 **Casos de uso:**
- Clasificación de textos
- Detección de spam
- Análisis de sentimientos
- Filtros de correo