# Text Classification with Naive Bayes and Logistic Regression

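This notebook demonstrates a text classification pipeline on a small, hand-written dataset of eight labeled product reviews. The reviews are converted to bag-of-words counts and used to train two classifiers, Naive Bayes and Logistic Regression. The steps include data loading, preprocessing, model training, evaluation, and performance comparison. Because the dataset is tiny, the reported scores are illustrative only.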

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

## Load Dataset

In [None]:
# Toy sentiment corpus: each entry pairs a review with its label
# (1 = positive, 0 = negative).
labeled_reviews = [
    ("I love this product!", 1),
    ("This is the worst experience I've had.", 0),
    ("Absolutely fantastic service.", 1),
    ("Terrible quality, very disappointed.", 0),
    ("I will recommend this to my friends.", 1),
    ("Not worth the money.", 0),
    ("Excellent value for the price.", 1),
    ("Horrible experience, never again.", 0),
]

# Reshape the pairs into the column-oriented mapping pandas consumes directly.
data = {
    "text": [review for review, _ in labeled_reviews],
    "label": [sentiment for _, sentiment in labeled_reviews],
}

df = pd.DataFrame(data)
print(df.head())

## Preprocess Data

In [None]:
# Hold out 30% of the rows for testing. With only a handful of samples, an
# unstratified shuffle can leave one class out of the test set entirely
# (which makes F1 and the per-class reports meaningless), so stratify on the
# label to keep both classes on each side of the split.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.3,
    random_state=42,  # fixed seed so the split is reproducible across runs
    stratify=df["label"],
)
assert y_train.nunique() == y_test.nunique() == df["label"].nunique(), (
    "both classes must be present in the train and test splits"
)

# Convert text to bag-of-words counts. The vocabulary is learned from the
# training texts only; the test texts are then mapped onto that same
# vocabulary, so nothing from the held-out rows leaks into the features.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

print("Vocabulary size:", len(vectorizer.vocabulary_))


## Train Classification Models

### Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier on the training count matrix.
# fit() returns the fitted estimator, so construction and training chain.
nb_model = MultinomialNB().fit(X_train_vectorized, y_train)

# Score the held-out texts: predict their labels, then compute F1 against
# the true test labels.
nb_predictions = nb_model.predict(X_test_vectorized)
nb_f1 = f1_score(y_true=y_test, y_pred=nb_predictions)
print("Naive Bayes F1 Score:", nb_f1)

### Logistic Regression

In [None]:
# Build a logistic regression classifier with library defaults and fit it on
# the same training count matrix used for the other model.
lr_model = LogisticRegression().fit(X=X_train_vectorized, y=y_train)

# Evaluate on the held-out texts: predicted labels first, then the F1 score
# of those predictions against the true test labels.
lr_predictions = lr_model.predict(X_test_vectorized)
lr_f1 = f1_score(y_true=y_test, y_pred=lr_predictions)
print("Logistic Regression F1 Score:", lr_f1)

## Evaluate and Compare Models

### Classification Reports

In [None]:
# Print a per-class precision/recall/F1 table for each fitted model.
# - labels/target_names pin the rows to both classes (0 = Negative,
#   1 = Positive), so the two reports always line up even if a class is
#   missing from the test labels or from a model's predictions.
# - zero_division=0 reports 0.0 for a metric whose denominator is empty
#   (e.g. precision for a class that is never predicted) instead of emitting
#   an UndefinedMetricWarning, which is likely on a test set this small.
predictions_by_model = {
    "Naive Bayes": nb_predictions,
    "Logistic Regression": lr_predictions,
}

for model_name, model_predictions in predictions_by_model.items():
    print(model_name, "Classification Report:")
    print(
        classification_report(
            y_test,
            model_predictions,
            labels=[0, 1],
            target_names=["Negative", "Positive"],
            zero_division=0,
        )
    )

### Confusion Matrix

In [None]:
# Draw one confusion matrix per model, side by side, on the held-out set.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# (fitted estimator, panel title, colormap) for each subplot, left to right.
confusion_panels = [
    (nb_model, "Naive Bayes", "Blues"),
    (lr_model, "Logistic Regression", "Greens"),
]

for panel_ax, (estimator, panel_title, colormap) in zip(axes, confusion_panels):
    ConfusionMatrixDisplay.from_estimator(
        estimator,
        X_test_vectorized,
        y_test,
        # Pin the axes to the classes the model was trained on; otherwise the
        # matrix shape depends on which labels happen to occur in the test
        # labels and predictions, and the two panels may not be comparable.
        labels=estimator.classes_,
        ax=panel_ax,
        cmap=colormap,
    )
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## Plot F1 Score Comparison

In [None]:
# Bar chart comparing the two models' F1 scores. The y-axis is fixed to
# [0, 1], the full range of the F1 metric, so bar heights are directly
# comparable.
models = ['Naive Bayes', 'Logistic Regression']
f1_scores = [nb_f1, lr_f1]

f1_fig, f1_ax = plt.subplots()
f1_ax.bar(models, f1_scores, color=['blue', 'green'])
f1_ax.set_title("F1 Score Comparison")
f1_ax.set_ylabel("F1 Score")
f1_ax.set_ylim(0, 1)
plt.show()