In [6]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Each sample is kept next to its label (1 = spam, 0 = ham) so the two
# sequences cannot drift out of alignment when rows are added or removed.
_labeled_samples = [
    ("buy cheap now", 1),
    ("limited offer buy now", 1),
    ("let's go to the park", 0),
    ("are you free tomorrow", 0),
    ("cheap deal just for you", 1),
]
texts, labels = map(list, zip(*_labeled_samples))


def tokenize(text):
    """Return the lower-cased tokens of ``text``, split on runs of whitespace.

    Punctuation is left attached to the surrounding token.
    """
    lowered = text.lower()
    return lowered.split()


# Vocabulary: every distinct token across `texts`, in first-seen order.
# dict.fromkeys de-duplicates with hash lookups while preserving insertion
# order, which avoids re-scanning a growing list for every token.
all_words = list(dict.fromkeys(
    word
    for sentence in texts
    for word in tokenize(sentence)
))

# Column position of each vocabulary word inside a count vector.
word_to_index = {word: index for index, word in enumerate(all_words)}



def text_to_vector(text):
    """Turn ``text`` into a list of per-word counts over the vocabulary.

    The result has one slot per entry in ``all_words``; slot ``i`` holds how
    many times the word mapped to ``i`` by ``word_to_index`` occurs in
    ``text``. Tokens that are not in the vocabulary are ignored.
    """
    counts = [0] * len(all_words)
    for token in tokenize(text):
        slot = word_to_index.get(token)
        if slot is None:
            # Out-of-vocabulary token: contributes nothing.
            continue
        counts[slot] += 1
    return counts

# Count matrix: one row per training sentence, one column per vocabulary word.
X = np.array([text_to_vector(sentence) for sentence in texts])
y = np.array(labels)

print(X)

# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X, y)

# Classify a single unseen message; predict() expects a 2-D array, hence the
# extra pair of brackets around the one vector.
test_text = "buy now cheap offer"
test_vector = np.array([text_to_vector(test_text)])
prediction = model.predict(test_vector)

print("Test Text:", test_text)
if prediction[0] == 1:
    verdict = "Spam"
else:
    verdict = "Ham"
print("Prediction:", verdict)

[[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1]]
Test Text: buy now cheap offer
Prediction: Spam


In [7]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Each sample is kept next to its label (1 = spam, 0 = ham) so the two
# sequences cannot drift out of alignment when rows are added or removed.
_labeled_samples = [
    ("buy cheap now", 1),
    ("limited offer buy now", 1),
    ("let's go to the park", 0),
    ("are you free tomorrow", 0),
    ("cheap deal just for you", 1),
    ("do you want to play football", 0),
    ("click here to win a prize", 1),
    ("meeting at 5 pm", 0),
    ("urgent offer limited time", 1),
    ("see you tomorrow at school", 0),
]

texts, labels = map(list, zip(*_labeled_samples))

# Tokenize
def tokenize(text):
    """Split ``text`` into lower-case, whitespace-delimited tokens."""
    tokens = text.lower().split()
    return tokens

# Build vocabulary: every distinct token across `texts`, in first-seen order.
# dict.fromkeys de-duplicates with hash lookups while preserving insertion
# order, which avoids re-scanning a growing list for every token.
all_words = list(dict.fromkeys(
    word
    for sentence in texts
    for word in tokenize(sentence)
))

# Column position of each vocabulary word inside a count vector.
word_to_index = {word: idx for idx, word in enumerate(all_words)}

# Convert text to vector
def text_to_vector(text):
    """Count how often each vocabulary word occurs in ``text``.

    Returns a list with ``len(all_words)`` entries, indexed through
    ``word_to_index``. Tokens outside the vocabulary are skipped.
    """
    counts = [0] * len(all_words)
    known_slots = (
        word_to_index[token]
        for token in tokenize(text)
        if token in word_to_index
    )
    for slot in known_slots:
        counts[slot] += 1
    return counts

# Vectorize data: one count row per sentence, stacked into a 2-D array.
X = np.array(list(map(text_to_vector, texts)))
y = np.array(labels)

# Train-test split: hold out 30% of the samples; the fixed seed keeps the
# partition the same on every run.
_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = _split

# Train model (fit() returns the estimator, so the calls are chained).
model = MultinomialNB().fit(X_train, y_train)

# Predict on the held-out rows and score against their true labels.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Accuracy, reported as a percentage with two decimals.
accuracy_pct = round(acc * 100, 2)
print("Accuracy on test set:", accuracy_pct, "%")


Accuracy on test set: 100.0 %


In [11]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np

# Step 1: restrict the corpus to two newsgroup categories to keep it simple.
# Headers, footers and quoted replies are stripped via `remove`.
categories = ['sci.med', 'comp.graphics']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Documents and their integer targets, taken straight from the loaded dataset.
texts, labels = newsgroups.data, newsgroups.target


# print("Article 1 (label:", newsgroups.target_names[labels[0]], "):")
# print(texts[0])
# print("\n" + "="*80 + "\n")

# # Print the second input text and its label
# print("Article 2 (label:", newsgroups.target_names[labels[1]], "):")
# print(texts[1])





def tokenize(text):
    """Lower-case ``text`` and break it into tokens at whitespace.

    No punctuation stripping is performed; tokens keep any attached symbols.
    """
    normalized = text.lower()
    return normalized.split()


# Vocabulary: every distinct token across `texts`, in first-seen order.
# The previous `word not in vocab` check scanned the whole list for every
# token, which is quadratic in vocabulary size. dict.fromkeys gives the same
# ordered, de-duplicated result using hash lookups.
vocab = list(dict.fromkeys(
    word
    for text in texts
    for word in tokenize(text)
))


# Column position of each vocabulary word inside a count vector.
word_to_index = {word: i for i, word in enumerate(vocab)}


def text_to_vector(text):
    """Build the bag-of-words count list for ``text``.

    The list has ``len(vocab)`` entries; the entry at ``word_to_index[w]`` is
    the number of occurrences of ``w`` in ``text``. Words that are absent
    from ``word_to_index`` are ignored.
    """
    counts = [0] * len(vocab)
    for token in tokenize(text):
        column = word_to_index.get(token)
        if column is not None:
            counts[column] += 1
    return counts


# Stack one count row per document into a 2-D array.
# NOTE(review): this is a dense (documents x vocabulary) array; with a large
# vocabulary it may need a lot of memory -- consider a sparse representation.
X = np.array(list(map(text_to_vector, texts)))
y = np.array(labels)


# Hold out 30% of the documents; the fixed seed keeps the partition the same
# on every run.
# NOTE(review): the vocabulary was built from all documents before this split,
# so test-set words also define feature columns -- confirm that is intended.
_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = _split


# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train, y_train)


# Score predictions on the held-out documents and report a percentage.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
accuracy_pct = round(acc * 100, 2)
print("Accuracy:", accuracy_pct, "%")


Accuracy: 94.91 %
