In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Load the dataset; the rest of this cell reads its "Category" and "Message"
# columns.
df = pd.read_csv("spam.csv")

# Display first 3 rows
print(df.head(3))

# Per-category summary of the messages
print(df.groupby("Category").describe())

# Binary target: 1 when Category is "spam", 0 otherwise. A vectorized
# comparison avoids calling a Python lambda once per row; the explicit
# "int64" keeps the label dtype the same regardless of platform.
df["spam"] = (df["Category"] == "spam").astype("int64")

# Drop the original Category column. `columns=` already identifies the axis,
# so no separate `axis` argument is needed.
df = df.drop(columns=["Category"])

# Display first 3 rows of the new DataFrame
print(df.head(3))

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"], df["spam"], test_size=0.2, random_state=42
)

# Bag-of-words features: the vocabulary is learned from the training
# messages only.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train.values)

# Show the first 3 rows of the document-term matrix. Slice the sparse matrix
# BEFORE densifying so only three rows are materialized instead of the whole
# (documents x vocabulary) array.
print(X_train_count[:3].toarray())

# Initialize and train the Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train_count, y_train)

# Display the model
print(model)

# Cross-validate on the training features (default fold settings).
# NOTE: the vocabulary above was fitted on all of X_train, so every validation
# fold's words are already part of the feature space; cross-validating a
# vectorizer + classifier Pipeline on the raw text would avoid that.
cv_scores = cross_val_score(model, X_train_count, y_train)
print(cv_scores)

# Reuse the fitted vocabulary on the test messages: transform only, never
# fit_transform, so nothing is learned from the held-out data.
X_test_count = vectorizer.transform(X_test.values)

# Score the trained model on the held-out test set
test_score = model.score(X_test_count, y_test)
print(test_score)

# Two hand-written messages to classify with the trained model
emails = [
    "Hey mohan, can we get together to watch football game tomorrow?",
    "Upto 20% discount on parking, exclusive offer just for you. Dont miss on the reward!"
]
emails_count = vectorizer.transform(emails)
# Last expression of the cell: the predicted labels (1 = spam, 0 = not spam)
model.predict(emails_count)

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
         Message                                                            \
           count unique                                                top   
Category                                                                     
ham         4825   4516                             Sorry, I'll call later   
spam         747    641  Please call our customer service representativ...   

               
         freq  
Category       
ham        30  
spam        4  
                                             Message  spam
0  Go until jurong point, crazy.. Available only ...     0
1                      Ok lar... Joking wif u oni...     0
2  Free entry in 2 a wkly comp to win FA Cup fina...     1
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0

In [6]:
# Imports used by this cell, gathered at the top
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Sample messages to classify
emails = [
    "Hey mohan, can we get together to watch football game tomorrow?",
    "Upto 20% discount on parking, exclusive offer just for you. Dont miss on the reward!"
]

# Bundle vectorization and the classifier into a single estimator so raw text
# can be passed straight to fit/score/predict.
# X_train, X_test, y_train, y_test and MultinomialNB are not defined in this
# cell; they are expected to be in scope from an earlier cell.
pipeline = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("multinomial", MultinomialNB()),
    ]
).fit(X_train, y_train)

# Score on the held-out messages (stored, not displayed)
score = pipeline.score(X_test, y_test)

# Predicted labels for the sample messages; the bare name displays them
predictions = pipeline.predict(emails)
predictions

array([0, 1], dtype=int64)