In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

<h3>Create bag of words representation using CountVectorizer</h3>

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
v = CountVectorizer()

In [10]:
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59496 stored elements and shape (4457, 7788)>

In [11]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], shape=(7788,))

In [12]:
X_train_cv.shape

(4457, 7788)

In [13]:
v.get_feature_names_out()[1771]

'checking'

In [14]:
v.vocabulary_

{'night': 4793,
 'has': 3360,
 'ended': 2587,
 'for': 2948,
 'another': 981,
 'day': 2186,
 'morning': 4607,
 'come': 1923,
 'in': 3658,
 'special': 6399,
 'way': 7462,
 'may': 4412,
 'you': 7755,
 'smile': 6288,
 'like': 4125,
 'the': 6865,
 'sunny': 6654,
 'rays': 5622,
 'and': 961,
 'leaves': 4074,
 'your': 7759,
 'worries': 7653,
 'at': 1119,
 'blue': 1418,
 'bay': 1262,
 'gud': 3276,
 'mrng': 4632,
 'height': 3410,
 'of': 4910,
 'oh': 4931,
 'shit': 6132,
 'situation': 6230,
 'guy': 3292,
 'throws': 6929,
 'luv': 4279,
 'letter': 4101,
 'on': 4956,
 'gal': 3069,
 'but': 1593,
 'falls': 2767,
 'her': 3431,
 'brothers': 1543,
 'head': 3385,
 'whos': 7552,
 'gay': 3095,
 'beautiful': 1286,
 'truth': 7108,
 'against': 859,
 'gravity': 3240,
 'read': 5636,
 'carefully': 1680,
 'our': 5026,
 'heart': 3399,
 'feels': 2814,
 'light': 4122,
 'when': 7532,
 'someone': 6336,
 'is': 3766,
 'it': 3777,
 'very': 7325,
 'heavy': 3405,
 'good': 3194,
 'babe': 1195,
 'how': 3543,
 'goes': 3177,
 '

In [15]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(7788,))

In [16]:
np.where(X_train_np[0]!=0)

(array([ 961,  981, 1119, 1262, 1418, 1923, 2186, 2587, 2948, 3276, 3360,
        3658, 4074, 4125, 4412, 4607, 4632, 4793, 5622, 6288, 6399, 6654,
        6865, 7462, 7653, 7755, 7759]),)

<h3>Train the naive bayes model</h3>

In [17]:
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [18]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [19]:
y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       971
           1       0.97      0.95      0.96       144

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



### Test

In [20]:
emails = [
    "Hey Siddhesh, let's meet tommorrow?",
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

In [21]:
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

In [23]:
from joblib import dump
dump(model, 'spam_email_model.joblib')
print("Model saved successfully!")

Model saved successfully!


In [24]:
import streamlit

!streamlit run App.py

^C


In [25]:
# save_complete_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from joblib import dump

# Load data
df = pd.read_csv("spam.csv")
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.2, random_state=42
)

# Create and fit vectorizer
print("Training vectorizer...")
vectorizer = CountVectorizer()
X_train_cv = vectorizer.fit_transform(X_train)
print(f"Vectorizer features: {len(vectorizer.get_feature_names_out())}")

# Train model
print("Training model...")
model = MultinomialNB()
model.fit(X_train_cv, y_train)

# Test to ensure it works
X_test_cv = vectorizer.transform(X_test)
accuracy = model.score(X_test_cv, y_test)
print(f"Model accuracy: {accuracy:.2%}")

# Save both model and vectorizer as a single file
model_data = {
    'model': model,
    'vectorizer': vectorizer,
    'feature_count': len(vectorizer.get_feature_names_out()),
    'accuracy': accuracy
}

dump(model_data, 'spam_detector_complete.joblib')
print("✅ Model and vectorizer saved successfully to 'spam_detector_complete.joblib'")
print(f"Features: {len(vectorizer.get_feature_names_out())}")

Training vectorizer...
Vectorizer features: 7701
Training model...
Model accuracy: 99.19%
✅ Model and vectorizer saved successfully to 'spam_detector_complete.joblib'
Features: 7701
