In [1]:
%pip install pandas scikit-learn

Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.3-cp312-cp312-win_amd64.whl (11.0 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.3-py2.py3-none-any.whl (348 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.3.3 pytz-2025.2 tzdata-2025.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
data = {
    'text': [
        "Win a free iPhone now! Click here",
        "Hey, are we still going for lunch?",
        "URGENT! You have won $1000 cash.",
        "Can you send me the report by 5pm?",
        "Free money!!! Claim your prize today.",
        " Mom called, she wants you to visit.",
        "Limited time offer! Buy 1 get 1 free.",
        "Let's meet at the park tomorrow."
    ],
    'label': [
        "spam", "ham", "spam", "ham", 
        "spam", "ham", "spam", "ham"
    ]
}

In [3]:
import pandas as pd 

df = pd.DataFrame(data)
df 

Unnamed: 0,text,label
0,Win a free iPhone now! Click here,spam
1,"Hey, are we still going for lunch?",ham
2,URGENT! You have won $1000 cash.,spam
3,Can you send me the report by 5pm?,ham
4,Free money!!! Claim your prize today.,spam
5,"Mom called, she wants you to visit.",ham
6,Limited time offer! Buy 1 get 1 free.,spam
7,Let's meet at the park tomorrow.,ham


In [5]:
df['label_num'] = df['label'].map({'ham':0,
                                   'spam':1})

df 

Unnamed: 0,text,label,label_num
0,Win a free iPhone now! Click here,spam,1
1,"Hey, are we still going for lunch?",ham,0
2,URGENT! You have won $1000 cash.,spam,1
3,Can you send me the report by 5pm?,ham,0
4,Free money!!! Claim your prize today.,spam,1
5,"Mom called, she wants you to visit.",ham,0
6,Limited time offer! Buy 1 get 1 free.,spam,1
7,Let's meet at the park tomorrow.,ham,0


In [7]:
import string 


def clean_text(text):
    text = text.lower()

    text = text.translate(str.maketrans('','',string.punctuation))

    return text


df['clean_text'] = df['text'].apply(clean_text)

df 

Unnamed: 0,text,label,label_num,clean_text
0,Win a free iPhone now! Click here,spam,1,win a free iphone now click here
1,"Hey, are we still going for lunch?",ham,0,hey are we still going for lunch
2,URGENT! You have won $1000 cash.,spam,1,urgent you have won 1000 cash
3,Can you send me the report by 5pm?,ham,0,can you send me the report by 5pm
4,Free money!!! Claim your prize today.,spam,1,free money claim your prize today
5,"Mom called, she wants you to visit.",ham,0,mom called she wants you to visit
6,Limited time offer! Buy 1 get 1 free.,spam,1,limited time offer buy 1 get 1 free
7,Let's meet at the park tomorrow.,ham,0,lets meet at the park tomorrow


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label_num'],
    test_size=0.25,
    random_state=42
)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

In [10]:
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

vectorizer.get_feature_names_out()

array(['1000', '5pm', 'at', 'buy', 'by', 'can', 'cash', 'claim', 'click',
       'free', 'get', 'have', 'here', 'iphone', 'lets', 'limited', 'me',
       'meet', 'money', 'now', 'offer', 'park', 'prize', 'report', 'send',
       'the', 'time', 'today', 'tomorrow', 'urgent', 'win', 'won', 'you',
       'your'], dtype=object)

In [11]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train_vec, y_train)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [12]:
def predict_email(message):

    clean = clean_text(message)
    vec = vectorizer.transform([clean])

    prediction = model.predict(vec)

    if prediction[0] == 1:
        return "SPAM DETECTED"
    else:
        return "SAFE"

In [13]:
test_1 = "Congratulations! You won a free ticket to USA."

predict_email(test_1)

'SPAM DETECTED'

In [14]:
test_2 = "Hey Sneh, can we reschedule our meeting?"

predict_email(test_2)

'SAFE'