In [1]:
pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.4-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
Downloading numpy-2.2.4-cp311-cp311-win_amd64.whl (12.9 MB)
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   -- ------------------------------------- 0.8/12.9 MB 5.6 MB/s eta 0:00:03
   ---- ----------------------------------- 1.6/12.9 MB 4.4 MB/s eta 0:00:03
   ------- -------------------------------- 2.4/12.9 MB 3.9 MB/s eta 0:00:03
   -------- ------------------------------- 2.9/12.9 MB 4.0 MB/s eta 0:00:03
   ----------- ---------------------------- 3.7/12.9 MB 3.8 MB/s eta 0:00:03
   ------------

In [2]:
# Install scikit-learn into the running kernel's environment; the sklearn imports below need it.
%pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl (41.2 MB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# 1. Load the dataset
# The CSV keeps the label ("ham" / "spam") in column "v1" and the message text in "v2".
# Keep only those two columns, give them readable names, and encode the label
# numerically: ham -> 0, spam -> 1.
data = (
    pd.read_csv("spam.csv", encoding='latin1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

In [5]:
# 2. Split the data: hold out 20% of the messages for testing.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# 3. Feature extraction (TF-IDF)
# The vectorizer is fitted on the training messages only; the test messages are
# then transformed with that already-fitted vectorizer, so nothing is learned
# from the held-out data.
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)


In [7]:
# 4. Train the model (Naive Bayes)
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train_vectors, y_train)
# Leave the fitted estimator as the cell's final expression so it is still displayed.
model

In [8]:
# 5. Make predictions
# Predict a label (0 = ham, 1 = spam) for every message in the held-out test set.
y_pred = model.predict(X_test_vectors)


In [9]:
# 6. Evaluate the model
# Compute both summaries first, then print them: overall accuracy followed by
# the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.9623318385650225
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [10]:
import joblib  # For saving and loading the model

In [13]:
def train_and_save_model(
    data_path="spam.csv",
    vectorizer_path='tfidf_vectorizer.joblib',
    model_path='spam_model.joblib',
):
    """Train the spam classifier on the whole dataset and save it with joblib.

    Reads the CSV at ``data_path``, keeps column ``v1`` (label, "ham" or
    "spam") and column ``v2`` (message text), fits a ``TfidfVectorizer`` and a
    ``MultinomialNB`` on every row (no train/test hold-out), and dumps both
    fitted objects to disk.

    Parameters
    ----------
    data_path : str, default "spam.csv"
        CSV file to read (decoded as latin1).
    vectorizer_path : str, default 'tfidf_vectorizer.joblib'
        Destination for the fitted vectorizer.
    model_path : str, default 'spam_model.joblib'
        Destination for the fitted classifier.

    Returns
    -------
    tuple
        ``(vectorizer_dump, model_dump)``: the return values of the two
        ``joblib.dump`` calls, in that order.

    Raises
    ------
    ValueError
        If the label column holds anything other than "ham" or "spam"
        (including missing values).
    """
    data = pd.read_csv(data_path, encoding='latin1')
    data = data[['v1', 'v2']]
    data = data.rename(columns={'v1': 'label', 'v2': 'text'})

    # Series.map turns every value that is not a key of the dict into NaN.
    # Detect that here and fail with a message naming the offending labels,
    # instead of letting the NaNs reach model.fit().
    encoded = data['label'].map({'ham': 0, 'spam': 1})
    unmapped = encoded.isna()
    if unmapped.any():
        bad_values = sorted(data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"Unexpected label(s) in column 'v1' of {data_path!r}: {bad_values}; "
            "expected only 'ham' or 'spam'."
        )
    data['label'] = encoded

    vectorizer = TfidfVectorizer()
    X_vectors = vectorizer.fit_transform(data['text'])
    y = data['label']

    model = MultinomialNB()
    model.fit(X_vectors, y)

    # The vectorizer and the model are stored as two separate files; both are
    # needed together to classify new text.
    vectorizer_dump = joblib.dump(vectorizer, vectorizer_path)
    model_dump = joblib.dump(model, model_path)
    return vectorizer_dump, model_dump
