In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

pandas: Used to load and manipulate the CSV data (like an Excel sheet in Python).

train_test_split: A helper to separate our data into a "Study Guide" (Train) and an "Exam" (Test).

TfidfVectorizer: The translator that turns words into numbers (vectors).

SVC: Support Vector Classifier, scikit-learn's Support Vector Machine implementation for classification (the brain of our model).

Pipeline: A container that glues the translator and the brain together so we can use them as one unit.

In [43]:
# Read the raw mail dataset and discard its "id" column in one chained step.
df = pd.read_csv('mail.csv').drop(columns="id")
# Preview the first rows as a quick sanity check on the parsed data.
df.head()

Unnamed: 0,email,label
0,From: support@legitcompany.com\nSubject: Regar...,ham
1,From: noreply@softwareupdates.com\nSubject: We...,ham
2,From: noreply@softwareupdates.com\nSubject: Im...,ham
3,From: info@customerservice.co\nSubject: Team S...,ham
4,From: info@customerservice.co\nSubject: Team S...,ham


In [44]:

print("Original Columns:", df.columns)

# Standardise the schema for the rest of the notebook: the message body
# arrives as 'email' and becomes 'text'. 'label' already has its final name,
# so it needs no rename entry. Selecting the two columns explicitly also
# discards any extra columns that came along from the CSV.
df = df.rename(columns={'email': 'text'})[['text', 'label']]

# Check the reshaped frame.
print("\nProcessed Data Sample:")
df.head()

Original Columns: Index(['email', 'label'], dtype='object')

Processed Data Sample:


Unnamed: 0,text,label
0,From: support@legitcompany.com\nSubject: Regar...,ham
1,From: noreply@softwareupdates.com\nSubject: We...,ham
2,From: noreply@softwareupdates.com\nSubject: Im...,ham
3,From: info@customerservice.co\nSubject: Team S...,ham
4,From: info@customerservice.co\nSubject: Team S...,ham


In [45]:
# X is the input (the email text), y is the output (spam or ham)
X = df['text']
y = df['label']

# Hold out 10% of the messages for testing. Stratifying on the label keeps the
# spam/ham proportion the same in the training and test parts, which matters
# because the two classes are not equally common. The fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

print(f"Training on {len(X_train)} messages.")
print(f"Testing on {len(X_test)} messages.")

Training on 9000 messages.
Testing on 1000 messages.


In [46]:
# Build the two stages separately, then chain them so raw text flows
# through TF-IDF vectorisation (English stop words removed) into a
# linear-kernel SVM.
vectorizer = TfidfVectorizer(stop_words='english')
classifier = SVC(kernel='linear')
model = Pipeline(steps=[('tfidf', vectorizer), ('svm', classifier)])

# Fit both stages on the training split in a single call.
print("Training model...")
model.fit(X_train, y_train)
print("Model training complete!")

Training model...
Model training complete!


In [47]:
# Score the fitted pipeline on the held-out messages.
predictions = model.predict(X_test)

# Overall fraction of test messages labelled correctly, shown as a percentage.
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 breakdown.
# NOTE(review): the recorded run scored 100%, which is unusually high for
# held-out data -- confirm the train and test splits do not share duplicated
# or templated emails before trusting this number.
report = classification_report(y_test, predictions)
print("\nClassification Report:", report, sep="\n")

Model Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       844
        spam       1.00      1.00      1.00       156

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [48]:
# Free-form examples to classify; edit this list to try your own messages.
my_messages = [
    "URGENT! You have won a 1 week FREE membership in our $100,000 Prize Jackpot!",
    "Hey man, are we still playing football tonight?"
]

# The pipeline vectorises raw strings itself, so they can be passed directly.
my_predictions = model.predict(my_messages)

# One line per message, paired with its predicted label by position.
for idx, verdict in enumerate(my_predictions):
    print(f"Message: '{my_messages[idx]}'  --->  Verdict: {verdict}")

Message: 'URGENT! You have won a 1 week FREE membership in our $100,000 Prize Jackpot!'  --->  Verdict: spam
Message: 'Hey man, are we still playing football tonight?'  --->  Verdict: ham
