In [38]:
pip install pandas scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


1. Import the Required Libraries

In [125]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import re

# pandas – Used to load and handle your CSV dataset.
# train_test_split – To split your data into training and testing parts.
# TfidfVectorizer – Converts text (emails) into numbers that ML models understand.
# MultinomialNB – Naive Bayes classifier for text data.
# accuracy_score – To check how well your model is performing.
# re – Regular expressions, used for cleaning text (removing punctuation, etc.).

2. Load the CSV File

In [126]:
# Use a forward slash in the path: it works on Windows, Linux and macOS, and it avoids
# the invalid "\e" escape sequence that a backslash-separated literal would contain.
df = pd.read_csv("SpamDetection/emailspam.csv", encoding='latin-1')
# encoding='latin-1' is needed because utf-8 might give errors due to special characters in the file.

In [127]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [128]:
# Keep only the label (v1) and message (v2) columns and drop incomplete rows.
# Filling the gaps with 0 instead would create rows whose message is the integer 0
# and whose label is neither 'ham' nor 'spam'.
# .copy() gives later cells an independent frame to add/overwrite columns on.
df = df[['v1', 'v2']].dropna().copy()

# df[['v1', 'v2']]: Select only the two columns we care about: the label and the message.
# .dropna(): Removes rows where either of those two columns has missing values (if any).

In [129]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


3. Clean Unnecessary Columns

In [130]:
# df = df.iloc[:, :-3]
# :-3 means all columns except the last 3.


In [131]:
# Rename the two remaining columns by position (they were selected as v1, v2 in that order).
df.columns = ['label', 'text']
# label → will hold values like 'spam' or 'ham'
# text → holds the actual email messages



In [132]:
# 4. Convert Labels to Numbers
# 'ham' → 0 (not spam), 'spam' → 1 (spam).
# NOTE(review): the mapping only knows the strings 'ham' and 'spam'; re-running this cell
# on labels that are already numeric would presumably turn every label into NaN.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})


In [133]:
# 5. Check for NaN in the labels
# Prints how many entries of the label column are missing after the mapping step.
print("NaN values in labels before fill:", df['label'].isna().sum())



NaN values in labels before fill: 0


In [134]:
# Any label that was not 'ham' or 'spam' is NaN after the mapping step.
# Drop those rows rather than filling them with 0, which would silently label them as ham,
# and keep the column as 64-bit integers for the classifier.
df = df.dropna(subset=['label']).astype({'label': 'int64'})

4. Convert Labels to Numbers

In [135]:
# df.loc[:, 'label'] = df['label'].map({'ham': 0, 'spam': 1})

# Machine learning needs numbers, not text.
# We convert:
# 'ham' → 0 (Not spam)
# 'spam' → 1 (Spam)

5. Text Cleaning Function

In [136]:
def clean_text(text):
    """Normalize a message before it is vectorized.

    The text is lowercased and every character that is not an ASCII letter
    or whitespace is removed, so punctuation, symbols and digits all disappear.
    Whitespace is left untouched (runs of spaces are not collapsed).

    Args:
        text: The raw message. Missing values (None or a float NaN) yield an
            empty string; any other non-string value is converted with str()
            before cleaning.

    Returns:
        The cleaned, lowercase string.
    """
    # Missing values carry no text (NaN is the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # After lowercasing, only a-z and whitespace survive. This single pass is
    # equivalent to removing punctuation/symbols first and digits second.
    return re.sub(r'[^a-z\s]+', '', text.lower())


In [137]:
# Run clean_text() once per message in the 'text' column (one call per row) and
# store the results in a new 'clean_text' column; the 'text' column itself is not modified.
df['clean_text'] = df['text'].apply(clean_text)

# df['text']          → the column holding the original email messages (spam or ham).
# .apply(clean_text)  → calls clean_text() on each value of that column.
# df['clean_text'] =  → saves the cleaned version of each message as the 'clean_text' column.

6. Split Data

In [138]:
# x → features (the cleaned email texts), y → labels (0 for ham, 1 for spam)
x, y = df['clean_text'], df['label']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [139]:
# Check for NaN in y_train and y_test
# Prints the number of missing labels in each of the two splits.
print("NaN values in y_train after split:", y_train.isna().sum())
print("NaN values in y_test after split:", y_test.isna().sum())

NaN values in y_train after split: 0
NaN values in y_test after split: 0


In [140]:
# Quick look at the training labels: first rows and their dtype.
print(y_train.head())  # See the first few rows of y_train
print(y_train.dtype)   # Check the data type of y_train

1978    0
3989    1
3935    0
4078    0
4086    1
Name: label, dtype: int64
int64


7. Convert Text into Numbers (Vectorization)

In [141]:
# # Convert labels to numeric if they are not already (e.g., 'Spam' -> 1, 'Not Spam' -> 0)
# y_train = y_train.map({'Spam': 1, 'Not Spam': 0})  # Adjust the mapping as needed
# y_test = y_test.map({'Spam': 1, 'Not Spam': 0})  # Adjust the mapping as needed

# # Now check the dtype again
# print(y_train.dtype)   # Should be 'int64' or 'int32'


In [142]:
# TfidfVectorizer turns text into numeric features using TF-IDF
# (term frequency–inverse document frequency).
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000,
)

# fit_transform(...) learns the vocabulary from the training messages and converts them to numbers;
# transform(...) then applies that same learned vocabulary to the test messages.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [143]:
y_test

3245    0
944     0
1044    1
2484    0
812     1
       ..
4264    0
2439    0
5556    0
4205    0
4293    1
Name: label, Length: 1115, dtype: int64

In [148]:
# 8. Train the Naive Bayes Model
model = MultinomialNB()
model.fit(x_train_vec, y_train)

# MultinomialNB() creates a Naive Bayes model good for text data.
# .fit(...) trains the model on the vectorized training messages (x_train_vec) and their labels (y_train).

9. Check Model Accuracy

In [149]:
# Predict a label for every vectorized test message, then compare with the true labels.
y_pred = model.predict(x_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Uses the test data to see how well the model performs.
# predict(...) gives predictions for emails in the test set.
# accuracy_score(...) compares predicted values to the actual ones and prints accuracy.
# NOTE(review): accuracy alone can look high when one class dominates the data —
# consider also checking precision/recall for the spam class.

Accuracy: 0.9748878923766816


10. Predict a New Email

In [147]:
# Requires the fitted `vectorizer` and `model` from the cells above.
new_email = ["You have WON a free iPhone, click here to claim now!"]

# Clean every message in the list (not only the first one) with the same
# function that was applied to the training data, then vectorize them together.
new_email_clean = [clean_text(message) for message in new_email]
new_email_vec = vectorizer.transform(new_email_clean)

# One prediction per message: 1 means spam, anything else is reported as not spam.
new_pred = model.predict(new_email_vec)
for predicted_label in new_pred:
    print("Prediction:", "Spam" if predicted_label == 1 else "Not Spam")


Prediction: Spam


In [None]:

# # 1. Import the Required Libraries
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score
# import re

# # 2. Load the CSV File
# df = pd.read_csv("SpamDetection/emailspam.csv", encoding='latin-1')  # Adjust path as necessary
# df = df[['v1', 'v2']].fillna(0)  # Fill missing values with 0

# # 3. Clean Unnecessary Columns
# df.columns = ['label', 'text']  # Rename columns for clarity

# # 4. Convert Labels to Numbers
# df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# # 5. Check for NaN in the labels
# print("NaN values in labels before fill:", df['label'].isna().sum())

# # Fill NaN values in the label column with 0 (for 'ham') or 1 (for 'spam')
# df['label'] = df['label'].fillna(0)

# # 6. Text Cleaning Function
# def clean_text(text):
#     text = text.lower()  # Convert to lowercase
#     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation and symbols
#     text = re.sub(r'\d+', '', text)  # Remove numbers (optional)
#     return text

# # Apply the clean_text function to the text column
# df['clean_text'] = df['text'].apply(clean_text)

# # 7. Split Data into Features and Labels
# x = df['clean_text']
# y = df['label']

# # 8. Split the Data into Training and Testing Sets
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# # 9. Check for NaN in y_train and y_test
# print("NaN values in y_train after split:", y_train.isna().sum())
# print("NaN values in y_test after split:", y_test.isna().sum())

# # 10. Convert Text into Numbers (Vectorization)
# vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
# x_train_vec = vectorizer.fit_transform(x_train)  # Fit and transform the training data
# x_test_vec = vectorizer.transform(x_test)  # Transform the test data

# # 11. Train the Naive Bayes Model
# model = MultinomialNB()
# model.fit(x_train_vec, y_train)

# # 12. Check Model Accuracy
# y_pred = model.predict(x_test_vec)
# print("Accuracy:", accuracy_score(y_test, y_pred))

# # 13. Predict a New Email
# new_email = ["You have WON a free iPhone lottery, click here to claim now!"]

# # Clean and vectorize the new email
# new_email_clean = [clean_text(new_email[0])]
# new_email_vec = vectorizer.transform(new_email_clean)

# # Predict the new email
# new_pred = model.predict(new_email_vec)
# print("Prediction:", "Spam" if new_pred[0] == 1 else "Not Spam")
