# Spam Mail Detection 


In [1]:
# Import required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [10]:
# Read the labelled messages from the CSV export into a DataFrame.
# Expected schema: 'Category' holds the spam/ham label, 'Message' holds the text.
data = pd.read_csv(filepath_or_buffer='mail_data (2).csv')
# Preview the first five rows as a quick sanity check of the load
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Summarise the frame: index range, column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [12]:
# Count missing entries per column (isna is the same check as isnull)
data.isna().sum()

Category    0
Message     0
dtype: int64

In [13]:
# Number of distinct non-missing values in each column
data.nunique(axis=0, dropna=True)

Category       2
Message     5157
dtype: int64

In [14]:
# Label encoding
# spam -> 0 , ham -> 1
# The column is rebuilt from an explicit mapping rather than by writing ints
# into the text column through masked .loc assignments: those writes leave the
# column as mixed-type object data and are rejected by string-backed columns,
# which only accept str values.
label_codes = {'spam': 0, 'ham': 1}

# Fail early, with a readable message, on any label the mapping does not cover.
# Values that are already 0/1 are accepted so re-running this cell is harmless.
unexpected_labels = (
    set(data['Category'].unique()) - set(label_codes) - set(label_codes.values())
)
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'Category': {sorted(unexpected_labels, key=str)}"
    )

# Unmapped values can only be 0/1 at this point, so they pass through unchanged
# and the result is a clean integer column.
data['Category'] = (
    data['Category']
    .map(lambda label: label_codes.get(label, label))
    .astype(int)
)

In [20]:
# Inputs are the raw message texts; targets are the encoded labels cast to int
X, Y = data['Message'], data['Category'].astype(int)

In [16]:
# Hold out 20% of the messages for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=3, test_size=0.2)

In [18]:
# Turn the message text into TF-IDF feature matrices.
# Text is lower-cased, English stop words are dropped, and a term only needs
# to appear in one document (min_df=1) to be kept.
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Fit on the training messages only, then apply the fitted vectorizer to the
# test messages so the test set does not influence the learned vocabulary.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [21]:
# Fit a logistic-regression classifier (library defaults) on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train_features, Y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed
model


In [22]:
# Score the fitted model on both splits; comparing the two accuracies shows
# how well the training performance carries over to unseen messages.
train_prediction = model.predict(X_train_features)
test_prediction = model.predict(X_test_features)

train_accuracy = accuracy_score(y_true=Y_train, y_pred=train_prediction)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=test_prediction)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)


Training Accuracy: 0.9676912721561588
Testing Accuracy: 0.9668161434977578


In [25]:
# Classify a single hand-written message with the fitted vectorizer and model
input_mail = ["Congratulations! You have won a free gift card. Click here"]

# transform() reuses the vocabulary learned during training (no refitting)
input_data_features = vectorizer.transform(input_mail)
prediction = model.predict(input_data_features)

# Label 1 is ham in this notebook's encoding; any other label is reported as spam
print("This mail is NOT Spam (Ham)" if prediction[0] == 1 else "This mail is Spam")


This mail is Spam


In [27]:
# Classify a very short message to see how the model handles minimal text
input_mail = ["Congratulations! "]

# transform() reuses the vocabulary learned during training (no refitting)
input_data_features = vectorizer.transform(input_mail)
prediction = model.predict(input_data_features)

# Label 1 is ham in this notebook's encoding; any other label is reported as spam
print("This mail is NOT Spam (Ham)" if prediction[0] == 1 else "This mail is Spam")


This mail is NOT Spam (Ham)


In [30]:
# Classify a cash-prize style message with the fitted vectorizer and model
input_mail = ["You are selected for a ₹10,000 cash prize. Claim today!"]

# transform() reuses the vocabulary learned during training (no refitting)
input_data_features = vectorizer.transform(input_mail)
prediction = model.predict(input_data_features)

# Label 1 is ham in this notebook's encoding; any other label is reported as spam
print("This mail is NOT Spam (Ham)" if prediction[0] == 1 else "This mail is Spam")


This mail is Spam


In [32]:
# Classify a giveaway style message with the fitted vectorizer and model
input_mail = ["Win a brand new iPhone. Click the link to claim your reward."]

# transform() reuses the vocabulary learned during training (no refitting)
input_data_features = vectorizer.transform(input_mail)
prediction = model.predict(input_data_features)

# Label 1 is ham in this notebook's encoding; any other label is reported as spam
print("This mail is NOT Spam (Ham)" if prediction[0] == 1 else "This mail is Spam")


This mail is Spam
