In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('spam.csv', \
                 sep=',', encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [5]:
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df = df.rename(columns={"v1":"label", "v2":"text"})
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df['y'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,label,text,y
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
print(df.shape)
print(df.label.value_counts())

(5572, 3)
ham     4825
spam     747
Name: label, dtype: int64


In [8]:
# tokenize the email and hashes the symbols into a vector
def extract_features(email, B=1024):
    # initialize all-zeros feature vector
    v = np.zeros(B)
    email = ' '.join(email)
    # breaks for non-ascii characters
    tokens = email.split()
    for token in tokens:
        v[hash(token) % B] = 1
    return v

In [9]:
def load_spam_data(extract_features, B=1024):
    
    xs = np.zeros((df.shape[0], B))
    
    labels = df['y'].values

    for i, email in enumerate(df['text']):
        xs[i, :] = extract_features(email, B)
    return xs

Xspam = load_spam_data(extract_features)
Xspam.shape
Yspam = df['y'].values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(Xspam, Yspam, test_size = 0.2, random_state = 10)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(solver='liblinear', penalty='l2')
model.fit(X_train, y_train)
pred = model.predict(X_test)

# evaluate training accuracy
scoresTr = model.predict(X_train)
trainingacc = accuracy_score(y_train, scoresTr)

# accuracy_score(X_train, scoresTr)
validationacc = accuracy_score(y_test, pred)
print("Training accuracy %2.2f%%\nValidation accuracy %2.2f%%\n" % (trainingacc*100,validationacc*100))

Training accuracy 98.56%
Validation accuracy 98.48%



In [12]:
import joblib

# Save the trained model to a file
model_filename = "spam_classifier_model.pkl"
joblib.dump(model, model_filename)


['spam_classifier_model.pkl']

In [13]:
import joblib

# Load the trained model from the file
model = joblib.load("spam_classifier_model.pkl")


In [17]:
# Function to predict if an email is spam or not
def predict_spam(email_text, model, B=1024):
    features = extract_features(email_text, B).reshape(1, -1)  # Extract and reshape features
    prediction = model.predict(features)  # Predict using the trained model
    if prediction[0] == 1:
        return "This email is spam."
    else:
        return "This email is not spam."

# Example usage
email_text = input("Enter the email text: ")
result = predict_spam(email_text, model)
print(result)


Enter the email text: When scammers send spammy messages that seem legit (but aren’t), they’re often trying to trick you into clicking links and giving them personal or financial information. Things like your passwords or bank account and Social Security numbers are valuable to scammers. With that access to your accounts, scammers could try to steal your money or your identity. Or both.  To help you cut down on spam and avoid scams:  Use filters. Your mobile phone probably has options to filter and block texts from unknown senders. Some wireless providers and call blocking apps can also help block unwanted messages. Many popular email providers (like Gmail) have strong spam filters turned on by default. But if any spam gets into your inbox, mark it as spam or junk. Protect your personal information. Before you enter personal information on a website, email, or text chain, stop. Ask yourself: Why do they need this information? And what’s going to happen to it? Remember, too: never share