In [107]:
import re

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [108]:
# Read the raw dataset; the file is decoded as latin1.
csv_path = 'spam.csv'
df = pd.read_csv(csv_path, encoding='latin1')

In [110]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [111]:
# Discard the three mostly-empty overflow columns (50, 12 and 6 non-null values
# out of 5572 rows). errors='ignore' keeps this cell safe to re-run after the
# columns are already gone instead of raising KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [112]:
# Preview the first rows now that only the label (v1) and text (v2) columns remain.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [113]:
# Give the two remaining columns descriptive names.
column_names = {'v1': 'Label', 'v2': 'Emails'}
df = df.rename(columns=column_names)

In [115]:
# Separate the message text (features) from the ham/spam label (target).
emails, labels = df['Emails'], df['Label']

In [116]:
# Preprocess the text: normalise case, then strip punctuation.
emails = emails.str.lower()  # Convert to lowercase
# Raw string so \w and \s reach the regex engine as character classes instead of
# being parsed as (invalid) string escape sequences.
emails = emails.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation

In [117]:
# Split every message into tokens, then discard English stop words.
# NOTE(review): word_tokenize and stopwords rely on NLTK data packages being
# available locally; no nltk.download(...) call is visible in this notebook — confirm.
stop_words = set(stopwords.words('english'))
emails = emails.apply(word_tokenize)
emails = emails.apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])

In [118]:
# Reduce each remaining token to its WordNet lemma (default part of speech).
lemmatizer = WordNetLemmatizer()
emails = emails.apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))

In [119]:
# NOTE(review): the preprocessing cells reassign `emails` and never write back to
# `df`, so this still displays the raw, unprocessed text. Use `emails.head()` to
# inspect the cleaned token lists.
df.head()

Unnamed: 0,Label,Emails
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [120]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split repeatable.
split = train_test_split(emails, labels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [121]:
def _join_tokens(tokens):
    """Return tokens as one space-separated string; leave existing strings untouched."""
    return tokens if isinstance(tokens, str) else ' '.join(tokens)


# CountVectorizer consumes raw strings, so collapse each token list back into text.
# The isinstance guard makes re-running this cell a no-op instead of silently
# spacing out every character of the already-joined messages.
X_train = [_join_tokens(email) for email in X_train]
X_test = [_join_tokens(email) for email in X_test]

In [122]:
# Build the vocabulary from the training messages only, then encode both splits
# as token-count matrices using that same vocabulary.
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(raw_documents=X_train)
X_test_vectors = vectorizer.transform(raw_documents=X_test)


In [123]:
# Fit a multinomial Naive Bayes model on the training count matrix.
classifier = MultinomialNB()
classifier.fit(X=X_train_vectors, y=y_train)

In [124]:
# Predict the held-out messages and report the fraction classified correctly.
y_pred = classifier.predict(X_test_vectors)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9802690582959641


In [125]:
#example for testing our model

def preprocess_email(text):
    """Apply the training-time text cleaning to a single raw message.

    Mirrors the steps run on the training data earlier in this notebook:
    lowercase, delete every character that is neither a word character nor
    whitespace, tokenize, drop English stop words, lemmatize, and re-join the
    tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message, in the same form the vectorizer was fitted on.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(cleaned)
        if word not in stop_words
    ]
    return ' '.join(tokens)


email_text = "Hello, this is a sample email. Please check it out."
# Feed the vectorizer the same cleaned representation used for training so the
# features seen at prediction time match those the classifier was fitted on.
email_count = vectorizer.transform([preprocess_email(email_text)])

In [126]:
# Persist the fitted vectorizer alongside the classifier: the classifier only
# understands count vectors built with this exact vocabulary, so the saved model
# cannot be used in a fresh process without it.
joblib.dump(vectorizer, 'spam_detector_vectorizer.joblib')
joblib.dump(classifier, 'spam_detector_model.joblib')

['spam_detector_model.joblib']

In [127]:
# Reload the persisted classifier from disk.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
model_path = 'spam_detector_model.joblib'
loaded_model = joblib.load(model_path)

In [128]:
# Predict with the model reloaded from disk (not the in-memory classifier) so this
# cell actually verifies the save/load round-trip.
prediction = loaded_model.predict(email_count)
print("Prediction:", prediction[0])

Prediction: ham
