In [25]:
import pandas as pd

In [26]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# Load the spam dataset from Drive, decoding it as latin-1 and keeping
# only the two columns used in this notebook (v1 and v2).
data = pd.read_csv(
    '/content/drive/My Drive/My Courses/Python/Machine Learning/DataFrame/spam.csv',
    encoding='latin-1',
    usecols=['v1', 'v2'],
)

In [28]:
# Give the raw columns descriptive names. An explicit mapping is used instead
# of positional assignment so each name stays attached to the right column
# even if the column order in the file changes; re-running the cell is a no-op
# because rename ignores keys that are no longer present.
data = data.rename(columns={'v1': 'label', 'v2': 'message'})

In [29]:
# Preview the first rows to confirm the renamed columns and raw text labels.
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# The guard makes the cell safe to re-run: mapping an already-encoded column
# a second time would turn every label into NaN, because 0 and 1 are not keys
# of the mapping.
label_codes = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(data['label']):
    encoded_labels = data['label'].map(label_codes)
    # Series.map yields NaN for values missing from the dict; fail loudly
    # rather than letting NaN labels reach the classifier.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown_values = sorted(data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values: {unknown_values}")
    data['label'] = encoded_labels

In [31]:
# Preview again to confirm the labels are now numeric (0 = ham, 1 = spam).
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [33]:
# (rows, columns) of the full dataset, for comparison with the split sizes.
data.shape

(5572, 2)

In [34]:
# Number of messages in the training split.
X_train.shape

(4457,)

In [35]:
# Number of messages in the test split.
X_test.shape

(1115,)

In [36]:
# Number of training labels; should match the number of training messages.
y_train.shape

(4457,)

In [37]:
# Number of test labels; should match the number of test messages.
y_test.shape

(1115,)

In [38]:
# transform text data to feature vectors
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
# Build the bag-of-words vocabulary from the training messages only
# (English stop words removed), then encode the test messages with that
# same fitted vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [40]:
# Inspect the count matrix produced for the test messages.
X_test_counts

<1115x7472 sparse matrix of type '<class 'numpy.int64'>'
	with 7693 stored elements in Compressed Sparse Row format>

In [41]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_counts, y_train)

In [42]:
# Predict a label for every message in the held-out test set.
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_counts)

In [43]:
# Inspect the predicted labels for the test set.
y_pred

array([0, 0, 1, ..., 0, 0, 1])

In [44]:
# Inspect the true labels for the test set, for comparison with y_pred.
y_test

Unnamed: 0,label
3245,0
944,0
1044,1
2484,0
812,1
...,...
4264,0
2439,0
5556,0
4205,0


In [45]:
# Evaluate the model: fraction of test messages classified correctly.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but passing the ground truth first
# keeps this call consistent with classification_report(y_test, y_pred) and
# avoids mistakes if an asymmetric metric is swapped in later.
accuracy = accuracy_score(y_test, y_pred)

In [46]:
# Show the raw accuracy value (a fraction, not a percentage).
accuracy

0.9838565022421525

In [47]:
# Report overall accuracy as a percentage, followed by per-class
# precision / recall / F1.
print(f"Accuracy: {accuracy * 100:.2f}%")
# The report is printed with its own print call: passing it as a second
# argument makes print insert a separator space before the report, which
# shifts the header row one column out of alignment with the data rows.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 98.39%

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [48]:
def predict_spam(message):
    """Classify a single text message as spam or not.

    The message is encoded with the notebook-level ``vectorizer`` and
    scored by the notebook-level ``model``; both must already be fitted.

    Args:
        message: The raw message text to classify.

    Returns:
        "Spam" if the predicted label equals 1, otherwise "Not Spam".
    """
    # The vectorizer works on a collection of documents, so wrap the
    # single message in a list and read back the only prediction.
    features = vectorizer.transform([message])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam"

# Try the classifier on a few hand-written messages.
sample_messages = (
    "Free Free Free! You got a trip to Malaysia.",
    "Hi, are we still meeting for lunch tomorrow?",
    "Hi Ravi! are you free now?",
)
for sample in sample_messages:
    print(predict_spam(sample))



Spam
Not Spam
Not Spam
