In [1]:
import pandas as pd
df=pd.read_csv('SMSSpamcollection',
               sep='\t',names=["label","Message"])

In [2]:
df.head()

Unnamed: 0,label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.iloc[3]
df.shape

(5572, 2)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['label'], test_size=0.2,random_state=42)

In [6]:
X_train.head(),X_test.head()

(1978    Reply to win £100 weekly! Where will the 2006 ...
 3989    Hello. Sort of out in town already. That . So ...
 3935     How come guoyang go n tell her? Then u told her?
 4078    Hey sathya till now we dint meet not even a si...
 4086    Orange brings you ringtones from all time Char...
 Name: Message, dtype: object,
 3245    Squeeeeeze!! This is christmas hug.. If u lik ...
 944     And also I've sorta blown him off a couple tim...
 1044    Mmm thats better now i got a roast down me! i...
 2484        Mm have some kanji dont eat anything heavy ok
 812     So there's a ring that comes with the guys cos...
 Name: Message, dtype: object)

In [7]:
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [8]:
X_train_counts

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59296 stored elements and shape (4457, 7702)>

In [9]:
# Intialize and train model
model = MultinomialNB()
model.fit(X_train_counts, y_train)

In [10]:
# print(X_train_counts)

In [11]:
# Make prediction
y_pred = model.predict(X_test_counts)

# Evaluation
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test,y_pred))
print("Classification Report:]n",classification_report(y_test,y_pred))

confusion matrix:
 [[966   0]
 [  9 140]]
Accuracy: 0.9919282511210762
Classification Report:]n               precision    recall  f1-score   support

         ham       0.99      1.00      1.00       966
        spam       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [12]:
 new_message = ["Claim your free vacation mow by clicking this link"]
# new_message = ["OM Prakash, receving loan of rs.5,00,000 on account today, "]
# new_message = ["application for leave"]
# model.predict('new_message')


In [13]:
new_message_counts = vectorizer.transform(new_message)

prediction = model.predict(new_message_counts)

print("Prediction:", prediction[0])

Prediction: spam


In [14]:
new_messages = [
    "Claim your free vacation now by clicking this link",
    "Hey, are we still on for the meeting tommoreow?",
    "Exclusivwe offer just for you.Buy Now"
]
# transform and predict
new_counts = vectorizer.transform(new_messages)
predictions= model.predict(new_counts)

for msg,label in zip(new_message, predictions):
    print(f"Message:'{msg}' ->Prediction: {label}")

Message:'Claim your free vacation mow by clicking this link' ->Prediction: spam


In [15]:
import joblib
joblib.dump(model,"model.joblib")

['model.joblib']

In [16]:
import joblib
joblib.dump(vectorizer,"scaled.joblib")

['scaled.joblib']

In [17]:
model_file=joblib.load("model.joblib")

In [18]:
scaled_file=joblib.load("scaled.joblib")

In [19]:
# Assuming you have already defined and fitted these objects
# scaled_file should be a CountVectorizer or similar text transformer
# model_file should be a classifier model (like RandomForest, SVM, etc.)

msg = ["application for leave"]

# First transform the text data using the vectorizer
data = scaled_file.transform(msg)

# Then use the transformed data for prediction with the model
pre = model_file.predict(data)  # Use model_file for prediction, not scaled_file

print(f"{msg} --> {pre[0]}")


['application for leave'] --> ham
