# Naive Bayes

In [6]:
import pandas as pd
df=pd.read_csv("C:\\Users\\spoor\\Downloads\\spam.csv",encoding='latin-1')[['v1','v2']]
df.columns=['label','text']
df['label']=df['label'].map({'ham':0,'spam':1})
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Gaussian Naive Bayes (for continuous data)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

# Feature Engineering: Extract numerical features
df['text_length'] = df['text'].apply(len)
df['num_words' ] = df['text'].apply(lambda x: len(x.split()))
df['num_digits' ] = df['text'].apply(lambda x: sum(c.isdigit() for c in x))

X=df[['text_length','num_words','num_digits']]
y=df['label']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

gnb=GaussianNB()
gnb.fit(X_train_scaled,y_train)
y_pred_gnb=gnb.predict(X_test_scaled)

In [12]:
from sklearn.metrics import *
accuracy = accuracy_score(y_test, y_pred_gnb)
print("Accuracy:", accuracy)

Accuracy: 0.9802690582959641


In [13]:
cm = confusion_matrix(y_test, y_pred_gnb)
print(cm)

[[953  12]
 [ 10 140]]


# Multinomial naive bayes (for text data)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Convert text data into word count vectors
vectorizer = CountVectorizer(stop_words='english')
X_counts = vectorizer. fit_transform(df['text'])

# Convert counts to TF-IDF representation
tfidf_transformer = TfidfTransformer() # it will calculate frequency of words
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['label'], test_size=0.2, random_state=42)

# Train Multinomial Naïve Bayes
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

In [18]:
accur = accuracy_score(y_test, y_pred_mnb)
print("Accuracy:", accur)

Accuracy: 0.968609865470852


In [19]:
cmtx= confusion_matrix(y_test, y_pred_mnb)
print(cmtx)

[[965   0]
 [ 35 115]]


# Bernoulli naive bayes (for binary features)

In [22]:
from sklearn.naive_bayes import BernoulliNB
# Convert text into binary presence/absence of important words
important_words = ["free", "win", "offer", "money", "urgent"]
for word in important_words:
    df[word] = df['text'].apply(lambda x: 1 if word in x.lower() else 0)

# Define features and target
X = df[important_words]
y = df ['label' ]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)

In [23]:
acc = accuracy_score(y_test, y_pred_bnb)
print("Accuracy:", acc)

Accuracy: 0.895067264573991


In [24]:
cmtx= confusion_matrix(y_test, y_pred_bnb)
print(cmtx)

[[929  36]
 [ 81  69]]


Load the dataset and explore the message content and labels.  
Identify how many messages belong to each category.  
Build a probabilistic classification model that can learn patterns from word occurrences.  
Train the model using historical message data.  
Predict whether unseen messages are:  
Suspicious  
Not suspicious  
Test the system using custom user-entered messages.  
Evaluate the model using appropriate classification metrics.  
Analyze the following:  
How many suspicious messages were correctly identified?  
How many genuine messages were incorrectly flagged?  

In [25]:
df['label'].value_counts().rename({0: "Not Suspicious", 1: "Suspicious"})


label
Not Suspicious    4825
Suspicious         747
Name: count, dtype: int64

In [26]:
from sklearn.model_selection import train_test_split

X = df['text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [28]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [29]:
y_pred = model.predict(X_test_tfidf)


In [30]:
def predict_message(msg):
    msg_tfidf = tfidf.transform([msg])
    prediction = model.predict(msg_tfidf)[0]
    return "Suspicious" if prediction == 1 else "Not Suspicious"

# Example tests
print(predict_message("Congratulations! You won a free prize"))
print(predict_message("Are we meeting tomorrow?"))


Suspicious
Not Suspicious


In [31]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9668161434977578


In [32]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)


[[965   0]
 [ 37 113]]
