In [68]:
import pandas as pd

In [69]:
df = pd.read_csv("spam.csv", encoding = "latin")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [70]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [71]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [72]:
df.rename(columns={'v1': 'lable', 'v2': "msg"}, inplace=True)

In [73]:
df.head()

Unnamed: 0,lable,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lable   5572 non-null   object
 1   msg     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [75]:
df.isna().sum()

lable    0
msg      0
dtype: int64

In [76]:
df['lable'].value_counts()

lable
ham     4825
spam     747
Name: count, dtype: int64

## Data Preprocessing

In [77]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [78]:
corpus = []
lm = WordNetLemmatizer()
for i in range (len(df)):
    review = re.sub('^a-zA-Z0-9',' ',df['msg'][i])
    review = review.lower()
    review = review.split()
    review = [data for data in review if data not in stopwords.words('english')]
    review = [lm.lemmatize(data) for data in review]
    review = " ".join(review)
    corpus.append(review) 

In [79]:
print(df['msg'][0])
len(df['msg'])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


5572

In [80]:
print(corpus[0])
len(corpus)

go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...


5572

In [81]:
df['msg']=corpus
df.head()

Unnamed: 0,lable,msg
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


## Model Building

### Vectorization (Convert Text Data Into The Vectors)

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [83]:
tf_obj = TfidfVectorizer()
x_train_tfidf = tf_obj.fit_transform(df['msg']).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [84]:
x_train_tfidf.shape

(5572, 8384)

### Data Splitting

In [85]:
from sklearn.model_selection import train_test_split

In [86]:
X = df['msg']
y = df['lable']

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

### Pipeline

In [88]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [89]:
text_mnb = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [90]:
text_mnb.fit(X_train,y_train)

In [91]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [92]:
#Accuracy Score on Testing Data
y_pred_test = text_mnb.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test,y_pred_test)*100)

Accuracy Score: 96.71052631578947


In [93]:
#Accuracy Score on Training Data
y_pred_train = text_mnb.predict(X_train)
print("Accuracy Score:",accuracy_score(y_train,y_pred_train)*100)

Accuracy Score: 98.0


In [94]:
#Confusion Matrix on Testing Data
y_pred_test = text_mnb.predict(X_test)
print("Confusion Matrix on Test Data:\n", confusion_matrix(y_test,y_pred_test))

Confusion Matrix on Test Data:
 [[1446    0]
 [  55  171]]


In [95]:
#Classification Report on Testing Data
y_pred_test = text_mnb.predict(X_test)
print("Classification Reportx on Test Data:\n", classification_report(y_test,y_pred_test))

Classification Reportx on Test Data:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1446
        spam       1.00      0.76      0.86       226

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.97      1672



### Prediction on User_data

In [96]:
def preprocess_data(text):
    review = re.sub('^a-zA-Z0-9',' ',text)
    review = review.lower()
    review = review.split()
    review = [data for data in review if data not in stopwords.words('english')]
    review = [lm.lemmatize(data) for data in review]
    review = " ".join(review)
    return [review]

In [97]:
user_data = df['msg'][0]
print(user_data)
user_data = preprocess_data(user_data)
user_data

go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...


['go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...']

In [98]:
text_mnb.predict(user_data)[0]

'ham'

### predictiom class

In [99]:
class prediction:
    
    def __init__(self,data):
        self.data = data
        
    def user_data_preprocessing(self):
        lm = WordNetLemmatizer()
        review = re.sub('^a-zA-Z0-9',' ',self.data)
        review = review.lower()
        review = review.split()
        review = [data for data in review if data not in stopwords.words('english')]
        review = [lm.lemmatize(data) for data in review]
        review = " ".join(review)
        return [review]
    
    def user_data_prediction(self):
        preprocess_data = self.user_data_preprocessing()
        
        if text_mnb.predict(preprocess_data)[0] == 'spam':
            return 'This Message is Spam'
            
        else:
            return 'This Message is Ham'  

In [100]:
user_data = df['msg'][2]
print(user_data)
prediction(user_data).user_data_prediction()

free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's


'This Message is Spam'

In [101]:
user_data = df['msg'][3]
print(user_data)
prediction(user_data).user_data_prediction()

u dun say early hor... u c already say...


'This Message is Ham'

In [102]:
prediction("These messages usually start by saying that bad things will happen if you do not forward the email to a certain address. The scammers are looking to scare you and force you to respond so that they can gather your private data").user_data_prediction()

'This Message is Ham'