# DATA  ACQUISITION

In [3]:
import pandas as pd
import numpy as np

df=pd.read_csv("spam.csv")

df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

# CLEAN_UP 

In [5]:
df['Spam']=df['Category'].apply(lambda x : 1 if x=='spam' else 0)

df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Preprocessing using spacy ,remove stop words to reduce the vactorizer  

In [8]:
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS

In [9]:
len(STOP_WORDS)

326

In [11]:
nlp=spacy.load("en_core_web_sm")

In [12]:
def prepro(text):
    doc=nlp(text)
    array=[token.text for token in doc if not token.is_stop and not token.is_punct]
    return array

In [13]:
df.Message[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [14]:
len(df.Message[0])

111

In [15]:
df['mes']=df['Message'].apply(lambda x : " ".join(prepro(x)))

In [16]:
df.head()

Unnamed: 0,Category,Message,Spam,mes
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy Available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,U dun early hor U c
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah think goes usf lives


# BUILDING MODEL

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train,X_test,y_train,y_test=train_test_split(df.mes,df.Spam,test_size=0.1)

In [19]:
X_train.shape

(5014,)

In [20]:
X_test.shape

(558,)

# FEATURE ENGINEERING

In [21]:
from sklearn.feature_extraction.text import CountVectorizer                   # BAG OF WORDS

In [22]:
v=CountVectorizer()

In [23]:
X_train_cv=v.fit_transform(X_train.values)

In [24]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5014, 8005))

In [25]:
X_train_cv.shape

(5014, 8005)

In [26]:
v.get_feature_names_out().shape                   # NAMES OF ALL VOCABULARY WHICH IS COMES FROM THE PAST DATA

(8005,)

# FIT INTO MODEL

In [27]:
from  sklearn.naive_bayes import MultinomialNB                #  SKLEARN MAINLY USE IN ML PROJECTS

In [28]:
model = MultinomialNB()

In [29]:
model.fit(X_train_cv,y_train)

# EVALUATE

In [30]:
X_test_cv=v.transform(X_test)

In [31]:
from sklearn.metrics import classification_report

y_pre=model.predict(X_test_cv)

print(classification_report(y_test,y_pre))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       491
           1       0.98      0.97      0.98        67

    accuracy                           0.99       558
   macro avg       0.99      0.98      0.99       558
weighted avg       0.99      0.99      0.99       558



# USE PIPELINE 

In [32]:
from sklearn.pipeline import Pipeline

In [33]:
elf=Pipeline([
            ('vectorizer',CountVectorizer()),
            ('nb',MultinomialNB())
])

In [34]:
elf.fit(X_train,y_train)

In [35]:
y_pred= elf.predict(X_test)

In [36]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       491
           1       0.98      0.97      0.98        67

    accuracy                           0.99       558
   macro avg       0.99      0.98      0.99       558
weighted avg       0.99      0.99      0.99       558

