<h2 align='center' color='orange'>E-Mail Spam Classification Using BOW</h2>

In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [9]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [10]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [12]:
X_train.shape

(4457,)

In [13]:
X_test.shape

(1115,)

In [14]:
type(X_train)

pandas.core.series.Series

In [15]:
X_train[:4]

314     Hi the way I was with u 2day, is the normal wa...
1454                              Stupid.its not possible
422     Someone has contacted our dating service and e...
1111                              Ok ill tell the company
Name: Message, dtype: object

In [16]:
y_train[:4]

314     0
1454    0
422     1
1111    0
Name: spam, dtype: int64

In [18]:
from sklearn.feature_extraction.text import CountVectorizer    #converts the raw text data into a matrix of token counts (bag-of-words model)

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59029 stored elements and shape (4457, 7638)>

In [19]:
X_train_cv.shape

(4457, 7638)

In [22]:
v.get_feature_names_out().shape

(7638,)

In [40]:
dir(v)  #returns a list of all valid attributes and methods for that object.

['_CountVectorizer__metadata_request__fit',
 '_CountVectorizer__metadata_request__transform',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_tags__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_r

In [42]:
from sklearn.naive_bayes import MultinomialNB   #Model used here for classification is Naive Bayes

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [32]:
X_test_cv = v.transform(X_test)

In [34]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.97      0.91      0.94       151

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



<h3>See there is a very easy method to do all these steps that we did above to train our model</h3>

In [35]:
from sklearn.pipeline import Pipeline   #allows you to chain multiple data processing steps together.

#Creating the Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())    #applies the Multinomial Naive Bayes classifier, which is commonly used for text classification tasks.
])

In [38]:
clf.fit(X_train, y_train)  #here we fir out data into the pipeline

In [39]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.97      0.91      0.94       151

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

