## Loading Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [4]:
!pip install wordcloud



In [5]:
from wordcloud import STOPWORDS

## Loading Data

In [6]:
df = pd.read_csv('spam.csv')

In [7]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


## Pre Processing Data : Dummies

In [9]:
dummies = pd.get_dummies(df['Category'],dtype=int)

In [10]:
dummies.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [11]:
df = pd.concat([df,dummies],axis = 'columns')

In [12]:
x_train,x_test,y_train,y_test = train_test_split(df['Message'],df['spam'],test_size=0.2)

## Pre Processing Text Data : Vectorization

In [41]:
text_feature_equal = TfidfVectorizer(stop_words = list(STOPWORDS))

In [26]:
text_feature_equal.analyzer

'word'

In [27]:
text_feature_equal.stop_words

["he'd",
 'ever',
 'like',
 "they'd",
 "haven't",
 'hence',
 'down',
 'should',
 'that',
 'ours',
 "shouldn't",
 'this',
 'www',
 'both',
 "weren't",
 'why',
 'it',
 'therefore',
 'be',
 'having',
 'however',
 'then',
 'over',
 'from',
 "you've",
 'else',
 'because',
 'on',
 "we've",
 'when',
 'so',
 "aren't",
 "what's",
 "they're",
 'doing',
 'a',
 'other',
 'between',
 'each',
 "they'll",
 'your',
 'were',
 'while',
 'my',
 "can't",
 "didn't",
 'you',
 'if',
 'once',
 'am',
 "it's",
 'nor',
 'since',
 'most',
 "when's",
 'as',
 'not',
 'and',
 'k',
 'again',
 'themselves',
 "you'd",
 'through',
 'there',
 'has',
 'him',
 'only',
 'her',
 'which',
 'above',
 "you're",
 'further',
 'off',
 'more',
 'yours',
 'can',
 "she's",
 "wouldn't",
 'shall',
 'an',
 "they've",
 "we'll",
 "wasn't",
 "we're",
 'cannot',
 "he'll",
 'been',
 'at',
 'to',
 "mustn't",
 'some',
 'was',
 "how's",
 'did',
 'ourselves',
 "couldn't",
 'she',
 "where's",
 'his',
 'about',
 'herself',
 'whom',
 'they',
 'just

In [36]:
text_feature_equal.max_df

1.0

In [37]:
text_feature_equal.min_df

1

In [39]:
x_train_equal_vector = text_feature_equal.fit_transform(x_train)

## Training Model

In [29]:
model_equal_vector = MultinomialNB()

In [30]:
model_equal_vector.fit(x_train_equal_vector,y_train)

In [31]:
emails1 = list(df.iloc[1:50,1])

In [32]:
emails_count = text_feature_equal.transform(emails1) #Not using fit because it is fitted already
len(emails_count.toarray()[1])

7624

In [33]:
model_equal_vector.predict(emails_count)

array([0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0])

In [34]:
x_test_equal_vector = text_feature_equal.transform(x_test)

In [35]:
model_equal_vector.score(x_test_equal_vector,y_test)

0.9713004484304932

## Using Pipeline

In [47]:
pipe = Pipeline([('vectorizer',TfidfVectorizer()),('nb',MultinomialNB())],verbose=True)

In [46]:
pipe.named_steps

{'vectorizer': TfidfVectorizer(), 'nb': MultinomialNB()}

In [49]:
pipe.fit(x_train,y_train)

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing nb, total=   0.0s


In [51]:
pipe.score(x_test,y_test)

0.9641255605381166

In [52]:
pipe.predict(emails1)

array([0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0])