In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [11]:
dataset=pd.read_csv('SMSSpamCollection',sep='\t',names=['label','message'])

In [12]:
dataset.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# checking null values

dataset.isnull().sum()

label      0
message    0
dtype: int64

In [14]:
dataset.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


# Data Preprocessing

In [15]:
dataset['message']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [16]:
corpus=[]
wordnet=WordNetLemmatizer()
for i in range(0,len(dataset)):
    #removing .,? from message column using regular expression
    review=re.sub('[^a-zA-Z]',' ',dataset['message'][i])
    #Converting Caps into small in message column
    review=review.lower()
    #splliting the data into words
    review=review.split()
    #lemmatizing
    review=[wordnet.lemmatize(word)for word in review if word not in stopwords.words('english')]
    #joining the sentence
    review=' '.join(review)
    corpus.append(review)
    print(len(review))   



82
23
101
35
35
77
45
109
97
92
68
81
92
105
11
115
13
49
26
111
20
28
28
66
34
85
21
62
34
40
45
85
15
35
94
62
27
16
21
74
50
65
98
6
43
24
30
26
47
88
28
89
58
147
97
46
93
24
12
27
49
40
16
35
87
83
74
110
38
20
28
14
34
13
6
24
42
23
13
30
16
9
17
9
13
27
65
91
25
37
36
109
117
68
87
101
31
24
108
24
36
52
43
109
115
33
63
18
59
65
21
20
16
65
90
104
90
86
60
54
88
87
30
102
23
30
10
113
4
74
13
4
32
21
86
130
25
9
25
93
49
10
16
113
18
14
31
99
47
17
24
21
40
19
109
198
16
13
59
83
85
55
33
33
98
116
39
86
54
45
42
39
68
17
72
38
35
18
82
30
70
91
11
34
9
23
82
19
78
23
60
51
106
27
58
42
26
18
40
70
15
15
62
124
23
15
50
90
26
56
24
40
16
19
11
16
45
27
34
15
67
20
98
16
90
70
46
106
29
117
17
81
23
23
34
109
30
38
9
95
75
39
42
39
112
25
30
13
25
77
114
39
141
0
92
21
46
29
10
95
35
3
55
22
83
11
22
30
115
26
93
34
98
104
29
19
5
24
15
19
38
15
83
18
4
77
36
2
93
72
126
24
40
12
9
42
104
42
90
38
46
64
102
39
16
104
19
19
51
96
20
138
78
57
83
18
24
89
25
96
17
33
34
81
96
7
24

In [17]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

### TF-IDF mdoel 

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
#preparing input vector for model using tfidf method
tfidf=TfidfVectorizer(max_features=5000)
X=tfidf.fit_transform(corpus).toarray()

In [19]:
X.shape

(5572, 5000)

In [20]:
#preparing output fetaure 
y=pd.get_dummies(dataset['label'])
y=y.iloc[:,1].values
y.shape

(5572,)

## Model Creation

In [21]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)
print(X_train.shape,y_train.shape)

(4457, 5000) (4457,)


In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

spam_model=MultinomialNB().fit(X_train,y_train)

y_pred=spam_model.predict(X_test)

print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99       955
           1       1.00      0.84      0.91       160

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

