In [1]:
###  steps to solve problem
# 1->text preprocessing1(cleaning)
# 3-> train-test-split
# 4-> convert sentences in to vectors
# 5-> model traing

In [3]:
import pandas as pd
import numpy as np

In [4]:
messages = pd.read_csv('SMSSpamCollection.txt',sep='\t',names=['labels','message'])

In [5]:
messages

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
wd=WordNetLemmatizer()
ps= PorterStemmer()

In [8]:
review=re.sub('[^a-zA-Z]',' ',messages['message'][0])

In [None]:
review.lower()

###  Simple Preprocessing Steps
#### Tokenization: Splitting the text into individual words.
#### Lowercasing: Converting all characters to lowercase to ensure uniformity.
#### Removing Stop Words: Eliminating common words that do not contribute much meaning (e.g., "the", "and").
#### Stemming/Lemmatization: Reducing words to their root forms (optional).
#### Removing Punctuation and Special Characters: Cleaning up the text.

In [8]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-Z]',' ',messages['message'][i]) # remove all non-alphabatic characters from string
    review=review.lower()
    review=review.split()
    review=[wd.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus.append(review)
    

In [9]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

## Spam Classification by using BOW(Back of Words) model

In [10]:
X=corpus

In [11]:
y=pd.get_dummies(messages['labels'],dtype=int)
y=y.iloc[:,1]


In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)


In [12]:
X_train

['reply win weekly fifa world cup held send stop end service',
 'hello sort town already dont rush home eating nacho let know eta',
 'come guoyang go n tell u told',
 'hey sathya till dint meet even single time saw situation sathya',
 'orange brings ringtones time chart hero free hit week go ringtones pic wap stop receiving tip reply stop',
 'sitting mu waiting everyone get suite take shower',
 'finish liao u',
 'urgent mobile awarded bonus caller prize nd attempt contact call box qu',
 'probably still going stuff',
 'wah lucky man save money hee',
 'hey u still gym',
 'oh lk tt den take e one tt end cine lor dun wan yogasana oso',
 'ok lor',
 'still havent collected dough pls let know go place sent get control number',
 'stupid possible',
 'u secret admirer reveal think u r special call opt reply reveal stop per msg recd cust care',
 'amazing rearrange letter give meaning dormitory dirty room astronomer moon starer eye see election result lie let recount mother law woman hitler eleven

In [14]:
# create bag of words model
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(max_features=2500,binary=True,ngram_range=(1,2))

In [15]:
X_train=vectorizer.fit_transform(X_train).toarray()

In [16]:
X_test=vectorizer.transform(X_test).toarray()

In [17]:
# convert it in to proper shape by deafult in numpy we olny see 3 at starting and  3 at end so we set it(edgeitems=30)
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, 
    formatter=dict(float=lambda x: "%.3g" % x))
#X_train

In [18]:
X_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [21]:
X_test

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [22]:
vectorizer.vocabulary_

{'reply': 1718,
 'win': 2399,
 'weekly': 2373,
 'world': 2436,
 'cup': 449,
 'send': 1821,
 'stop': 1985,
 'end': 581,
 'service': 1836,
 'send stop': 1827,
 'stop end': 1987,
 'hello': 902,
 'sort': 1938,
 'town': 2179,
 'already': 44,
 'dont': 536,
 'rush': 1762,
 'home': 935,
 'eating': 572,
 'let': 1131,
 'know': 1074,
 'let know': 1133,
 'come': 377,
 'go': 787,
 'tell': 2064,
 'told': 2155,
 'hey': 905,
 'till': 2125,
 'dint': 519,
 'meet': 1272,
 'even': 610,
 'single': 1879,
 'time': 2127,
 'saw': 1782,
 'situation': 1887,
 'orange': 1495,
 'brings': 206,
 'ringtones': 1746,
 'free': 705,
 'hit': 917,
 'week': 2365,
 'pic': 1550,
 'wap': 2341,
 'receiving': 1698,
 'reply stop': 1722,
 'sitting': 1886,
 'mu': 1364,
 'waiting': 2320,
 'everyone': 619,
 'get': 759,
 'suite': 2014,
 'take': 2041,
 'shower': 1863,
 'finish': 678,
 'liao': 1135,
 'urgent': 2268,
 'mobile': 1319,
 'awarded': 116,
 'bonus': 181,
 'caller': 263,
 'prize': 1636,
 'nd': 1387,
 'attempt': 102,
 'contact': 

In [31]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()

In [24]:
model.fit(X_train,y_train)

In [25]:
y_pred=model.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score,classification_report

In [27]:
score=accuracy_score(y_test,y_pred)

In [28]:
print(score)

0.9847533632286996


In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.95      0.94      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



## Spam Classifier by using TFIDF model 

In [12]:
X=corpus
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [13]:
## create a tfidf model
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=2500,ngram_range=(1,2))

In [14]:
X_train= tfidf.fit_transform(X_train).toarray()

In [20]:
X_test=tfidf.transform(X_test).toarray()


AttributeError: 'csr_matrix' object has no attribute 'lower'

In [19]:
X_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [18]:
X_test

<1115x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 8140 stored elements in Compressed Sparse Row format>

In [17]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, 
    formatter=dict(float=lambda x: "%.3g" % x))

In [159]:
y=pd.get_dummies(messages['labels'],dtype=int)

In [160]:
## create by ouptut feature by using ham
y=y.iloc[:,0]

In [161]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: ham, Length: 5572, dtype: int32

In [164]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [165]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(score)

0.979372197309417


In [166]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92       149
           1       0.98      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

