### BOW,TF-IDF,ML Algorithms

1. Preprocessing and cleaning
3. Train Test Split
2. BOW and TF-IDF (sentence ---> vectors)
4. Train Models




In [1]:
import pandas as pd
messages = pd.read_csv('SMSSpamCollection.txt',sep='\t',names=['labels','message'])

In [2]:
messages.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
#Data Cleaning and Preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Sandeep
[nltk_data]     Bandi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [5]:
corpus = []

for i in range(0,len(messages)):
    review = re.sub('^a-zA-Z',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
corpus

['go jurong point, crazy.. avail bugi n great world la e buffet... cine got amor wat...',
 'ok lar... joke wif u oni...',
 "free entri 2 wkli comp win fa cup final tkt 21st may 2005. text fa 87121 receiv entri question(std txt rate)t&c' appli 08452810075over18'",
 'u dun say earli hor... u c alreadi say...',
 'nah think goe usf, live around though',
 "freemsg hey darl 3 week' word back! i'd like fun still? tb ok! xxx std chg send, £1.50 rcv",
 'even brother like speak me. treat like aid patent.',
 "per request 'mell mell (oru minnaminungint nurungu vettam)' set callertun callers. press *9 copi friend callertun",
 'winner!! valu network custom select receivea £900 prize reward! claim call 09061701461. claim code kl341. valid 12 hour only.',
 'mobil 11 month more? u r entitl updat latest colour mobil camera free! call mobil updat co free 08002986030',
 "i'm gonna home soon want talk stuff anymor tonight, k? i'v cri enough today.",
 'six chanc win cash! 100 20,000 pound txt> csh11 send 87

In [7]:
y = pd.get_dummies(messages['labels'],dtype=int)
y = y.iloc[:,1].values

In [9]:
#train test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(corpus,y,test_size=0.2,random_state=42)

In [10]:
X_train

['repli win £100 weekly! 2006 fifa world cup held? send stop 87239 end servic',
 'hello. sort town already. . dont rush home, eat nachos. let know eta.',
 'come guoyang go n tell her? u told her?',
 'hey sathya till dint meet even singl time saw situat sathya.',
 'orang bring rington time chart heroes, free hit week! go rington & pic wap. stop receiv tip repli stop.',
 'sit mu wait everyon get suit take shower',
 'finish liao... u?',
 'urgent! mobil 07808726822 award £2,000 bonu caller prize 02/09/03! 2nd attempt contact you! call 0871-872-9758 box95qu',
 'probabl not, still go stuff',
 'wah lucki man... save money... hee...',
 'hey u still gym?',
 'oh... lk tt den take e one tt end cine lor... dun wan yogasana oso can...',
 'ok lor...',
 'still havent collect dough pl let know go place sent get control number',
 'stupid.it possibl',
 'u secret admirer. reveal think u r special. call 09065174042. opt repli reveal stop. 1.50 per msg recd. cust care 07821230901',
 'amaz : rearrang letter

In [11]:
## Create a Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500,ngram_range=(1,2))

In [12]:
#independent feature
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [13]:
X_train,X_test

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

In [15]:
cv.vocabulary_

{'repli': 1781,
 'win': 2412,
 '100': 28,
 'world': 2442,
 'cup': 573,
 'send': 1881,
 'stop': 2030,
 '87239': 148,
 'end': 704,
 'servic': 1894,
 'win 100': 2413,
 'send stop': 1887,
 'stop 87239': 2032,
 'hello': 990,
 'sort': 1989,
 'town': 2209,
 'already': 196,
 'dont': 656,
 'rush': 1827,
 'home': 1027,
 'eat': 692,
 'let': 1211,
 'know': 1158,
 'let know': 1212,
 'come': 510,
 'go': 865,
 'tell': 2098,
 'her': 993,
 'told': 2185,
 'hey': 996,
 'till': 2163,
 'dint': 639,
 'meet': 1340,
 'even': 721,
 'singl': 1932,
 'time': 2164,
 'saw': 1847,
 'orang': 1567,
 'bring': 359,
 'rington': 1806,
 'free': 796,
 'hit': 1011,
 'week': 2373,
 'pic': 1624,
 'wap': 2352,
 'receiv': 1760,
 'repli stop': 1786,
 'sit': 1938,
 'mu': 1428,
 'wait': 2330,
 'everyon': 728,
 'get': 841,
 'suit': 2057,
 'take': 2085,
 'shower': 1915,
 'finish': 772,
 'liao': 1214,
 'urgent': 2282,
 'mobil': 1384,
 'award': 267,
 '000': 2,
 'bonu': 337,
 'caller': 399,
 'prize': 1704,
 '02': 5,
 '03': 7,
 '2nd': 87

In [16]:
from sklearn.naive_bayes import MultinomialNB
spam_detect = MultinomialNB().fit(X_train,y_train)

In [17]:
y_pred = spam_detect.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [19]:
accuracy_score(y_test,y_pred)

0.9874439461883409

In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.93      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [20]:
confusion_matrix(y_test,y_pred)

array([[962,   4],
       [ 10, 139]], dtype=int64)

In [None]:
#same way need to do for TF-IDF