## Spam Detection

In [14]:
import pandas as pd 
# Load the tab-separated SMS collection. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
# NOTE(review): the default quoting treats '"' as a quote character; if the row
# count looks short compared with the raw file, try quoting=csv.QUOTE_NONE — confirm.
messages = pd.read_csv(
    "datasets/SMSSpamCollection",
    sep="\t",
    names=["label", "message"],
)

In [15]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Data Cleaning and Preprocessing

In [16]:
import re 
import nltk 
# Fetch the NLTK stop-word corpus (a no-op when it is already up to date);
# it backs the nltk.corpus.stopwords reader used during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
# Single Porter stemmer instance, reused for every token in the preprocessing loop
ps = PorterStemmer()

In [18]:
messages['message']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [19]:
# Build the stop-word lookup once. stopwords.words('english') returns a new list
# on every call, and the previous version called it (and scanned it linearly)
# for every single token; a set makes each membership test O(1).
stop_words = set(stopwords.words('english'))

# corpus[k] is the cleaned, stemmed, space-joined text of the k-th message.
corpus = []
# Iterate the column positionally rather than via messages['message'][i], which
# is a label lookup and only works when the index is exactly 0..n-1.
for message in messages['message']:
    # replace every character that is not an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', ' ', message)
    # lowercase so the stop-word comparison and the vocabulary are case-insensitive
    review = review.lower()
    # split on whitespace into individual tokens
    review = review.split()
    # drop stop words, then stem what remains
    review = [ps.stem(word) for word in review if word not in stop_words]
    corpus.append(" ".join(review))

In [20]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

## Create Bag of Words Model

In [21]:
from sklearn.feature_extraction.text import CountVectorizer 
# Count-based alternative: keep raw term counts for the 100 most frequent terms
# cv = CountVectorizer(max_features = 100)

# Binary bag of words: binary=True stores 1 for any non-zero count (presence/absence)
cv = CountVectorizer(max_features = 100, binary=True)  # vocabulary capped at 100 terms

In [22]:
# Learn the vocabulary from the corpus and encode every message; toarray()
# converts the sparse result into a dense NumPy array (one row per message).
X = cv.fit_transform(corpus).toarray()

In [23]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
X.shape # maximum features = 100

(5572, 100)

In [26]:
cv.vocabulary_

{'go': 22,
 'great': 25,
 'got': 24,
 'wat': 90,
 'ok': 56,
 'free': 18,
 'win': 94,
 'text': 77,
 'txt': 85,
 'say': 67,
 'alreadi': 0,
 'think': 80,
 'hey': 28,
 'week': 92,
 'back': 3,
 'like': 38,
 'still': 73,
 'send': 69,
 'even': 15,
 'friend': 19,
 'prize': 62,
 'claim': 7,
 'call': 4,
 'mobil': 47,
 'co': 8,
 'home': 30,
 'want': 89,
 'today': 82,
 'cash': 6,
 'day': 12,
 'repli': 64,
 'www': 96,
 'right': 65,
 'thank': 78,
 'take': 75,
 'time': 81,
 'use': 87,
 'messag': 44,
 'oh': 55,
 'ye': 97,
 'make': 42,
 'way': 91,
 'feel': 16,
 'dont': 14,
 'miss': 46,
 'ur': 86,
 'tri': 84,
 'da': 11,
 'lor': 39,
 'meet': 43,
 'realli': 63,
 'get': 20,
 'know': 33,
 'love': 40,
 'let': 37,
 'work': 95,
 'wait': 88,
 'yeah': 98,
 'tell': 76,
 'pleas': 61,
 'msg': 49,
 'see': 68,
 'pl': 60,
 'need': 51,
 'tomorrow': 83,
 'hope': 31,
 'well': 93,
 'lt': 41,
 'gt': 26,
 'ask': 1,
 'morn': 48,
 'happi': 27,
 'sorri': 72,
 'give': 21,
 'new': 52,
 'find': 17,
 'year': 99,
 'later': 35,
 'pi

## N-Gram

In [40]:
# N-gram bag of words: unigrams and bigrams in one vocabulary
from sklearn.feature_extraction.text import CountVectorizer

# Unigram-only baseline, kept for comparison:
# cv = CountVectorizer(ngram_range=(1, 1), max_features=100)
cv = CountVectorizer(
    ngram_range=(1, 2),  # (min_n, max_n): single words and adjacent word pairs
    max_features=500,    # keep only the 500 most frequent n-grams
)

In [41]:
# Refit on the same corpus with the n-gram vectorizer; X is overwritten with
# the new dense feature matrix.
X = cv.fit_transform(corpus).toarray()

In [42]:
cv.vocabulary_
# Mapping of n-gram -> feature column index. With ngram_range=(1,2) it holds
# two-word entries next to single words; examples noted from the output:
# 'let know': 227,
# 'chanc win': 62,
# 'free call': 144,

{'go': 156,
 'point': 333,
 'great': 166,
 'world': 483,
 'got': 164,
 'wat': 463,
 'ok': 297,
 'lar': 216,
 'wif': 473,
 'free': 143,
 'entri': 127,
 'win': 475,
 'final': 136,
 'st': 401,
 'may': 253,
 'text': 416,
 'receiv': 354,
 'question': 345,
 'txt': 446,
 'rate': 347,
 'appli': 18,
 'dun': 116,
 'say': 368,
 'earli': 118,
 'alreadi': 9,
 'think': 422,
 'goe': 158,
 'live': 234,
 'around': 20,
 'though': 424,
 'hey': 188,
 'week': 466,
 'word': 481,
 'back': 29,
 'like': 230,
 'fun': 150,
 'still': 404,
 'xxx': 490,
 'send': 376,
 'even': 128,
 'brother': 46,
 'speak': 399,
 'per': 314,
 'set': 380,
 'friend': 145,
 'network': 281,
 'custom': 88,
 'select': 374,
 'prize': 340,
 'claim': 68,
 'call': 51,
 'code': 74,
 'valid': 453,
 'hour': 195,
 'mobil': 264,
 'month': 267,
 'updat': 448,
 'latest': 220,
 'colour': 76,
 'camera': 55,
 'co': 72,
 'free call': 144,
 'gonna': 160,
 'home': 192,
 'soon': 395,
 'want': 462,
 'talk': 413,
 'stuff': 407,
 'tonight': 435,
 'enough': 12

In [None]:
# Bigram-only bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Unigram-only baseline, kept for comparison:
# cv = CountVectorizer(ngram_range=(1, 1), max_features=100)
cv = CountVectorizer(
    ngram_range=(2, 2),  # min_n == max_n == 2: adjacent word pairs only
    max_features=500,    # keep only the 500 most frequent bigrams
)
X = cv.fit_transform(corpus).toarray()

# Display the bigram -> column index mapping; sample entries:
# 'free entri': 125,
# 'rate appli': 336,
# 'per request': 306,
cv.vocabulary_

{'free entri': 125,
 'rate appli': 336,
 'per request': 306,
 'claim call': 53,
 'call claim': 23,
 'claim code': 54,
 'entitl updat': 111,
 'updat latest': 437,
 'latest colour': 216,
 'free call': 123,
 'call mobil': 31,
 'mobil updat': 266,
 'updat co': 436,
 'chanc win': 52,
 'win cash': 481,
 'repli hl': 344,
 'hl info': 187,
 'txt word': 431,
 'dont miss': 100,
 'ha ha': 176,
 'let know': 222,
 'feel like': 116,
 'mobil charg': 261,
 'repli ye': 346,
 'go home': 147,
 'anyth lor': 3,
 'call repli': 35,
 'nokia mobil': 285,
 'mobil free': 263,
 'free camcord': 124,
 'pleas call': 317,
 'deliveri tomorrow': 94,
 'lt gt': 239,
 'miss call': 258,
 'want go': 465,
 'first time': 121,
 'like lt': 226,
 'sm ac': 374,
 'bx ip': 20,
 'sorri call': 377,
 'call later': 29,
 'later meet': 213,
 'ok come': 295,
 'award bonu': 8,
 'prize call': 332,
 'ur award': 438,
 'hi babe': 184,
 'call free': 25,
 'that cool': 408,
 'hi hi': 186,
 'wen ur': 479,
 'call custom': 24,
 'custom servic': 84,
 

In [45]:
# Trigram-only bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Unigram-only baseline, kept for comparison:
# cv = CountVectorizer(ngram_range=(1, 1), max_features=100)
cv = CountVectorizer(
    ngram_range=(3, 3),  # min_n == max_n == 3: runs of three adjacent words only
    max_features=500,    # keep only the 500 most frequent trigrams
)
X = cv.fit_transform(corpus).toarray()

# Display the trigram -> column index mapping
cv.vocabulary_

{'free entri wkli': 141,
 'std txt rate': 388,
 'txt rate appli': 428,
 'set callertun caller': 363,
 'callertun caller press': 52,
 'caller press copi': 50,
 'press copi friend': 328,
 'copi friend callertun': 85,
 'call claim code': 32,
 'entitl updat latest': 118,
 'updat latest colour': 434,
 'free call mobil': 137,
 'call mobil updat': 42,
 'mobil updat co': 257,
 'updat co free': 433,
 'chanc win cash': 63,
 'repli hl info': 341,
 'like lt gt': 207,
 'bx ip pm': 30,
 'sorri call later': 376,
 'call later meet': 40,
 'pleas call custom': 313,
 'call custom servic': 33,
 'custom servic repres': 98,
 'servic repres pm': 362,
 'pm guarante cash': 317,
 'guarante cash prize': 164,
 'po box mk': 320,
 'tri contact last': 422,
 'contact last weekend': 82,
 'last weekend draw': 199,
 'weekend draw show': 471,
 'draw show prize': 113,
 'show prize guarante': 365,
 'prize guarante call': 335,
 'guarante call claim': 162,
 'claim code valid': 66,
 'valid hr ppm': 453,
 'enter cabin pa': 117