In [1]:
import pandas as pd 
messages=pd.read_csv('SpamClassifier-master/smsspamcollection/SMSSpamCollection', sep='\t',names=["label","message"])

In [2]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
## Data cleaning and preprocessing 

import re 
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Porter Stemmer

In [4]:
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [5]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA_Z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [6]:
corpus

['jurong point crazi avail bugi n great world la e buffet ine got amor wat',
 'k lar oke wif u oni',
 'ree entri wkli comp win final tkt st ay ext receiv entri question std txt rate appli',
 'dun say earli hor c alreadi say',
 'ah think goe usf live around though',
 'ree sg ey darl week word back like fun still b ok x std chg send rcv',
 'ven brother like speak hey treat like aid patent',
 'per request ell ell ru innaminungint urungu ettam set callertun aller ress copi friend allertun',
 'valu network custom select receivea prize reward claim call laim code alid hour',
 'ad mobil month entitl pdate latest colour mobil camera ree obil pdate',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'chanc win rom pound txt send ost p day day sand appli epli info',
 'ou week membership rize ackpot xt word www dbuk net',
 'search right word thank breather promis wont take help grant fulfil promis ou wonder bless time',
 '',
 'obil ovi lub use credit click link next txt messa

## Snowball Stemmer

In [7]:
from nltk.stem import SnowballStemmer
snowballsstemmer=SnowballStemmer('english')

In [11]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA_Z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review = [snowballsstemmer.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [12]:
corpus

['jurong point crazi avail bugi n great world la e buffet ine got amor wat',
 'k lar oke wif u oni',
 'ree entri wkli comp win final tkts st ay ext receiv entri question std txt rate appli',
 'dun say earli hor c alreadi say',
 'ah think goe usf live around though',
 'ree sg ey darl week word back like fun still b ok x std chgs send rcv',
 'ven brother like speak hey treat like aid patent',
 'per request ell ell ru innaminungint urungu ettam set callertun aller ress copi friend allertun',
 'valu network custom select receivea prize reward claim call laim code alid hour',
 'ad mobil month entitl pdate latest colour mobil camera ree obil pdate',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'chanc win rom pound txt send ost p day day sand appli epli info',
 'ou week membership rize ackpot xt word www dbuk net',
 'search right word thank breather promis wont take help grant fulfil promis ou wonder bless time',
 '',
 'obil ovi lub use credit click link next txt mes

## Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [18]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA_Z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [19]:
corpus

['jurong point crazy available bugis n great world la e buffet ine got amore wat',
 'k lar oking wif u oni',
 'ree entry wkly comp win final tkts st ay ext receive entry question std txt rate apply',
 'dun say early hor c already say',
 'ah think go usf life around though',
 'ree sg ey darling week word back like fun still b ok x std chgs send rcv',
 'ven brother like speak hey treat like aid patent',
 'per request elle elle ru innaminunginte urungu ettam set callertune allers res copy friend allertune',
 'valued network customer selected receivea prize reward claim call laim code alid hour',
 'ad mobile month entitled pdate latest colour mobile camera ree obile pdate',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'chance win rom pound txt send ost p day day sand apply eply info',
 'ou week membership rize ackpot xt word www dbuk net',
 'searching right word thank breather promise wont take help granted fulfil promise ou wonderful blessing time',
 '',
 'obi

In [17]:
## create the bag of words

from sklearn.feature_extraction.text import CountVectorizer
# for Binary BOW enable binary=True 
cv=CountVectorizer(max_features=100,binary=True)

In [18]:
X=cv.fit_transform(corpus).toarray()

In [19]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## N-grams

In [21]:
cv.vocabulary_

{'great': 31,
 'got': 30,
 'wat': 94,
 'txt': 87,
 'say': 72,
 'alreadi': 0,
 'think': 81,
 'ey': 21,
 'week': 96,
 'back': 5,
 'like': 42,
 'still': 75,
 'ok': 58,
 'send': 74,
 'ell': 16,
 'friend': 25,
 'prize': 68,
 'claim': 8,
 'call': 6,
 'mobil': 51,
 'home': 35,
 'want': 93,
 'today': 83,
 'day': 13,
 'ou': 63,
 'xt': 99,
 'www': 98,
 'right': 70,
 'take': 77,
 'time': 82,
 'messag': 48,
 'es': 18,
 'make': 46,
 'way': 95,
 'feel': 22,
 'hat': 33,
 'dont': 14,
 'miss': 50,
 'ur': 88,
 'ox': 65,
 'go': 28,
 'tri': 85,
 'hen': 34,
 'da': 12,
 'lor': 43,
 'meet': 47,
 'ust': 90,
 'get': 26,
 'know': 39,
 'id': 37,
 'amp': 1,
 'us': 89,
 'ill': 38,
 'ut': 91,
 'ot': 62,
 'tell': 78,
 'anyth': 2,
 'repli': 69,
 'msg': 52,
 'see': 73,
 'ow': 64,
 'text': 79,
 'need': 54,
 'love': 44,
 'free': 24,
 'tomorrow': 84,
 'hope': 36,
 'lt': 45,
 'gt': 32,
 'ask': 3,
 'even': 20,
 'give': 27,
 'orri': 61,
 'later': 40,
 'pick': 67,
 'good': 29,
 'ts': 86,
 'come': 10,
 'said': 71,
 'end': 17,

In [34]:
## create the bag of words model with ngram

from sklearn.feature_extraction.text import CountVectorizer
# for Binary BOW enable binary=True 
cv=CountVectorizer(max_features=100,binary=True,ngram_range=(3,3))
X=cv.fit_transform(corpus).toarray()

In [35]:
cv.vocabulary_

{'like lt gt': 33,
 'orri call later': 57,
 'call custom servic': 7,
 'custom servic repres': 17,
 'servic repres pm': 73,
 'guarante cash prize': 29,
 'draw show prize': 19,
 'special select receiv': 76,
 'live oper claim': 35,
 'account tatement show': 0,
 'dentifi ode xpire': 18,
 'onu aller rize': 56,
 'number end select': 48,
 'end select receiv': 22,
 'select receiv award': 72,
 'admir look make': 2,
 'look make contact': 36,
 'make contact find': 42,
 'contact find reveal': 15,
 'find reveal think': 26,
 'reveal think special': 69,
 'think special call': 82,
 'ongratul ur award': 55,
 'draw txt www': 20,
 'txt www dew': 87,
 'www dew com': 98,
 'anytim network min': 3,
 'cant pick phone': 10,
 'pick phone right': 64,
 'phone right ls': 63,
 'right ls send': 71,
 'ls send messag': 37,
 'ad mobil mth': 1,
 'cash balanc current': 12,
 'maxim ur cash': 43,
 'ur cash send': 91,
 'uit and ow': 89,
 'obil pd call': 50,
 'pd call optout': 61,
 'lt gt min': 39,
 'obil number award': 49,


In [36]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)