In [1]:
import pandas as pd

# Load the raw SMS dataset, keep only its first two columns, and give
# those columns descriptive names ('v1' -> 'label', 'v2' -> 'message').
messages = (
    pd.read_csv('spam.csv', sep=',', encoding="latin1")
    .iloc[:, :2]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
# Preview the first rows as the cell output.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

# One WordNetLemmatizer instance, created once and reused by the
# text-cleaning loop that builds the corpus.
lemmitizer = WordNetLemmatizer()

In [7]:
# Build the cleaned corpus: one normalised, space-joined string per message.
#
# The English stop-word list is fetched once and stored in a set. Calling
# stopwords.words('english') inside the comprehension produced a fresh list
# for every single token and then scanned it linearly.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly. messages['message'][i] is a
# label lookup and breaks as soon as the index is not exactly 0..n-1
# (for example after filtering or dropping rows).
for message in messages['message']:
    # Replace every non-letter with a space, lowercase, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words first, then lemmatize the tokens that remain.
    review = [lemmitizer.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(review))

## TF-IDF with N-Grams

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default (single-word) n-grams, capped at 2500
# features. Per the scikit-learn docs, binary=True makes the term-frequency
# part 0/1; IDF weighting and normalisation are still applied.
tfidf = TfidfVectorizer(
    max_features=2500,
    binary=True,
)

In [17]:
# Fit the vectorizer on the cleaned corpus, then convert the result to a
# dense NumPy array.
tfidf_sparse = tfidf.fit_transform(corpus)
X = tfidf_sparse.toarray()
# Show the learned term -> column-index mapping as the cell output.
tfidf.vocabulary_

{'go': 855,
 'point': 1603,
 'crazy': 482,
 'available': 142,
 'bugis': 282,
 'great': 879,
 'world': 2443,
 'la': 1140,
 'cine': 384,
 'got': 870,
 'wat': 2359,
 'ok': 1486,
 'lar': 1152,
 'joking': 1085,
 'wif': 2402,
 'oni': 1494,
 'free': 791,
 'entry': 656,
 'wkly': 2428,
 'comp': 424,
 'win': 2408,
 'cup': 495,
 'final': 742,
 'tkts': 2182,
 'st': 2013,
 'may': 1304,
 'text': 2138,
 'receive': 1719,
 'question': 1674,
 'std': 2028,
 'txt': 2247,
 'rate': 1696,
 'apply': 97,
 'dun': 615,
 'say': 1819,
 'early': 620,
 'already': 64,
 'nah': 1413,
 'think': 2155,
 'usf': 2298,
 'life': 1195,
 'around': 113,
 'though': 2162,
 'freemsg': 793,
 'hey': 937,
 'darling': 519,
 'week': 2377,
 'word': 2439,
 'back': 160,
 'like': 1201,
 'fun': 813,
 'still': 2030,
 'tb': 2113,
 'xxx': 2469,
 'send': 1853,
 'rcv': 1699,
 'even': 670,
 'brother': 272,
 'speak': 1995,
 'treat': 2223,
 'per': 1553,
 'request': 1760,
 'melle': 1321,
 'oru': 1516,
 'minnaminunginte': 1342,
 'nurungu': 1472,
 'vet

In [13]:
# Display the dense feature matrix (NumPy abbreviates large arrays with "...").
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### N-Grams

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram-only variant: ngram_range=(2, 2) restricts the features to
# two-word sequences. Note that this rebinds both `tfidf` and `X`.
tfidf = TfidfVectorizer(
    max_features=2500,
    binary=True,
    ngram_range=(2, 2),
)

# Fit on the cleaned corpus and convert the result to a dense array.
bigram_sparse = tfidf.fit_transform(corpus)
X = bigram_sparse.toarray()

# Show the learned bigram -> column-index mapping as the cell output.
tfidf.vocabulary_

{'ok lar': 1459,
 'free entry': 646,
 'entry wkly': 542,
 'wkly comp': 2433,
 'cup final': 412,
 'final tkts': 585,
 'std txt': 1941,
 'txt rate': 2198,
 'rate apply': 1665,
 'think go': 2041,
 'freemsg hey': 665,
 'like fun': 1142,
 'treat like': 2133,
 'per request': 1549,
 'request melle': 1733,
 'melle melle': 1263,
 'melle oru': 1264,
 'oru minnaminunginte': 1498,
 'minnaminunginte nurungu': 1292,
 'nurungu vettam': 1439,
 'vettam set': 2302,
 'set callertune': 1822,
 'callertune caller': 193,
 'caller press': 191,
 'press copy': 1633,
 'copy friend': 392,
 'friend callertune': 668,
 'winner valued': 2420,
 'valued network': 2300,
 'network customer': 1393,
 'reward claim': 1743,
 'claim call': 268,
 'call claim': 150,
 'claim code': 269,
 'code kl': 305,
 'valid hour': 2297,
 'entitled update': 539,
 'update latest': 2239,
 'latest colour': 1076,
 'free call': 641,
 'call mobile': 171,
 'mobile update': 1319,
 'update co': 2238,
 'co free': 293,
 'want talk': 2351,
 'chance win':