In [27]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [7]:
data = pd.read_csv('./input_data/spam.csv',encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
source =data['v2']
type(source)

pandas.core.series.Series

In [10]:
source.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [11]:
target =data['v1']
type(target)

pandas.core.series.Series

In [13]:
target.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

In [14]:
target = pd.get_dummies(target, drop_first=True)
target.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [15]:
cv= CountVectorizer(stop_words='english')
cv

In [16]:
cv.fit(source)

In [17]:
cv.get_feature_names_out()

array(['00', '000', '000pes', ..., 'ûïharry', 'ûò', 'ûówell'],
      dtype=object)

In [21]:
list(cv.vocabulary_.keys())

['jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat',
 'ok',
 'lar',
 'joking',
 'wif',
 'oni',
 'free',
 'entry',
 'wkly',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 '2005',
 'text',
 '87121',
 'receive',
 'question',
 'std',
 'txt',
 'rate',
 'apply',
 '08452810075over18',
 'dun',
 'say',
 'early',
 'hor',
 'nah',
 'don',
 'think',
 'goes',
 'usf',
 'lives',
 'freemsg',
 'hey',
 'darling',
 'week',
 'word',
 'like',
 'fun',
 'tb',
 'xxx',
 'chgs',
 'send',
 '50',
 'rcv',
 'brother',
 'speak',
 'treat',
 'aids',
 'patent',
 'request',
 'melle',
 'oru',
 'minnaminunginte',
 'nurungu',
 'vettam',
 'set',
 'callertune',
 'callers',
 'press',
 'copy',
 'friends',
 'winner',
 'valued',
 'network',
 'customer',
 'selected',
 'receivea',
 '900',
 'prize',
 'reward',
 'claim',
 '09061701461',
 'code',
 'kl341',
 'valid',
 '12',
 'hours',
 'mobile',
 '11',
 'months',
 'entitled',
 'update',
 'latest',
 

In [22]:
# Apply the vectorizer
cv_transformed = cv.transform(source)
cv_array= cv_transformed.toarray()

In [23]:
cv_array.shape

(5572, 8404)

In [24]:
from scipy import sparse

In [26]:
a0 = sparse.csr_matrix(cv_array[0])
print(a0)

  (0, 1051)	1
  (0, 1271)	1
  (0, 1701)	1
  (0, 1703)	1
  (0, 1994)	1
  (0, 2271)	1
  (0, 3494)	1
  (0, 3534)	1
  (0, 4224)	1
  (0, 4349)	1
  (0, 5741)	1
  (0, 8026)	1
  (0, 8227)	1


## Tf-IDF

In [29]:
tv = TfidfVectorizer(max_features=500, stop_words='english')
tv

In [31]:
tv.fit_transform(source)

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [32]:
tv.get_feature_names_out()

array(['000', '10', '100', '1000', '10p', '11', '12hrs', '150', '150p',
       '150ppm', '16', '18', '1st', '2000', '250', '2nd', '50', '500',
       '5000', '750', '800', '8007', '86688', 'able', 'abt', 'account',
       'actually', 'address', 'aft', 'afternoon', 'ah', 'aight',
       'alright', 'amp', 'angry', 'answer', 'apply', 'ard', 'ask',
       'asked', 'attempt', 'auction', 'available', 'await', 'award',
       'awarded', 'away', 'awesome', 'babe', 'baby', 'bad', 'beautiful',
       'bed', 'believe', 'best', 'better', 'big', 'birthday', 'bit',
       'bonus', 'book', 'bored', 'box', 'boy', 'bring', 'brother', 'bt',
       'bus', 'busy', 'buy', 'called', 'calling', 'calls', 'came',
       'camera', 'car', 'care', 'cash', 'cause', 'chance', 'change',
       'charge', 'chat', 'check', 'chikku', 'choose', 'claim', 'class',
       'close', 'club', 'code', 'collect', 'collection', 'colour', 'com',
       'come', 'comes', 'coming', 'congrats', 'contact', 'cool', 'cos',
       'cost', 

In [34]:
list(tv.vocabulary_.keys())[0:5]

['available', 'great', 'world', 'got', 'wat']

In [35]:
# Transform the data
tv_transformed = tv.transform(source)
tv_array= tv_transformed.toarray()

In [37]:
tv_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
tv_df = pd.DataFrame(tv_transformed.toarray(),
                    columns=tv.get_feature_names_out()).add_prefix('TFIDF')
tv_df

Unnamed: 0,TFIDF000,TFIDF10,TFIDF100,TFIDF1000,TFIDF10p,TFIDF11,TFIDF12hrs,TFIDF150,TFIDF150p,TFIDF150ppm,...,TFIDFya,TFIDFyeah,TFIDFyear,TFIDFyears,TFIDFyes,TFIDFyesterday,TFIDFyo,TFIDFyup,TFIDFì_,TFIDFìï
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.291468,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5568,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.620245,0.0
5569,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5570,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [39]:
examine_row = tv_df.iloc[0]
examine_row.sort_values(ascending=False).head()

TFIDFavailable    0.549238
TFIDFworld        0.496702
TFIDFwat          0.410286
TFIDFgreat        0.405632
TFIDFgot          0.344604
Name: 0, dtype: float64

## Tf-IDF và N-gram

In [41]:
tv_bi_gram_vec = TfidfVectorizer(ngram_range=(1,2),
                                stop_words='english')

In [42]:
tv_bi_gram = tv_bi_gram_vec.fit_transform(source)

In [43]:
tv_bi_gram_vec.get_feature_names_out()

array(['00', '00 easter', '00 sub', ..., 'ûò stick', 'ûówell',
       'ûówell û_'], dtype=object)

In [44]:
list(tv_bi_gram_vec.vocabulary_.keys())[0:5]

['jurong', 'point', 'crazy', 'available', 'bugis']

In [47]:
tv_df = pd.DataFrame(tv_bi_gram.toarray(),
                    columns=tv_bi_gram_vec.get_feature_names_out()).add_prefix('Count_')
tv_sums = tv_df.sum()

In [49]:
tv_sums

Count_00            1.540984
Count_00 easter     0.168684
Count_00 sub        0.838168
Count_00 subs       0.346519
Count_000           4.507586
                      ...   
Count_ûò limping    0.212502
Count_ûò sound      0.313509
Count_ûò stick      0.171044
Count_ûówell        0.271743
Count_ûówell û_     0.271743
Length: 37249, dtype: float64