In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df = df[['v1','v2']]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Replace special characters with nothing and apply lowercase in string fir each row
special_char = '@_!#$%^&*()<>?/\|}{~:.,;[]'
for i, row in df.iterrows():
    df.iloc[i, 1] = ''.join(x for x in df.iloc[i, 1] if not x in special_char)
    df.iloc[i, 1] = df.iloc[i, 1].lower()
df.head()

Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don't think he goes to usf he lives arou...


In [5]:
# Get uniques words from all text in sms dataframe
all_words = []
for i, row in df.iterrows():
    list_words = df.iloc[i, 1].split(' ')
    all_words = all_words + list_words
all_words = pd.DataFrame(all_words)
print('n all words:', all_words.shape[0])
uniques_words = all_words[0].unique()
print('n uniques words:', uniques_words.shape[0])

n all words: 86961
n uniques words: 9877


In [6]:
# Create X df with the count for each word from the sms text
y = df.iloc[:,0]
X = pd.DataFrame(0, index=df.index, columns=uniques_words)
for i, row in df.iterrows() :
    list_words = row[1].split(' ')
    for word in list_words :
        X.loc[i,[word]] = X.loc[i,[word]] + [1 for col in X.columns if word == col]
X

Unnamed: 0,go,until,jurong,point,crazy,available,only,in,bugis,n,...,heap,lowes,salesman,å£750,087187272008,now1,pity,soany,suggestions,bitching
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,1,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,1,0
5570,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3900, 9877), (3900,), (1672, 9877), (1672,))

In [8]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [9]:
clf.predict_proba(X_test)

array([[9.99985635e-01, 1.43649976e-05],
       [9.99998570e-01, 1.42978149e-06],
       [9.89991029e-01, 1.00089713e-02],
       ...,
       [9.99998694e-01, 1.30633157e-06],
       [9.99991230e-01, 8.76999227e-06],
       [3.75767071e-04, 9.99624233e-01]])

In [10]:
y_pred = clf.predict(X_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'spam'], dtype='<U4')

In [11]:
scores = cross_val_score(clf, X, y, scoring="accuracy", cv=10)
scores

array([0.98387097, 0.97670251, 0.98025135, 0.97845601, 0.97486535,
       0.97486535, 0.97127469, 0.98025135, 0.96947935, 0.98204668])

In [12]:
accuracy_score(y_test, y_pred)

0.9754784688995215

In [13]:
confusion_matrix(y_test, y_pred)

array([[1430,   21],
       [  20,  201]])

## Trabajo en clase de ML - Cross Validation (4/3/22) 

In [14]:
sort_std_words = X.std().sort_values(ascending = False)
sort_std_words

               1.061846
you            0.771848
i              0.767253
to             0.721431
u              0.579473
                 ...   
dump           0.013397
pity           0.013397
soany          0.013397
suggestions    0.013397
bitching       0.013397
Length: 9877, dtype: float64

In [15]:
sort_std_words.index

Index(['', 'you', 'i', 'to', 'u', 'the', 'a', 'my', 'and', 'is',
       ...
       'now1', '087187272008', 'lowes', 'salesman', 'heap', 'dump', 'pity',
       'soany', 'suggestions', 'bitching'],
      dtype='object', length=9877)

In [16]:
X_top100_idx = sort_std_words[0:100]
X_top100_idx.index

Index(['', 'you', 'i', 'to', 'u', 'the', 'a', 'my', 'and', 'is', 'it', 'in',
       'me', 'for', 'your', 'of', 'ltgt', '2', 'ur', 'have', 'call', 'that',
       'do', 'on', 'are', 'be', 'can', 'will', 'if', 'now', 'so', 'not', 'but',
       'or', 'at', 'with', 'i'm', 'we', 'get', 'free', 'just', 'this', 'no',
       'when', 'happy', '4', 'was', 'up', 'd', 'go', 'he', 'all', 'from',
       'out', 'how', 'know', 'ok', 'like', 'what', 'good', 'come', 'she',
       'got', 'then', 'love', 'its', 'am', 'there', 'day', 'as', 'hi', 'time',
       'only', 'text', 'n', 'want', 'one', 'send', 'about', 'lor', 'by',
       'stop', 'r', 'her', 'ì', 'txt', 'going', 'need', 'don't', 'who',
       'sorry', 'mobile', 'they', 'da', 'reply', 'i'll', 'home', 'dont',
       'back', 'our'],
      dtype='object')

In [17]:
X_top100 = X[X_top100_idx.index]
X_top100

Unnamed: 0,Unnamed: 1,you,i,to,u,the,a,my,and,is,...,sorry,mobile,they,da,reply,i'll,home,dont,back,our
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,3,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,2,2,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5569,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
indx = np.arange(5572)
np.random.shuffle(indx)
indx

array([3443, 1625, 5078, ..., 3354, 5086, 1717])

In [19]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5, shuffle = True)
kf

KFold(n_splits=5, random_state=None, shuffle=True)

In [20]:
kf.get_n_splits(X_top100)

5