In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [3]:
spam = pd.read_csv("spam.csv")

In [4]:
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
spam.shape

(5572, 2)

In [6]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(spam['Category'])
print(y)

[0 0 1 ... 0 0 0]


In [7]:
mensagens = spam['Message'].values

In [8]:
X_train, x_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3)

In [9]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)
token.fit_on_texts(x_test)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(x_test)

In [None]:
print(X_train)

[[301, 71, 10, 45, 3, 658, 52], [534, 60, 29, 74, 840, 1, 282, 67, 214, 390, 8, 5, 373, 7, 841, 555, 74, 72, 4], [195, 13, 97, 146, 118, 109, 336, 36, 4, 609, 153, 267, 16, 47, 659, 302, 129, 516, 686, 65, 413], [303, 301, 130, 147, 1, 796, 18, 3, 26, 8, 3], [3, 22, 295, 221, 31, 13, 97, 755, 47, 143, 187, 16, 633, 456, 610, 12, 65, 158], [718, 156, 77, 2, 198, 535, 80], [1, 108, 556, 75, 2, 21, 59], [391, 1, 226, 31, 536, 52, 634, 57, 14, 26, 196], [635, 191, 374, 337, 111, 9, 13, 16, 471, 375, 413, 16, 47, 901, 252, 660, 636], [3, 8, 13, 373, 1, 78, 4, 296], [50, 27, 247, 253, 199, 8, 110, 137, 5, 457, 842, 20, 199, 357, 351, 338, 7, 199, 5, 338], [687, 123, 22, 3, 80, 21], [84, 1, 222, 253, 3, 18, 40], [37, 10, 20, 3, 92, 175], [1, 62, 557, 88, 7, 1, 285, 2, 3, 71, 4, 94, 2, 392, 89, 47, 10, 188, 234, 131, 94, 85, 2, 318], [46, 82, 430, 1, 268, 39, 204, 31, 21, 176, 55, 365, 232, 154, 6, 248, 19, 49, 414, 41, 32, 4, 25, 902, 9, 472, 167, 21], [1, 62, 637, 15, 75, 243, 2, 12, 11, 263

In [10]:
X_train = pad_sequences(X_train, padding='post', maxlen=500)
X_test = pad_sequences(X_test, padding='post', maxlen=500)

In [11]:
X_train

array([[208,  25, 381, ...,   0,   0,   0],
       [ 53,  63, 146, ...,   0,   0,   0],
       [ 39,  22,   2, ...,   0,   0,   0],
       ...,
       [395, 305,  39, ...,   0,   0,   0],
       [  5, 148, 192, ...,   0,   0,   0],
       [ 45, 115,   1, ...,   0,   0,   0]], dtype=int32)

In [12]:
len(token.word_index)

9004

In [13]:
modelo = Sequential()
modelo.add(Embedding(input_dim=len(token.word_index), output_dim=50, input_length=500))
modelo.add(Flatten())
modelo.add(Dense(units=10, activation='relu'))
modelo.add(Dropout(0.1))
modelo.add(Dense(units=1, activation='sigmoid'))

In [14]:
modelo.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [15]:
modelo.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           450200    
                                                                 
 flatten (Flatten)           (None, 25000)             0         
                                                                 
 dense (Dense)               (None, 10)                250010    
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 700221 (2.67 MB)
Trainable params: 700221 (2.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
modelo.fit(X_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(X_train, y_train))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7e4d5aa29600>

In [18]:
nova_previsao = modelo.predict(X_test)
print(nova_previsao)

[[1.6817615e-12]
 [1.9252228e-07]
 [9.0182695e-04]
 ...
 [1.7589082e-05]
 [1.7716503e-10]
 [1.3110039e-09]]


In [19]:
prev = (nova_previsao > 0.5)
print(prev)

[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [20]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[1428   11]
 [  19  214]]
