In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input mount,
# then echo the paths one per line.
input_files = (
    os.path.join(root, fname)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [None]:
# Load the message dataset into `text`; later cells read its `Category` and
# `Message` columns.
text = pd.read_csv("../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv")
print(f"Shape of text csv file >> {text.shape}")
# Last expression of the cell: displays five randomly chosen rows.
text.sample(5)

In [None]:
# Encode the target as integers: spam -> 1, ham -> 0.
# The result is assigned back to the column instead of calling an in-place
# method on `text.Category`: the attribute access may hand back a temporary
# Series, in which case an in-place replace never reaches `text` (pandas warns
# about this chained-assignment pattern). The explicit cast keeps the labels
# numeric regardless of how `replace` infers the dtype, and re-running the cell
# on already-encoded labels leaves them unchanged.
text["Category"] = text["Category"].replace({"spam": 1, "ham": 0}).astype(int)
sns.set_style("whitegrid")

def show_shape(data,showcount=False):
    """Print the dimensions of `data` and optionally plot its class counts.

    Parameters
    ----------
    data : object exposing a two-element ``.shape``
        The table to describe. When the plot is requested it is also handed to
        ``sns.countplot`` with ``x="Category"``.
    showcount : bool, default False
        Only a value comparing equal to ``True`` triggers the count plot.

    Returns
    -------
    None
    """
    banner = "=" * 40
    print(banner)
    print(data.shape)
    print(f"Number of Rows >> {data.shape[0]}")
    print(f"Number of Columns >> {data.shape[1]}")

    # Guard clause: stop here unless the plot was explicitly requested.
    if not (showcount == True):
        return

    sns.countplot(data=data, x="Category")
    plt.show()
    
# Report the dataset dimensions and draw the count plot of the `Category` column.
show_shape(text,showcount=True)

(1) Preprocess text data into padded sequences

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Inputs are the raw message strings; targets are the encoded categories.
data, label = text.Message, text.Category

# First pass: an unrestricted tokenizer, used only to report how many distinct
# words the corpus contains.
tok = Tokenizer()
tok.fit_on_texts(data)
print(len(tok.index_word))
print("Let: word_size >> 3000")

# Second pass: a tokenizer capped at `word_size`, which produces the integer
# sequences used by every model below.
word_size = 3000
tok = Tokenizer(num_words=word_size)
tok.fit_on_texts(data)
sequenced_data = tok.texts_to_sequences(data)
print("sequence of first sample >>", sequenced_data[0])


# Sequence-length statistics, computed once and reused for every summary line.
lengths = list(map(len, sequenced_data))
print(f"maximum sequence length >> {max(lengths)}")
print(f"minimum sequence length >> {min(lengths)}")
print(f"average sequence length >> {np.mean(lengths)}")
print(f"75% quantile of sequence lengths >> {np.quantile(lengths,0.75)}")

sequence_len = 25
print("\nlet: length of sequence be 25\n")


# Pad or truncate every sequence at its end so all rows have `sequence_len` entries.
pad_options = dict(maxlen=sequence_len, padding='post', truncating='post')
padded_data = pad_sequences(sequenced_data, **pad_options)
print("padded sequence of first two samples >>")
for idx in (0, 1):
    print(padded_data[idx])

(2) Split train & test datasets

In [None]:
# Hold out 20% of the padded sequences for testing. The split is stratified on
# `label` and uses a fixed random_state so it is the same on every run.
train_data,test_data,train_label,test_label = train_test_split(padded_data,label,random_state=42,test_size=0.2,stratify=label)
print(train_data.shape)
print(test_data.shape)

(3) Make a model and train and test

In [None]:
# Model with neither RNN nor LSTM layers
from keras.layers import Input,Dense,GlobalAveragePooling1D,Embedding

# Number of rows in every Embedding lookup table. One more than the tokenizer
# cap, presumably to leave room for the 0 used as padding (confirm against the
# Tokenizer / pad_sequences defaults).
vocab_size = word_size +1

def create_simple_model(word_vec_size = 32,n_dense=1):
    """Build and compile a bag-of-embeddings binary classifier.

    Architecture: Embedding -> GlobalAveragePooling1D -> (n_dense - 1) hidden
    Dense layers -> Dense(1, sigmoid).

    Parameters
    ----------
    word_vec_size : int, default 32
        Embedding dimension; also the width of each hidden Dense layer.
    n_dense : int, default 1
        Total number of Dense layers including the sigmoid output layer.
        ``1`` means output layer only, ``2`` adds one hidden layer, and larger
        values add further hidden layers. Values below 1 behave like 1.

    Returns
    -------
    keras Model
        Compiled with binary cross-entropy loss, the Adam optimizer and the
        accuracy metric. Reads the module-level `sequence_len` and `vocab_size`.
    """
    X = Input(shape=[sequence_len])
    H = Embedding(vocab_size,word_vec_size,input_length=sequence_len)(X)
    H = GlobalAveragePooling1D()(H)

    # Hidden layers need a non-linearity: a Dense layer without an activation is
    # a purely linear map, and stacking it in front of the output layer collapses
    # into a single linear layer, so it would add parameters but no capacity.
    for _ in range(max(int(n_dense) - 1, 0)):
        H = Dense(word_vec_size, activation='relu')(H)

    Y = Dense(1,activation='sigmoid')(H)

    model = keras.models.Model(X,Y)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

In [None]:
# Grid of simple models: embedding width in {32, 64} crossed with 1 or 2 dense
# layers, built in the order (32,1), (32,2), (64,1), (64,2).
models = [
    create_simple_model(width, depth)
    for width in (32, 64)
    for depth in (1, 2)
]
# Individual handles are kept because later cells refer to them by name.
model1, model2, model3, model4 = models

from keras.callbacks import ReduceLROnPlateau
# Learning-rate schedule shared by every training run: multiply the rate by 0.5
# after 3 epochs without improvement in validation accuracy, never below 1e-4.
reduceLR = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=3,
    verbose=0,
    min_lr=0.0001,
)

def train_and_test_model(model):
    """Fit `model` on the training split, then evaluate it on the test split.

    Uses the module-level `train_data`, `train_label`, `test_data`,
    `test_label` and `reduceLR` as they are bound at call time.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place for 10 epochs with batch size 32, holding out 20% of
        the training data for validation.

    Returns
    -------
    tuple
        ``(hist, ev)``: the object returned by ``model.fit`` and the result of
        ``model.evaluate`` on the test split.
    """
    fit_options = dict(
        epochs=10,
        batch_size=32,
        validation_split=0.2,
        verbose=0,
        callbacks=[reduceLR],
    )
    history = model.fit(train_data, train_label, **fit_options)

    rule = "=" * 15
    print(rule, "Results on Test dataset", rule)
    evaluation = model.evaluate(test_data, test_label)

    return history, evaluation

# Train and evaluate every simple model; the returned values are not kept.
for candidate in models:
    train_and_test_model(candidate)

In [None]:
#Model with RNN/LSTM: many-to-many,stacked,bidirectional

from keras.layers import Bidirectional,SimpleRNN,LSTM,TimeDistributed

# Embedding width shared by the recurrent models defined from this cell onwards.
word_vec_size=32

def create_complex_model(layer='rnn',hidden_size=32,n_dense=1):
    """Build and compile a two-layer bidirectional recurrent classifier.

    Architecture: Embedding -> 2 x Bidirectional(recurrent layer returning the
    full sequence) -> (n_dense - 1) time-distributed hidden Dense layers ->
    time-distributed Dense(1, sigmoid).

    Because every recurrent layer returns sequences and the head is applied per
    time step, the model emits one probability per position of the input.
    NOTE(review): the training labels are one value per message; presumably
    Keras broadcasts them across the time axis when computing the loss.
    Confirm this many-to-many setup is intended.

    Parameters
    ----------
    layer : str, default 'rnn'
        ``'rnn'`` selects SimpleRNN; any other value selects LSTM.
    hidden_size : int, default 32
        Units per direction of each recurrent layer, and the width of each
        hidden Dense layer.
    n_dense : int, default 1
        Total number of Dense layers including the sigmoid output layer.
        ``1`` means output layer only, ``2`` adds one hidden layer, and larger
        values add further hidden layers. Values below 1 behave like 1.

    Returns
    -------
    keras Model
        Compiled with binary cross-entropy loss, the Adam optimizer and the
        accuracy metric. Reads the module-level `sequence_len`, `vocab_size`
        and `word_vec_size`.
    """
    # Pick the recurrent layer class once instead of duplicating the stack per branch.
    recurrent_cls = SimpleRNN if layer=='rnn' else LSTM

    X = Input(shape=[sequence_len])
    H = Embedding(vocab_size,word_vec_size,input_length=sequence_len)(X)
    for _ in range(2):
        H = Bidirectional(recurrent_cls(hidden_size,return_sequences=True))(H)

    # Hidden layers need a non-linearity: a Dense layer without an activation is
    # a purely linear map, and stacking it in front of the output layer collapses
    # into a single linear layer, so it would add parameters but no capacity.
    for _ in range(max(int(n_dense) - 1, 0)):
        H = TimeDistributed(Dense(hidden_size, activation='relu'))(H)

    # The output head is wrapped the same way whatever n_dense is.
    Y = TimeDistributed(Dense(1,activation='sigmoid'))(H)

    model = keras.models.Model(X,Y)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

In [None]:
# RNN models: (hidden_size, n_dense) grid, built in the listed order.
rnns = [
    create_complex_model('rnn', hidden, dense)
    for hidden, dense in ((32, 1), (32, 2), (64, 1), (64, 2))
]
rnn1, rnn2, rnn3, rnn4 = rnns

# Train and evaluate each SimpleRNN variant in turn.
for rnn in rnns:
    train_and_test_model(rnn)

In [None]:
# LSTM models: (hidden_size, n_dense) grid, built in the listed order.
lstms = [
    create_complex_model('lstm', hidden, dense)
    for hidden, dense in ((32, 1), (32, 2), (64, 1), (64, 2))
]
# Individual handles are kept because a later cell plots lstm3 by name.
lstm1, lstm2, lstm3, lstm4 = lstms

# Train and evaluate each LSTM variant in turn.
for lstm in lstms:
    train_and_test_model(lstm)

# best model hyper parameter: LSTM,bidirectional,many-to-many,double-stacked,hidden_size:64,n_dense:1
# test_acc: 0.9851, test_loss:0.1009

(4) Further improvements
- Add Dropout Layer

In [None]:
from keras.layers import Dropout

def create_drop_lstm(hidden_size=64):
    """Build and compile a three-layer bidirectional LSTM with dropout in between.

    Architecture: Embedding -> BiLSTM -> Dropout(0.2) -> BiLSTM -> Dropout(0.2)
    -> BiLSTM -> time-distributed Dense(1, sigmoid). Every LSTM returns the full
    sequence, so the model emits one probability per input position.

    Parameters
    ----------
    hidden_size : int, default 64
        Units per direction of each LSTM layer.

    Returns
    -------
    keras Model
        Compiled with binary cross-entropy loss, the Adam optimizer and the
        accuracy metric. Reads the module-level `sequence_len`, `vocab_size`
        and `word_vec_size`.
    """
    inputs = Input(shape=[sequence_len])
    hidden = Embedding(vocab_size,word_vec_size,input_length=sequence_len)(inputs)

    # Three recurrent blocks; dropout sits between consecutive blocks only.
    for block_index in range(3):
        if block_index > 0:
            hidden = Dropout(0.2)(hidden)
        hidden = Bidirectional(LSTM(hidden_size,return_sequences=True))(hidden)

    outputs = TimeDistributed(Dense(1,activation='sigmoid'))(hidden)

    model = keras.models.Model(inputs,outputs)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

In [None]:
from keras.utils import plot_model

# Build the dropout variant with its default hidden size, train and evaluate it
# (the returned history/metrics are not kept), then draw its architecture.
lstm_drop = create_drop_lstm()
train_and_test_model(lstm_drop)
plot_model(lstm_drop)

In [None]:
from keras.utils import plot_model

# Draw the architecture of lstm3 (LSTM, hidden_size=64, n_dense=1), the
# configuration the notes in the LSTM cell report as best.
plot_model(lstm3)

(5) Test the model with self-made test sets

In [None]:
# Hand-written message: run it through the fitted tokenizer and the same
# post-padding used for training, then score it with model4
# (the create_simple_model(64,2) instance).
mail = ["Hi my name is Jeong Hyeon Ho. I am now in incheon because of my military service. But guess what? It's less than 2 months left now! There are so many things to do when I finally finish my service. I just wanted to tell you that I just miss you so much.."]
sequenced_mail = tok.texts_to_sequences(mail)
sample_data = padded_mail = pad_sequences(
    sequenced_mail,
    maxlen=sequence_len,
    padding='post',
    truncating='post',
)
print(padded_mail)
print(sample_data.shape)

pred = model4.predict(sample_data)
print(pred)

In [None]:
# Hand-written promotional message: run it through the fitted tokenizer and the
# same post-padding used for training, then score it with model4
# (the create_simple_model(64,2) instance).
mail = ["Congratulations! you have won a $1,000 Walmart gift card. Come visit this website! Don't miss your chance!"]
sequenced_mail = tok.texts_to_sequences(mail)
sample_data = padded_mail = pad_sequences(
    sequenced_mail,
    maxlen=sequence_len,
    padding='post',
    truncating='post',
)
print(padded_mail)
print(sample_data.shape)

pred = model4.predict(sample_data)
print(pred)

Extra) Bayesian Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

# Start again from the raw messages: the Naive Bayes models use TF-IDF features
# rather than padded token sequences.
# NOTE: this cell rebinds data/label/train_data/test_data/train_label/test_label,
# which train_and_test_model reads as globals. Re-run the earlier split cell
# before training any Keras model after this point.
data = text.Message
label = text.Category
print("data shape >> ",data.shape)

# Fixed random_state, matching the earlier split, so the reported accuracies are
# reproducible from run to run.
train_data,test_data,train_label,test_label = train_test_split(data,label,test_size=0.2,stratify=label,random_state=42)
print("train test split done now!")
print("train data shape >> ",train_data.shape)
print("test data shape >> ",test_data.shape)

# Fit the bag-of-words vocabulary on the training messages only.
vectorizer = CountVectorizer()
train_data = vectorizer.fit_transform(train_data)

# Re-weight the counts with TF-IDF, again fitted on the training split only.
transformer = TfidfTransformer()
train_data = transformer.fit_transform(train_data)
# Converted to a dense array; GaussianNB, used below, is fitted on this array.
train_data = train_data.toarray()

def preprocess(data):
    """Convert raw documents to TF-IDF features with the already-fitted pipeline.

    Applies the module-level `vectorizer` and then `transformer`, calling only
    their ``transform`` methods so nothing is re-fitted on `data`.

    Parameters
    ----------
    data : iterable of str
        Documents to convert.

    Returns
    -------
    The output of ``transformer.transform`` for the vectorized documents.
    """
    return transformer.transform(vectorizer.transform(data))

# Push the held-out messages through the fitted pipeline and densify the result
# in one step, mirroring what was done for the training features.
test_data = preprocess(test_data).toarray()

print("Preprocessing done now!")
print(f"train dataset shape >> {train_data.shape}")
print(f"test dataset shape >> {test_data.shape}")

In [None]:
# Naive Bayes Classifier
# GaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


model = GaussianNB()
def test(model):
    """Fit `model` on the training features and print train and test accuracy.

    Uses the module-level `train_data`, `train_label`, `test_data` and
    `test_label` as they are bound at call time.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Fitted in place on the training split.

    Returns
    -------
    The same `model` object, now fitted.
    """
    model.fit(train_data,train_label)

    # Score the training split first, then the held-out split.
    splits = (
        ("train_acc:", train_data, train_label),
        ("test_acc:", test_data, test_label),
    )
    for tag, features, targets in splits:
        predictions = model.predict(features)
        print(tag, accuracy_score(targets, predictions))

    return model

test(model)

In [None]:
# BernoulliNB: performs well for binary classification of text data
from sklearn.naive_bayes import BernoulliNB

# Fit and score a Bernoulli Naive Bayes model on the same TF-IDF features.
# NOTE(review): BernoulliNB presumably binarizes its inputs with its default
# settings, so the TF-IDF weights would act as word-present/absent flags. Confirm.
model = BernoulliNB()
test(model)

In [None]:
#MultinomialNB
from sklearn.naive_bayes import MultinomialNB

# Fit and score a Multinomial Naive Bayes model on the same TF-IDF features;
# `test` prints the train and test accuracy and returns the fitted model.
model = MultinomialNB()
test(model)