In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input,Dense,Flatten,GlobalAveragePooling1D,Embedding,SimpleRNN,LSTM
from sklearn.model_selection import train_test_split

# Read the SMS spam CSV using the latin1 encoding and report (rows, columns).
raw_text = pd.read_csv(
    "../input/sms-spam-collection-dataset/spam.csv",
    encoding="latin1",
)
print(raw_text.shape)

# Bar chart of how many rows carry each value of the `v1` column.
plt.figure(figsize=(8, 8))
sns.countplot(x="v1", data=raw_text)
plt.show()

# Show five randomly chosen rows.
raw_text.sample(5)

In [None]:
# Remove the three "Unnamed" columns. `errors='ignore'` keeps this cell re-runnable:
# on a second execution the columns are already gone and a plain drop would raise
# KeyError.
raw_text = raw_text.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)
# Encode the label column: ham -> 0, spam -> 1. The explicit cast pins an integer
# dtype instead of relying on `replace` to downcast the object column implicitly;
# it also fails loudly if any value other than ham/spam (or 0/1 on a re-run) is present.
raw_text['v1'] = raw_text['v1'].replace({'ham': 0, 'spam': 1}).astype('int64')
raw_text.sample(5)

In [None]:
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
raw_text.info()
print(f"\nAny NA values? >> {raw_text.isnull().any()}")
# Number of distinct message texts in `v2`.
print("\n",raw_text.v2.nunique())
# Keep only the first row for each distinct message text.
raw_text = raw_text.drop_duplicates(subset=['v2'])
print(f"Total rows >> {raw_text.shape[0]}")

In [None]:
# Message texts (`v2`) and their labels (`v1`) from the frame.
data = raw_text["v2"]
label = raw_text["v1"]

# Value handed to the tokenizer's `num_words` argument.
WORD_SIZE = 5000

# Build the word index from all messages, then encode each message as a list of word ids.
tokenizer = Tokenizer(num_words=WORD_SIZE)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
# Display the first two encoded messages.
sequences[:2]

In [None]:
# Keep both lookup directions of the fitted tokenizer: word -> id and id -> word.
word_to_index, index_to_word = tokenizer.word_index, tokenizer.index_word

In [None]:
# Number of distinct words the tokenizer indexed.
print(len(word_to_index))

# Size used as the embedding input dimension: the tokenizer's word cap plus one.
vocab_size = 1 + WORD_SIZE
print(f"단어 집합의 크기:{vocab_size}")

# Length, in tokens, of the longest encoded message.
max_len = max(map(len, sequences))
print(f"최대 문장 길이(단어수):{max_len}")

# Pad (with zeros at the end) or cut (at the end) every sequence to `sequence_size` entries.
sequence_size = 180
data = pad_sequences(
    sequences,
    maxlen=sequence_size,
    padding='post',
    truncating='post',
)
print(data.shape)
# Display the first three padded rows.
data[:3]

In [None]:
# Train Test Split

# Stratified split on the label so both parts keep the same class ratio.
# A fixed seed makes the split, and every score computed from it, repeatable
# across kernel restarts; without it each run evaluates on a different test set.
SPLIT_SEED = 42
train_data, test_data, train_label, test_label = train_test_split(
    data,
    label,
    stratify=label,
    random_state=SPLIT_SEED,
)
print(f"shape of train data >> {train_data.shape}")
print(f"shape of test data >> {test_data.shape}")

In [None]:
# Without RNN
word_vec_size = 64  # length of each word embedding vector


def create_simple_model():
    """Build and compile the baseline classifier that has no recurrent layer.

    Token ids are embedded, averaged over the sequence axis, passed through a
    dense layer with `word_vec_size` units, and reduced to one sigmoid output.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=[sequence_size])
    embedded = Embedding(
        input_dim=vocab_size,
        output_dim=word_vec_size,
        input_length=sequence_size,
    )(token_ids)
    pooled = GlobalAveragePooling1D()(embedded)
    features = Dense(word_vec_size)(pooled)
    probability = Dense(1, activation='sigmoid')(features)

    net = keras.models.Model(token_ids, probability)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

# Train the baseline for 10 epochs with 20% of the training rows held out for
# validation, score it on the test split, then print the layer summary.
simple_model = create_simple_model()
hist = simple_model.fit(
    train_data,
    train_label,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)
evaluation = simple_model.evaluate(test_data, test_label)
simple_model.summary()


def plot_graph(hist):
    """Plot the loss and accuracy curves recorded during training, side by side.

    Args:
        hist: Object exposing a `history` dict that maps metric names to
            per-epoch lists (as returned by `Model.fit`). It must contain
            'loss' and 'accuracy'; 'val_loss' and 'val_accuracy' are drawn
            only when present, so a run without validation data still plots.
    """
    history = hist.history
    fig = plt.figure(figsize=(12, 8))

    # (subplot position, metric key, train legend label, validation legend label)
    panels = (
        (1, 'loss', 'train_loss', 'val_loss'),
        (2, 'accuracy', 'train_acc', 'val_acc'),
    )
    for position, metric, train_name, val_name in panels:
        ax = fig.add_subplot(1, 2, position)
        epochs = range(len(history[metric]))
        ax.plot(epochs, history[metric], 'bo--', label=train_name)

        # Validation curves exist only when fit() was given validation data.
        val_key = 'val_' + metric
        if val_key in history:
            ax.plot(epochs, history[val_key], 'ro--', label=val_name)

        ax.set_title(metric)
        ax.set_xlabel('epoch')
        ax.set_ylabel(metric)
        # Attach the legend to this axes explicitly instead of the implicit current one.
        ax.legend()

    plt.show()

plot_graph(hist)

In [None]:
# With RNN (SimpleRNN)
hidden_size = 64  # number of units in each recurrent layer


def create_RNN_model():
    """Build and compile a many-to-one SimpleRNN classifier.

    Token ids are embedded, fed through a single SimpleRNN layer that returns
    only its final state, and reduced to one sigmoid output.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    X = Input(shape=[sequence_size])
    H = Embedding(input_dim=vocab_size,output_dim=word_vec_size,input_length=sequence_size)(X)
    # Previously `SimpleRNN(cell_size)`: `cell_size` is not defined in the visible
    # notebook cells, so calling this function raised NameError. `hidden_size`,
    # declared just above, is the unit count the other recurrent models use.
    H = SimpleRNN(hidden_size)(H)
    Y = Dense(1,activation='sigmoid')(H)

    model = keras.models.Model(X,Y)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

# Train the SimpleRNN model for 10 epochs (20% validation hold-out), evaluate it
# on the test split, and plot the training curves.
rnn_model = create_RNN_model()
hist = rnn_model.fit(
    train_data,
    train_label,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)
evaluation = rnn_model.evaluate(test_data, test_label)

plot_graph(hist)

In [None]:
# With LSTM

def create_lstm_model():
    """Build and compile a many-to-one LSTM classifier.

    Token ids are embedded, fed through one LSTM layer that returns only its
    final output, and reduced to one sigmoid output.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=[sequence_size])
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(token_ids)
    last_state = LSTM(hidden_size)(embedded)
    probability = Dense(1, activation='sigmoid')(last_state)

    net = keras.models.Model(token_ids, probability)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

# Train the LSTM model for 5 epochs (20% validation hold-out), evaluate it on the
# test split, and plot the training curves.
lstm = create_lstm_model()
hist = lstm.fit(
    train_data,
    train_label,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
)
evaluation = lstm.evaluate(test_data, test_label)

plot_graph(hist)

In [None]:
# many-to-many LSTM

def create_many2many_lstm_model():
    """Build and compile an LSTM model that emits one sigmoid value per time step.

    Token ids are embedded, fed through one LSTM layer that returns its output
    at every position, and a single-unit sigmoid Dense layer is applied to each
    position through TimeDistributed.

    NOTE(review): the labels passed to fit() in this notebook appear to be one
    per message rather than one per time step; confirm that pairing them with a
    per-position output is what is intended.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=[sequence_size])
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(token_ids)
    per_step = LSTM(hidden_size, return_sequences=True)(embedded)
    per_step_prob = keras.layers.TimeDistributed(Dense(1, activation='sigmoid'))(per_step)

    net = keras.models.Model(token_ids, per_step_prob)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

# Train the many-to-many LSTM for 8 epochs (20% validation hold-out), evaluate it
# on the test split, and plot the training curves.
lstm = create_many2many_lstm_model()
hist = lstm.fit(
    train_data,
    train_label,
    batch_size=16,
    epochs=8,
    validation_split=0.2,
)
evaluation = lstm.evaluate(test_data, test_label)

plot_graph(hist)

In [None]:
# many-to-many RNN

def create_many2many_rnn_model():
    """Build and compile a SimpleRNN model that emits one sigmoid value per time step.

    Token ids are embedded, fed through one SimpleRNN layer that returns its
    output at every position, and a single-unit sigmoid Dense layer is applied
    to each position through TimeDistributed.

    NOTE(review): the labels passed to fit() in this notebook appear to be one
    per message rather than one per time step; confirm that pairing them with a
    per-position output is what is intended.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=[sequence_size])
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(token_ids)
    per_step = SimpleRNN(hidden_size, return_sequences=True)(embedded)
    per_step_prob = keras.layers.TimeDistributed(Dense(1, activation='sigmoid'))(per_step)

    net = keras.models.Model(token_ids, per_step_prob)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

# Train the many-to-many SimpleRNN for 8 epochs (20% validation hold-out),
# evaluate it on the test split, and plot the training curves.
rnn = create_many2many_rnn_model()
hist = rnn.fit(
    train_data,
    train_label,
    batch_size=16,
    epochs=8,
    validation_split=0.2,
)
evaluation = rnn.evaluate(test_data, test_label)

plot_graph(hist)

In [None]:
# stacked many-to-one LSTM

def create_stacked_simple_LSTM():
    """Build and compile a two-layer (stacked) many-to-one LSTM classifier.

    The first LSTM returns its output at every position so the second LSTM can
    consume the full sequence; the second returns only its final output, which
    is reduced to one sigmoid value.

    Returns:
        The compiled Keras model (binary cross-entropy loss, RMSprop optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=[sequence_size])
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(token_ids)
    lower = LSTM(hidden_size, return_sequences=True)(embedded)
    upper = LSTM(hidden_size, return_sequences=False)(lower)
    probability = Dense(1, activation='sigmoid')(upper)

    net = keras.models.Model(token_ids, probability)
    net.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    return net

# Train the stacked many-to-one LSTM for 10 epochs (20% validation hold-out),
# evaluate it on the test split, and plot the training curves.
lstm = create_stacked_simple_LSTM()
hist = lstm.fit(
    train_data,
    train_label,
    batch_size=16,
    epochs=10,
    validation_split=0.2,
)
evaluation = lstm.evaluate(test_data, test_label)

plot_graph(hist)

In [None]:
# stacked many-to-many LSTM

def create_stacked_many_to_many_LSTM():
    """Build and compile a two-layer LSTM model that emits one sigmoid value per time step.

    Both LSTM layers return their output at every position, and a single-unit
    sigmoid Dense layer is applied to each position through TimeDistributed.

    NOTE(review): the labels passed to fit() in this notebook appear to be one
    per message rather than one per time step; confirm that pairing them with a
    per-position output is what is intended.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=[sequence_size])
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(token_ids)
    lower = LSTM(hidden_size, return_sequences=True)(embedded)
    upper = LSTM(hidden_size, return_sequences=True)(lower)
    per_step_prob = keras.layers.TimeDistributed(Dense(1, activation='sigmoid'))(upper)

    net = keras.models.Model(token_ids, per_step_prob)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

# Train the stacked many-to-many LSTM for 10 epochs (20% validation hold-out),
# evaluate it on the test split, and plot the training curves.
lstm = create_stacked_many_to_many_LSTM()
hist = lstm.fit(
    train_data,
    train_label,
    batch_size=16,
    epochs=10,
    validation_split=0.2,
)
evaluation = lstm.evaluate(test_data, test_label)

plot_graph(hist)

In [None]:
# Bidirectional + stacked + many-to-many LSTM

from keras.layers import Bidirectional

def create_bi_stacked_model():
    """Build and compile a two-layer bidirectional LSTM model with per-time-step output.

    Each of the two LSTM layers is wrapped in Bidirectional and returns its
    output at every position; a single-unit sigmoid Dense layer is then applied
    to each position through TimeDistributed.

    NOTE(review): the labels passed to fit() in this notebook appear to be one
    per message rather than one per time step; confirm that pairing them with a
    per-position output is what is intended.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=[sequence_size])
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(token_ids)
    lower = Bidirectional(LSTM(hidden_size, return_sequences=True))(embedded)
    upper = Bidirectional(LSTM(hidden_size, return_sequences=True))(lower)
    per_step_prob = keras.layers.TimeDistributed(Dense(1, activation='sigmoid'))(upper)

    net = keras.models.Model(token_ids, per_step_prob)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

# Train the bidirectional stacked LSTM for 6 epochs (20% validation hold-out),
# evaluate it on the test split, and plot the training curves.
lstm = create_bi_stacked_model()
hist = lstm.fit(
    train_data,
    train_label,
    batch_size=16,
    epochs=6,
    validation_split=0.2,
)
evaluation = lstm.evaluate(test_data, test_label)

plot_graph(hist)