In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import keras
from keras.layers import Input,Embedding,Conv1D,Dropout,GlobalMaxPooling1D,Dense
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [None]:
# Read the SMS dataset into a DataFrame and preview its first rows.
csv_path = "../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Encode the text labels as integers: "ham" -> 0, "spam" -> 1.
# An exact-match mapping is used instead of chained substring `str.replace`
# calls. The integer keys keep this cell re-runnable (`.str` raises on a column
# that has already been converted to integers), and any unexpected label maps
# to NaN so the integer cast below fails loudly instead of mis-encoding it.
label_codes = {"ham": 0, "spam": 1, 0: 0, 1: 1}
data.Category = data.Category.map(label_codes).astype("int")
data.dtypes

In [None]:
# Features are the raw message texts; targets are the integer-encoded categories.
text = data.Message
label = data.Category

# Hold out 20% of the messages for testing with a fixed seed for reproducibility.
# Stratifying on the label keeps the class ratio the same in both splits, so a
# rarer class cannot end up under-represented in the held-out set by chance.
train_data, test_data, train_label, test_label = train_test_split(
    text,
    label,
    test_size=0.2,
    random_state=42,
    stratify=label,
)

print("train data shape >>",train_data.shape)
print("train label shape >>",train_label.shape)
print("test data shape >>",test_data.shape)
print("test label shape >>",test_label.shape)

In [None]:
# Bar chart of how many training samples carry each label value.
sns.set_style("whitegrid")
train_axes = sns.countplot(x=train_label)
train_axes.set_title("Distribution of label in train label")
plt.show()

In [None]:
# Bar chart of how many test samples carry each label value.
test_axes = sns.countplot(x=test_label)
test_axes.set_title("Distribution of label in test label")
plt.show()

In [None]:
# Fit an uncapped tokenizer on the training texts to inspect the full vocabulary.
tok = Tokenizer()
tok.fit_on_texts(train_data)
print("current word_size >> ",len(tok.word_index))

# Per-word occurrence counts collected by the tokenizer.
occurrences = list(tok.word_counts.values())
# "Rare" words are those seen fewer than two times, i.e. exactly once.
rare_occurrences = [count for count in occurrences if count < 2]

total_cnt = len(tok.word_index)
total_freq = sum(occurrences)
rare_cnt = len(rare_occurrences)
rare_freq = sum(rare_occurrences)

print(f"Number of words used only one >> {rare_cnt}, Percentage >> {rare_cnt/total_cnt*100}%")
print(f"And those rare words accounts for {rare_freq/total_freq*100}% of total data")

In [None]:
# `word_size` caps the vocabulary handed to the tokenizer; `vocab_size` (one
# more than that) is the embedding table size used when the model is built.
word_size = 4000
vocab_size = 1 + word_size

# Rebuild the tokenizer with the cap and fit it on the training texts only.
tok = Tokenizer(num_words=word_size)
tok.fit_on_texts(train_data)

# NOTE: `train_data` is rebound from raw texts to integer sequences here, so
# re-running this cell without re-running the split would fit the tokenizer on
# already-encoded sequences.
encoded_train = tok.texts_to_sequences(train_data)
train_data = encoded_train

In [None]:
# Number of tokens in every encoded training message.
lens = list(map(len, train_data))

# Summary statistics of the sequence lengths.
for caption, reducer in (("max >> ", np.max), ("mean >> ", np.mean), ("median >> ", np.median)):
    print(caption, reducer(lens))

# Histogram of the sequence lengths.
plt.hist(lens, bins=50)
plt.show()

In [None]:
# Fixed length that every encoded message is padded or truncated to.
sequence_size = 70
train_data = pad_sequences(sequences=train_data, maxlen=sequence_size)

In [None]:
# Encode the test texts with the tokenizer fitted on the training data, then
# pad them to the same fixed length as the training sequences.
test_data = pad_sequences(tok.texts_to_sequences(test_data), maxlen=sequence_size)

# Report the final array/series shapes going into the model.
for caption, shaped in (
    ("train data shape >>", train_data),
    ("train label shape >>", train_label),
    ("test data shape >>", test_data),
    ("test label shape >>", test_label),
):
    print(caption, shaped.shape)

In [None]:
# Dimensionality of each learned word embedding vector.
word_vec_size = 256

def make_conv1D():
    """Build and compile the 1D-convolutional binary text classifier.

    Architecture: Embedding -> Dropout(0.3) -> Conv1D(256 filters, kernel 3,
    ReLU) -> GlobalMaxPooling1D -> Dropout(0.3) -> Dense(128, ReLU)
    -> Dense(1, sigmoid).

    Reads the module-level `sequence_size`, `vocab_size` and `word_vec_size`
    at call time, so those must be defined before this function is called.

    Returns:
        A compiled Keras model using binary cross-entropy loss, the Adam
        optimizer and an accuracy metric.
    """
    # The input length follows `sequence_size` (the padding length) instead of
    # a hard-coded 70, so the input layer and the embedding cannot drift apart
    # when the padding length is changed.
    X = Input(shape=(sequence_size,))

    # NOTE(review): `input_length` may be deprecated in newer Keras releases —
    # confirm against the installed version before relying on it.
    H = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(X)
    H = Dropout(0.3)(H)
    H = Conv1D(256, 3, activation='relu')(H)
    # Collapse the time axis by keeping each filter's maximum activation.
    H = GlobalMaxPooling1D()(H)
    H = Dropout(0.3)(H)

    H = Dense(128, activation='relu')(H)
    # Single sigmoid unit: output is the predicted probability of label 1.
    Y = Dense(1, activation='sigmoid')(H)

    model = keras.models.Model(X, Y)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Stop training once validation loss has not improved for 4 consecutive epochs.
# `restore_best_weights=True` rolls the model back to the epoch with the lowest
# validation loss; without it the model keeps the weights from the last epoch
# run, i.e. after `patience` epochs of no improvement.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    verbose=1,
    restore_best_weights=True,
)

In [None]:
# Build a fresh model, train it with 10% of the training data held out for
# validation (early stopping via `es`), then score it on the test set.
conv1 = make_conv1D()
hist = conv1.fit(
    x=train_data,
    y=train_label,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
    callbacks=[es],
)
ev = conv1.evaluate(x=test_data, y=test_label)