In [21]:
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

In [22]:
import os
import json
import random

In [23]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.optimizers import Adam

In [24]:
worksheet = gc.open('Carolyn SMS Data').sheet1
rows = worksheet.get_all_records()

In [7]:
dataset = pd.DataFrame.from_records(rows)[['Body', 'Class']]
dataset = dataset.dropna()
dataset = dataset[dataset['Class'] != '']
dataset = dataset[dataset['Class'] != 'skip']
dataset['Body'] = dataset['Body'].str.replace('#','0')

In [25]:
dataset.head

<bound method NDFrame.head of                                                    Body   Class  Class_encoded
0                                0 0 0 0 0 gb 0 apply 0    spam              1
1              0 0 0 0 data 0 gb 0 0 0 0 mb 0 0 myjio 0    spam              1
2                                  0 0 0 0 gb 0 0 fup 0    spam              1
3                        0 0 0 0 rail travel is insured    spam              1
4                0 0 0 data 0 gb 0 0 0 0 mb 0 0 myjio 0    spam              1
...                                                 ...     ...            ...
2282  dear 0 your passbook balance against 0 is rs c...  update              3
2283  dear customer 0 is the otp for your login to i...     otp              0
2284  0 is the one time password otp for join inaph ...     otp              0
2285  university of petroleum energy studies dehradu...  update              3
2286                                      lic agent 0 0  update              3

[1957 rows x 3 column

In [26]:
dataset.shape

(1957, 3)

In [27]:
dataset['Class'].value_counts()

spam           1138
update          345
transaction     309
otp             165
Name: Class, dtype: int64

In [28]:
encoder = LabelEncoder()
encoder.fit(dataset['Class'])
dataset['Class_encoded'] = LabelEncoder().fit_transform(dataset['Class'])

classes = encoder.classes_
print(classes)

['otp' 'spam' 'transaction' 'update']


In [29]:
tokenizer = Tokenizer(nb_words=3000, lower=True, split=' ')
tokenizer.fit_on_texts(dataset['Body'].values)

maxlen = int(np.mean([len(list(x.split(' '))) for x in dataset['Body']]))
print(maxlen)
# maxlen = 20

X = tokenizer.texts_to_sequences(dataset['Body'].values)
X = pad_sequences(X, padding='post', maxlen=maxlen)
X

23




array([[   1,    1,    1, ...,    0,    0,    0],
       [   1,    1,    1, ...,    0,    0,    0],
       [   1,    1,    1, ...,    0,    0,    0],
       ...,
       [   1,    6,    9, ...,    0,    0,    0],
       [ 490,    8, 1046, ...,    0,    0,    0],
       [ 907,  396,    1, ...,    0,    0,    0]], dtype=int32)

In [30]:
vocab_size = (max(map(max, X))) + 1
print(vocab_size)

3000


In [31]:
Y = to_categorical(np.asarray(dataset['Class_encoded']), 4)
Y

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [32]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size = 0.20, random_state = random.randint(1, 1000))

In [33]:
batch_size = 32
embed_dim = 200

In [34]:
def create_lstm_model():
  lstm_out = 196
  model = Sequential()
  model.add(Embedding(vocab_size, embed_dim, input_length = X_train.shape[1]))
  model.add(LSTM(lstm_out))
  model.add(Dense(4, activation='softmax'))
  model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
  return model

In [35]:
def create_cnn_model():
  model = Sequential()
  model.add(Embedding(vocab_size, embed_dim, input_length = maxlen))
  model.add(Conv1D(128, 5, activation='relu',padding='same'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Conv1D(256, 5, activation='relu',padding='same'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Conv1D(512, 3, activation='relu',padding='same'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Flatten())
  model.add(Dense(128, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(4, activation='sigmoid'))
  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

  return model

In [36]:
def get_text_vector(texts):
  _texts = tokenizer.texts_to_sequences(texts)
  _texts = pad_sequences(_texts, padding='post', maxlen= maxlen)
  return _texts

In [37]:
def evaluate_and_get(model_getter, epochs=20):

  # get model
  model = model_getter()
  model.summary()

  # train
  model.fit(x=X_train, y=Y_train, batch_size=batch_size, epochs=epochs)

  # evaluate
  model.evaluate(X_valid, Y_valid, verbose=2, batch_size=batch_size)
  
  return model

In [38]:
lstm_model = evaluate_and_get(create_lstm_model, epochs=20)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 23, 200)           600000    
_________________________________________________________________
lstm (LSTM)                  (None, 196)               311248    
_________________________________________________________________
dense (Dense)                (None, 4)                 788       
Total params: 912,036
Trainable params: 912,036
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
13/13 - 0s - loss: 0.4898 - accuracy: 0.8648


In [39]:
sample = 'your google verification code is 0'
_text = get_text_vector([sample])

predictions = model.predict(_text)
print(sample, classes[np.argmax(predictions)])

NameError: ignored

In [41]:
cnn_model = evaluate_and_get(create_cnn_model, epochs=20)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 23, 200)           600000    
_________________________________________________________________
conv1d (Conv1D)              (None, 23, 128)           128128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 11, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 11, 256)           164096    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 5, 256)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 5, 512)            393728    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2, 512)           

In [44]:
sample = 'your google verification code is 0'
_text = get_text_vector([sample])

predictions = cnn_model.predict(_text)
print(sample, classes[np.argmax(predictions)])

your google verification code is 0 otp


Convert CNN model to TFLite, LSTMs won't work

In [45]:
converter = tf.lite.TFLiteConverter.from_keras_model(cnn_model)
tflite_model = converter.convert()

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /tmp/tmpazzp6c_l/assets


In [46]:
with open('model.tflite', 'wb') as f:
  f.write(tflite_model)

Evaluate TFLite model :D

In [48]:
interpreter = tf.lite.Interpreter(model_path="model.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

input_shape = input_details[0]['shape']

sample = 'your google verification code is 0'
_text = get_text_vector([sample])

interpreter.set_tensor(input_details[0]['index'], _text.astype(np.float32))

interpreter.invoke()
 
output_data = interpreter.get_tensor(output_details[0]['index'])
classes[np.argmax(output_data)]

'otp'

In [55]:
meta = {
    'classes': list(classes),
    'index': tokenizer.word_index
}

In [57]:
with open('meta.json', 'w') as f:
  f.write(json.dumps(meta))