In [28]:
import pathlib
import pandas as pd
import random

# Directory layout: everything hangs off the parent of the current working
# directory, so the notebook is expected to be launched from a sub-folder of
# the project.
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR.joinpath("datasets")
EXPORT_DIR = DATASET_DIR.joinpath("exports")
# Create the export folder (and any missing parents) up front; no-op if present.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Input: the combined spam dataset read by this notebook.
SPAM_DATASET_PATH = EXPORT_DIR.joinpath("spam-dataset.csv")

# Outputs: pickled training data/metadata and the fitted tokenizer as JSON.
# NOTE(review): the extension is "pk1" (digit one), possibly a typo for "pkl" —
# confirm with whatever loads this file before renaming.
METADATA_EXPORT_PATH = EXPORT_DIR.joinpath("spam-metadata.pk1")
TOKENIZER_EXPORT_PATH = EXPORT_DIR.joinpath("spam-tokenizer.json")

In [3]:
# Load the exported spam dataset into a DataFrame.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an
# index that was written out with the CSV; consider index_col=0 — confirm that
# nothing downstream relies on that column first.
df = pd.read_csv(SPAM_DATASET_PATH)
# Preview the first rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [4]:
# Pull the label and text columns out of the frame as plain Python lists
# (same row order as the DataFrame).
labels, texts = (df[column].tolist() for column in ("label", "text"))

In [5]:
# Spot-check one arbitrary row (index 120): show its label next to its text.
labels[120],texts[120]

('spam',
 'PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires')

In [6]:
# Class name -> integer id.
label_legen = dict(ham=0, spam=1)
# Reverse lookup, keyed by the id rendered as a string ('0', '1') rather than
# as an int.
label_legen_inverted = {str(class_id): name for name, class_id in label_legen.items()}
label_legen_inverted

{'0': 'ham', '1': 'spam'}

In [7]:
# Encode every string label as its integer class id via the legend; an unknown
# label raises KeyError.
labels_as_int = [label_legen[label] for label in labels]

In [8]:
# Sanity check: pick one random row and verify that the derived lists still
# line up with the DataFrame.
# random.randrange(n) yields 0 <= idx < n. The previous random.randint(0, n)
# is inclusive on both ends, so it could return n itself and raise IndexError.
random_idx = random.randrange(len(labels))

# Raw text and raw label must match the source row.
assert texts[random_idx] == df.iloc[random_idx].text

assert labels[random_idx] == df.iloc[random_idx].label

# Round trip: int-encoded label -> string key -> original label name.
assert label_legen_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx].label

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

2021-10-24 08:01:17.185250: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/oracle/instantclient_21_3:
2021-10-24 08:01:17.185293: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [10]:
# Vocabulary cap: passed to Tokenizer(num_words=...) and exported as
# "max_words" in the metadata dict.
MAX_NUM_WORDS = 280

In [11]:
# Build the tokenizer with the vocabulary cap and fit it on the whole corpus.
# NOTE(review): the tokenizer is fitted on all texts, including rows that later
# land in the test split — confirm this is intended.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
# Convert each text to a list of integer token ids (one list per text).
sequences = tokenizer.texts_to_sequences(texts)
# Displays every sequence; the output is long.
sequences

[[55, 66, 10, 123, 143, 204, 169, 77, 68, 187],
 [64, 8],
 [59, 10, 25, 4, 2, 211, 95, 2, 2, 110, 104],
 [8, 182, 21, 8, 181, 185, 67, 182],
 [1, 121, 124, 80, 2, 80, 263, 118],
 [94,
  77,
  175,
  136,
  129,
  31,
  6,
  44,
  101,
  38,
  125,
  3,
  41,
  14,
  13,
  92,
  64,
  2,
  93,
  2],
 [208, 7, 9, 32, 38, 2, 40, 12, 113, 12, 38],
 [76, 212, 18, 120, 136, 76, 18, 14, 49, 2, 18, 276],
 [76, 4, 3, 20, 136, 2, 199, 2, 174, 26, 174, 66],
 [166, 18, 141, 37, 114, 8, 111, 2, 2, 5, 40, 14, 59, 26, 5, 141, 249, 59, 16],
 [42, 254, 33, 91, 245, 6, 1, 121, 79, 2, 89, 11, 135, 267, 96],
 [2, 211, 209, 47, 2, 196, 110, 6, 93, 2, 255, 78, 134, 56],
 [257,
  3,
  20,
  216,
  4,
  90,
  159,
  59,
  10,
  88,
  196,
  199,
  110,
  5,
  174,
  2,
  44,
  104,
  181,
  82],
 [267,
  136,
  14,
  5,
  170,
  2,
  195,
  3,
  14,
  11,
  1,
  1,
  107,
  18,
  177,
  14,
  6,
  35,
  7,
  3,
  20,
  136,
  6,
  4,
  34,
  49],
 [1, 20, 4, 16, 40, 35],
 [2, 18, 5, 10, 5, 246, 110, 186, 37, 

In [12]:
# Keep a handle on the fitted tokenizer's word_index attribute
# (presumably the word -> integer id mapping; see the Keras Tokenizer docs).
# It is not displayed here.
word_index = tokenizer.word_index

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
# Fixed sequence length: passed to pad_sequences(maxlen=...) and exported as
# "max_seq_length" in the metadata dict.
MAX_SEQ_LENGTH = 300

In [15]:
# Bring every token sequence to MAX_SEQ_LENGTH entries, producing a 2-D integer
# array. The X preview in this notebook shows short sequences filled with
# zeros at the front.
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)

In [16]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [17]:
# Turn the list of integer class ids into a NumPy array; this is the input to
# to_categorical in the next step.
labels_as_int_arrray = np.asanyarray(labels_as_int)
labels_as_int_arrray

array([0, 0, 1, ..., 1, 0, 0])

In [18]:
# One-hot encode the integer labels. In the displayed result 0 (ham) becomes
# [1., 0.] and 1 (spam) becomes [0., 1.], with dtype float32.
y = to_categorical(labels_as_int_arrray)

In [19]:
# Display the one-hot label matrix for a visual check.
y

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [20]:
# Show the padded inputs and the one-hot targets together before splitting.
X, y

(array([[  0,   0,   0, ...,  77,  68, 187],
        [  0,   0,   0, ...,   0,  64,   8],
        [  0,   0,   0, ...,   2, 110, 104],
        ...,
        [  0,   0,   0, ..., 172, 117,  19],
        [  0,   0,   0, ..., 145, 202,  19],
        [  0,   0,   0, ..., 229,  37,  19]], dtype=int32),
 array([[1., 0.],
        [1., 0.],
        [0., 1.],
        ...,
        [0., 1.],
        [1., 0.],
        [1., 0.]], dtype=float32))

In [47]:
!pip install scikit-learn



In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out a test set (test_size=0.33) with a fixed random_state so the split
# is the same on every run.
# NOTE(review): the split is not stratified; if the ham/spam classes are
# imbalanced, consider stratify=y — confirm against the class distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [24]:
import pickle

In [29]:
# Bundle everything a training notebook needs: the split arrays, the
# preprocessing constants, and both directions of the label legend.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_seq_length=MAX_SEQ_LENGTH,
    label_legend=label_legen,
    label_legend_inverted=label_legen_inverted,
)

# Serialise the fitted tokenizer to JSON and write it next to the metadata.
# write_text returns the number of characters written, which the cell displays.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

1090335

In [30]:
# Pickle the training bundle to the metadata export path; the context manager
# closes the file even if pickling fails.
with METADATA_EXPORT_PATH.open("wb") as metadata_file:
    pickle.dump(training_data, metadata_file)