In [22]:
import pathlib
import pandas as pd

# Project root: the parent of the current working directory.
BASE_DIR = pathlib.Path().resolve().parent

# Dataset folders used by this notebook.
DATASET_DIR = BASE_DIR / "datasets"
ZIPS_DIR = DATASET_DIR / "zips"
EXPORT_DIR = DATASET_DIR / "exports"

# Create the folders up front (no-op when they already exist).
ZIPS_DIR.mkdir(parents=True, exist_ok=True)
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Input CSV read by this notebook.
SPAM_DATASET_PATH = EXPORT_DIR / "spam-dataset.csv"

# Output files written by the final cells.
METADATA_EXPORT_PATH = EXPORT_DIR / "spam-metadata.pkl"
TOKENIZER_EXPORT_PATH = EXPORT_DIR / "spam-tokenizer.pkl"


In [2]:
# Load the spam dataset CSV and preview its first rows.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index that
# was written into the CSV — consider index_col=0 here, or index=False where the
# file is exported. Confirm against the notebook that produces the CSV.
df = pd.read_csv(SPAM_DATASET_PATH)
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [3]:
# Pull the label and text columns out of the frame as plain Python lists.
labels, texts = (df[column].tolist() for column in ("label", "text"))

In [4]:
# Class name -> integer class id.
label_legend = dict(ham=0, spam=1)

# Reverse lookup; keys are the class ids rendered as strings ("0", "1").
label_legend_inverted = {
    str(class_id): class_name for class_name, class_id in label_legend.items()
}

In [5]:
# Encode every string label as its integer class id; an unknown label raises KeyError.
labels_as_int = [label_legend[label_name] for label_name in labels]

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
# Vocabulary cap handed to the Tokenizer as num_words; also exported as "max_words".
MAX_NUM_WORDS = 280

In [8]:
# Tokenizer limited by num_words=MAX_NUM_WORDS (see the Keras Tokenizer docs for
# the exact cut-off semantics of num_words).
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

# Fit on every text, then encode each text as a sequence of integers.
# NOTE(review): the tokenizer is fit on all texts before the train/test split
# below, so test-set texts also contribute to it.
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [9]:
# The tokenizer's word_index (presumably the word -> integer mapping).
# Not referenced again in the cells visible here.
word_index = tokenizer.word_index

In [10]:
# Pad the sequences so that they all have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Common sequence length passed to pad_sequences as maxlen; also exported as "max_seq_length".
MAX_SEQ_LENGTH= 300

In [12]:
# Bring every token sequence to a common length (maxlen=MAX_SEQ_LENGTH).
# In the displayed output the padding zeros sit at the start of each row.
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
X

array([[  0,   0,   0, ...,  77,  68, 187],
       [  0,   0,   0, ...,   0,  64,   8],
       [  0,   0,   0, ...,   2, 110, 104],
       ...,
       [  0,   0,   0, ...,  15,   6, 137],
       [  0,   0,   0, ..., 180,  50,  50],
       [  0,   0,   0, ..., 190, 241,  19]], dtype=int32)

In [13]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [14]:
# One-hot encode the integer class ids.
# num_classes is pinned to the size of the label legend so that the width of y
# does not depend on which classes happen to be present in the loaded data
# (without it, to_categorical infers the width from the largest id it sees).
labels_as_int_array = np.asarray(labels_as_int)
y = to_categorical(labels_as_int_array, num_classes=len(label_legend))
y

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

### Split and export

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [19]:
import pickle

# Everything written to the metadata pickle: the split arrays, the two
# preprocessing constants, and both label look-up tables.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_seq_length=MAX_SEQ_LENGTH,
    legend=label_legend,
    legend_inverted=label_legend_inverted,
)

# JSON serialisation of the fitted tokenizer, written to disk in a later cell.
tokenizer_json = tokenizer.to_json()

In [23]:
# Serialise the training payload to the metadata export file.
with METADATA_EXPORT_PATH.open("wb") as metadata_file:
    pickle.dump(training_data, metadata_file)

In [24]:
# Write the tokenizer's JSON to disk. The encoding is given explicitly so the
# file does not depend on the platform's default locale encoding.
# NOTE(review): the target file name ends in ".pkl" but the content is JSON text,
# not a pickle — whatever reads it must load it as text.
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json, encoding="utf-8")

1090335