<a href="https://colab.research.google.com/github/saakethk/machine-learning-research/blob/main/LSTMforNLPResearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Goal:** Implement LSTM for text classification

**Dataset:** https://www.kaggle.com/datasets/jackksoncsie/spam-email-dataset
- This dataset has two columns: text and spam
  - text: unprocessed text
  - spam: 0 is not spam and 1 is spam (unbalanced - more spam than not)

In [85]:
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split

""" Make getting data easier """

def get_dataset(name: str, data_path: str, delimiter: str):
    # Get data, load into pandas, and return dataframe
    path = kagglehub.dataset_download(name)
    full_path = f"{path}/{data_path}"
    return pd.read_csv(full_path, on_bad_lines='skip', delimiter=delimiter) # The on bad lines just prevents reading corrupted vals

def get_dataset_partitioned(name: str, data_path: str, delimiter: str, x_column: str, y_column: str, test_ratio: float, random_state: int):
  # Gets data, splits data
  data = get_dataset(
      name=name,
      data_path=data_path,
      delimiter=delimiter
  )
  data.dropna(subset=[x_column, y_column], inplace=True)
  x_data = data[x_column]
  y_data = data[y_column]
  x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_ratio, random_state=random_state)
  return (x_train, x_test, y_train, y_test)

In [86]:
import numpy as np

""" Gets partitioned data """
x_train, x_test, y_train, y_test = get_dataset_partitioned(
    name="jackksoncsie/spam-email-dataset",
    data_path="emails.csv",
    delimiter=",",
    x_column="text",
    y_column="spam",
    test_ratio=0.3,
    random_state=67
)
print(x_train)
print(y_train)

Using Colab cache for faster access to the 'spam-email-dataset' dataset.
1245    Subject: peace tree designs : creating extraor...
1972    Subject: re : thursday night ' s dinner ( and ...
4058    Subject: re : marketpoint license agreement  d...
997     Subject: fantastic investors info  maisonette ...
1009    Subject: localized software , all languages av...
                              ...                        
5001    Subject: worth a careful reading  best regards...
2055    Subject: proposed bonuses  greg ,  these are p...
1738    Subject: re : check  julie ,  yes , this is ho...
4917    Subject: fwd : mark - to - market  return - pa...
2883    Subject: prob of default for e rating 7 as of ...
Name: text, Length: 4009, dtype: object
1245    1
1972    0
4058    0
997     1
1009    1
       ..
5001    0
2055    0
1738    0
4917    0
2883    0
Name: spam, Length: 4009, dtype: int64


**Embeddings:** Generate fasttext embeddings for the collection of spam emails and the words they contain.

In [87]:
!pip install gensim



In [88]:
from gensim.models import FastText
from gensim.test.utils import common_texts

""" Generate word embeddings """
corpus = [row.split() for row in x_train]
VECTOR_SIZE = 300
# Parameters: https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html
embed_model = FastText(sentences=corpus, vector_size=VECTOR_SIZE, window=5, min_count=1, workers=4, sg=1)

In [89]:
""" Test word embeddings """
word_embedding = embed_model.wv['computer']
print(embed_model.wv.most_similar("computer"))

[('compute', 0.9492530822753906), ('yourcomputer', 0.9334108829498291), ('computerworld', 0.9123769998550415), ('computers', 0.9047778844833374), ('computerized', 0.8694641590118408), ('komputera', 0.8630588054656982), ('computed', 0.8447462320327759), ('computable', 0.8189070820808411), ('compusa', 0.815635085105896), ('compiete', 0.7814040184020996)]


In [90]:
from tqdm import tqdm

""" Create embedding matrix """
word_index = embed_model.wv.key_to_index # Mapping from word to number
embedding_matrix = np.zeros((len(word_index) + 1, VECTOR_SIZE))
for word, i in tqdm(word_index.items()):
    embedding_matrix[i] = embed_model.wv[word]
print(embedding_matrix)

100%|██████████| 31655/31655 [00:00<00:00, 130834.47it/s]

[[ 0.02624542  0.09098431  0.09913829 ... -0.07776386  0.0741337
   0.10507029]
 [-0.07374353  0.09862339 -0.04311701 ... -0.03226464  0.00707446
   0.01338119]
 [-0.10507119  0.05690199 -0.00561032 ... -0.11492075  0.20002942
   0.03387672]
 ...
 [ 0.00631998 -0.01319264  0.19259138 ... -0.14614917  0.013761
   0.04925849]
 [ 0.09942157 -0.08933285  0.1657715  ... -0.02236006 -0.12663792
   0.04202405]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]





In [107]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

""" Apply embedding to training and testing set """
MAX_LEN = 1500
x_train_vect = [embed_model.wv.get_sentence_vector(row) for row in x_train]
x_train_pad = pad_sequences(x_train_vect, maxlen=MAX_LEN, padding='post', dtype="float32")
x_test_vect = [embed_model.wv.get_sentence_vector(row) for row in x_test]
x_test_pad = pad_sequences(x_test_vect, maxlen=MAX_LEN, padding='post', dtype="float32")
# print(x_train_vect)
# print(x_train_pad)

In [104]:
print((x_train_pad[0] > 0).tolist().count(False))

1500


**Model:** Defines and trains the model for evaluation

In [93]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense, Activation, Dropout

""" Build model """
# A simple LSTM with fasttext embeddings and one dense layer
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                  VECTOR_SIZE,
                  weights=[embedding_matrix],
                  trainable=False))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

In [108]:
""" Train model """
model.fit(x_train_pad, y_train, batch_size=32)

[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 3s/step - accuracy: 0.7541 - loss: 0.5575


<keras.src.callbacks.history.History at 0x7814426aa600>

In [109]:
from sklearn import preprocessing, decomposition, model_selection, metrics

""" Defines evalutation metric function """
def roc_auc(predictions,target):
    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [110]:
""" Evaluate model """
predictions = model.predict(x_test_pad)
print(predictions)
print(len(predictions))
roc_auc(predictions, y_test)

[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 706ms/step
[[0.17880285]
 [0.17880285]
 [0.17880285]
 ...
 [0.17880283]
 [0.17880283]
 [0.17880283]]
1719


np.float64(0.4922788605697151)