# LSTM and Glove Embeddings

Works on Colab

In [55]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pickle

import re
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input, Dropout, Bidirectional, Flatten, Conv1D, GlobalMaxPool1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

from gensim.models.word2vec import Word2Vec
import gensim.downloader

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Just using text

In [56]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

train = df_train.drop(columns=['id','keyword','location'])
test =  df_test.drop(columns=['id','keyword','location'])


In [57]:
def tweet_cleaner(tweet , remove_usernames = True):
  '''
  made for cleaning the tweet

  input: tweet: an uncleaned tweet with a 'string datatype
         remove_usernames: bool if usernames should be included or not. even if included, @ symbol is removed

  output: cleaned tweet with all stopwords removed

  '''
  #first remove usernames
  if remove_usernames:
    tweet = re.sub('@[^\s]+','',tweet)

  # remove urls
  tweet = re.sub('http[^\s]+','',tweet)
  tweet = re.sub('https[^\s]+','',tweet)
  tweet = re.sub('www[^\s]+','',tweet)

  # just capture words
  pattern = r'\b[a-zA-Z]+\b'

  # including new stopwords unique to tweets. and adding them to nltk
  stops = nltk.corpus.stopwords.words('english')
  new_stop_words = ["ha", "wa", "http", "s", "https", "com", "'s", "' s", "'ll", "' ll", "' d", "'d", "'re", "' re", "co", "amp", "url"]
  stops.extend(new_stop_words)

  # Gets list of words from re.findall() and filters out stop words and 1 letter words
  list_of_words = [x.lower() for x in re.findall(pattern, tweet) if (x not in stops) and (len(x)>1)]

  return ' '.join(list_of_words)



In [58]:
train['text_cleaned'] = train['text'].apply(tweet_cleaner)
test['text_cleaned'] = test['text'].apply(tweet_cleaner)
train.sample(5)

Unnamed: 0,text,target,text_cleaned
3973,@crabbycale OH MY GOD THE MEMORIES ARE FLOODIN...,0,oh my god the memories are flooding back
620,@O_Magazine satan's daughter shadow warrior in...,1,satan daughter shadow warrior women aka transg...
1257,I hope the only time I end up on TV is when I'...,1,hope time end tv arrested lighting buildings fire
6174,Yay for sirens,0,yay sirens
4216,DLH issues Hazardous Weather Outlook (HWO) htt...,1,dlh issues hazardous weather outlook hwo


## Using [this](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html) to map text to vectors


In [59]:


# i want to make the tokenizer and embedding matrix have as much information.
# so I'm using all data availabke to do that.
tokenizer = Tokenizer(num_words=10_000)
tokenizer.fit_on_texts(train['text_cleaned'])
sequences =tokenizer.texts_to_sequences(train['text_cleaned'])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=25) # determined this by finding the max len of a sequence
labels = np.asarray(train['target'])

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


Found 13809 unique tokens.
Shape of data tensor: (7613, 25)
Shape of label tensor: (7613,)


In [61]:
len(word_index.keys())

13809

In [62]:

max([len(s) for s in sequences])


23

# Preparing the embedding layer

text file with glove embeddings gotten from [https://nlp.stanford.edu/projects/glove/](https://nlp.stanford.edu/projects/glove/)

In [63]:
embeddings_index = {} # creating a dictionary

glove_path = 'glove.twitter.27B.25d.txt'


with open(glove_path, encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


## ok now there we have an embeddings dictionary where the keys are the 1,193,514 unique words and the values are the 25 dimension vectors that each word is represented by

At this point we can leverage our embedding_index dictionary and our word_index to compute our embedding matrix:


In [64]:
num_words_in_glove = 0
embedding_dim= 25
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        num_words_in_glove +=1
print(f'Total number of words from Glove: {num_words_in_glove}')

Total number of words from Glove: 12326


In [65]:
embedding_matrix.shape

(13810, 25)

embedding_matrix is now filled with a mapping of words(tokens) from our corpus into the vectors from GloVe

# Keras LSTM

In [66]:
X_train, X_test, y_train, y_test = train_test_split(
    data, # padded sequences where the words are tokenized with tokenizer.word_index
    labels, # np.asarray(train['target'])
    random_state = 214,
    stratify=labels
    )

In [97]:
data.shape

(7613, 25)

In [34]:
len(word_index) + 1

13810

In [42]:
from sklearn.metrics.pairwise import kernel_metrics
model = Sequential()
model.add(Embedding(len(word_index) + 1, # input dim = max_words
                    embedding_dim, # output dim = dim of glove vectors
                    input_length=25, # input_length=max_sequence_length
                    weights=[embedding_matrix],
                    trainable=False)
)
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(128, return_sequences = True)))
model.add(Conv1D(filters = 128,kernel_size=3, activation='relu'))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 25, 25)            345250    
                                                                 
 dropout (Dropout)           (None, 25, 25)            0         
                                                                 
 bidirectional_4 (Bidirecti  (None, 25, 256)           157696    
 onal)                                                           
                                                                 
 conv1d (Conv1D)             (None, 23, 128)           98432     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 flatten_2 (Flatten)         (None, 128)              

In [43]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train,
          y_train,
          validation_data = (X_test, y_test),
          epochs=20,
          batch_size=32)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## ok, a 81.36% test accuracy and 79.45% train accuracy is good enough for streamlit app. I'll pickle this model, and tokenizer. and see if I can create the streamlit app

#### first, testing to see if model.predict works as expected for just 1 tweet

a function to clean tweets and put them in a format for the model to predict with

In [138]:
def tweet_to_input(tweet,tokenizer=tokenizer):
  '''
  Function that transforms asingle tweet(string) into an input for the model that was trained with a particular tokenizer
  input: tweet = single tweet that is a string
         tokenizer = tensorflow tokenizer used to train the model that we are getting predicitons from

  output: input array for model of shape (,max_padded_sequence_length)  aka (1,25) for this particular model

  requires:
            - tweet_cleaner() custom function
            - tensorflow.keras.preprocessing.Tokenizer object that was used in model training
            - pad_sequences() function from tensorflow.keras.preprocessing.sequence

  '''

  cleaned_tweet = list(map(tweet_cleaner,[tweet]))

  sequence = tokenizer.texts_to_sequences(cleaned_tweet)

  padded_array = pad_sequences(sequence, maxlen=25)

  return padded_array



In [142]:
inp = tweet_to_input("Israel-Hamas war rages as Palestinian death toll rises in Gaza: Live updates" ,tokenizer=tokenizer)

model.predict(inp)



array([[0.9823281]], dtype=float32)

In [143]:
with open('arjun_model_2.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('tokenizer_arjun_v1.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)
