## 4. Solution 1: Dynamic Tagging

Text Generation Using LSTM

https://www.kaggle.com/code/shivamb/beginners-guide-to-text-generation-using-lstms/notebook

https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [1]:
# Mount Google Drive at /content/drive (Colab only); the data, tokenizer and
# saved-model paths used in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
# import keras.utils as ku 

In [3]:
import pickle
# Load the pre-cleaned tweets object that was pickled to Google Drive.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/twitter_cleaned.pkl"
with open(fp,"rb") as f:
    twitter_cleaned = pickle.load(file=f)

# Preview the first rows; the `tweets_cleaned` column is the training corpus.
twitter_cleaned.head()

Unnamed: 0,tweets_cleaned,neutral,hate
0,woman complain cleaning house man trash,1,0
1,boy dats cold tyga dwn bad cuffin dat hoe st...,0,1
2,dawg fuck bitch start cry confused shit,0,1
3,look like tranny,0,1
4,shit hear true faker bitch told ya,0,1


In [4]:
# Shared tokenizer: fitted here, then reused for text generation and pickling.
tokenizer = Tokenizer()


def get_sequence_of_tokens(corpus):
    """Fit the module-level tokenizer on ``corpus`` and build n-gram prefixes.

    Every text is converted to its list of token ids, and each prefix of that
    list containing at least two tokens becomes one training sequence (so a
    text with k tokens yields k - 1 sequences).

    Args:
        corpus: iterable of text strings; it is traversed twice (once to fit
            the tokenizer, once to encode).

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is the list of prefix sequences and ``total_words`` is the size of
        the tokenizer's ``word_index`` plus one.
    """
    tokenizer.fit_on_texts(corpus)
    # word_index holds one entry per word; one extra slot is added on top.
    vocab_size = len(tokenizer.word_index) + 1

    encoded_texts = (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    prefix_sequences = [
        encoded[:end]
        for encoded in encoded_texts
        for end in range(2, len(encoded) + 1)
    ]
    return prefix_sequences, vocab_size

In [5]:
# Fit the tokenizer on the cleaned tweets and build the n-gram training sequences.
inp_sequences, total_words = get_sequence_of_tokens(twitter_cleaned.tweets_cleaned)
# Peek at the first ten sequences.
inp_sequences[:10]

[[202, 825],
 [202, 825, 2981],
 [202, 825, 2981, 141],
 [202, 825, 2981, 141, 18],
 [202, 825, 2981, 141, 18, 11],
 [95, 969],
 [95, 969, 366],
 [95, 969, 366, 1276],
 [95, 969, 366, 1276, 6327],
 [95, 969, 366, 1276, 6327, 17]]

In [6]:
total_words

20249

In [7]:
import numpy as np
# !pip install --upgrade tensorflow
import keras.utils.np_utils as ku
def generate_padded_sequences(input_sequences):
    """Left-pad the sequences to equal length and split them into X / y.

    Args:
        input_sequences: list of token-id lists of varying length.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: ``predictors`` is
        every column of the padded matrix except the last, ``label`` is the
        last column one-hot encoded over the module-level ``total_words``
        classes, and ``max_sequence_len`` is the longest input length.
    """
    longest = max(len(sequence) for sequence in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=longest, padding='pre')
    )

    # The final token of each row is the prediction target; the rest is context.
    context_tokens = padded[:, :-1]
    next_token = padded[:, -1]
    one_hot_target = ku.to_categorical(next_token, num_classes=total_words)
    return context_tokens, one_hot_target, longest

In [8]:
# Pad the n-gram sequences and split them into model inputs and one-hot next-word labels.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [9]:
# Persist the fitted tokenizer (not the training data) to Google Drive.
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Word Embedding Vectors/tokenizer_LSTM.pkl"
with open(fp,mode="wb") as f:
    pickle.dump(obj=(tokenizer),
                file=f)

In [10]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Architecture: Embedding (10 dimensions) -> LSTM (100 units) ->
    Dropout (0.1) -> Dense softmax over ``total_words`` classes, compiled
    with categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: length of the padded sequences, label included.
        total_words: vocabulary size; used as both the embedding input
            dimension and the number of output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Inputs are one token shorter than the padded sequences.
    context_len = max_sequence_len - 1

    network = Sequential([
        Embedding(total_words, 10, input_length=context_len),
        LSTM(100),
        Dropout(0.1),
        Dense(total_words, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

# Instantiate the network for this corpus and print its layer/parameter summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 10)            202490    
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 20249)             2045149   
                                                                 
Total params: 2,292,039
Trainable params: 2,292,039
Non-trainable params: 0
_________________________________________________________________


Possible improvement: train and test the model on a larger, more varied dataset.

In [12]:
### run smaller epochs for faster results - 100 epochs takes almost 4 hours

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Explicitly ignore FutureWarning too (already covered by the blanket filter).
warnings.simplefilter(action='ignore', category=FutureWarning)
# Train for 10 epochs on the padded predictors and one-hot labels.
model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f653a0a1190>

In [13]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    On every step the current text is tokenized with the module-level
    ``tokenizer``, left-padded (or truncated) to ``max_sequence_len - 1``
    tokens, and passed to ``model``; the highest-probability class is mapped
    back to its word and appended to the text.

    Args:
        seed_text: text to continue.
        next_words: number of prediction steps to run.
        model: object whose ``predict`` returns one row of class scores per
            input row.
        max_sequence_len: padded sequence length used at training time
            (label included), so the model input is one token shorter.

    Returns:
        The extended text, title-cased with ``str.title()``. Note that
        ``str.title()`` also capitalizes the letter following an apostrophe
        (e.g. "don't" becomes "Don'T").
    """
    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_ids], maxlen=max_sequence_len - 1, padding='pre')
        scores = model.predict(padded, verbose=0)
        # A single row goes in, so take the scalar class id rather than
        # comparing against a one-element array.
        predicted_index = int(np.argmax(scores, axis=1)[0])

        # index_word is the tokenizer's own inverse of word_index, which
        # avoids scanning the whole vocabulary for every generated word.
        output_word = tokenizer.index_word.get(predicted_index, "")
        if not output_word:
            # The class id has no word (e.g. the padding id 0); skip it
            # instead of appending an empty word and a stray space.
            continue
        seed_text += " " + output_word
    return seed_text.title()

In [14]:
import tensorflow as tf

## This file contains the model previously trained on 100 epochs
# Load that saved model from Google Drive; the generation demos below use it
# (as `new_model`) rather than the 10-epoch `model` trained in this session.
fp = "/content/drive/My Drive/BT4222 Group Project/Final Project/Codes/Jar of Pickles/LSTM_DynamicTagging"
new_model = tf.keras.models.load_model(fp)

In [17]:
print(generate_text("Yo", 5, new_model, max_sequence_len))

Yo Bitch Choosin Let Ha Ha


In [16]:
print(generate_text("You are", 5, new_model, max_sequence_len))

You Are Bitch Ass Nigga Square Times


In [18]:
print(generate_text("You are not", 5, new_model, max_sequence_len))

You Are Not Bitch Ass Nigga Square Times


In [19]:
print(generate_text("How are you so", 5, new_model, max_sequence_len))

How Are You So Bitch Ass Nigga Square Times


In [20]:
print(generate_text("You are so", 5, new_model, max_sequence_len))

You Are So Bitch Ass Nigga Square Times


In [21]:
print(generate_text("I am not a", 5, new_model, max_sequence_len))

I Am Not A Bitch Ass Nigga Square Times


In [22]:
print(generate_text("Hi how are you, why are you", 5, new_model, max_sequence_len))

Hi How Are You, Why Are You Bitch Dead Like Boss Bitch


In [23]:
print(generate_text("what do you mean, I don't want", 5, new_model, max_sequence_len))

What Do You Mean, I Don'T Want Bitch Pick Floor Drugs Turnt


After the user types a message, the model automatically predicts the next K words (K = the chosen number of consecutive words to generate). Using TF-IDF, we can then set a threshold based on n-grams or word frequency.

For example, if any of the top 10 words associated with classified hate speech appear in the phrase predicted from what the user has typed so far, a warning message will be shown to the user: "If you post this message and it is tagged as hate speech, you will be severely penalized."