# Session 8 - Language modelling with RNNs (Text Generation)

Can we make produce **headlines** from **NY times headlines**????? :))

In [2]:
# data processing tools 
import string, os 
import pandas as pd
import numpy as np
np.random.seed(42)

# keras module for building LSTM 
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.utils as ku 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# surpress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

2023-03-22 15:17:32.190477: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Some helper functions
Predefined functions that Ross made.

In [2]:
#standard coding practice
def clean_text(txt): #for each text
    txt = "".join(v for v in txt if v not in string.punctuation).lower() #return vocab if it is not a part string.punctuation (str.punc== alle dumme tegn), essentially slet alle dumme tegn og lowercase
    txt = txt.encode("utf8").decode("ascii",'ignore') #make encoding  utf8 :))
    return txt  # NOTE this will return nonsense headings at times, since some headlines are connected to questions

def get_sequence_of_tokens(tokenizer, corpus):
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences

def generate_padded_sequences(input_sequences):
    # get the length of the longest sequence
    max_sequence_len = max([len(x) for x in input_sequences])
    # make every sequence the length of the longest on
    input_sequences = np.array(pad_sequences(input_sequences, 
                                            maxlen=max_sequence_len, 
                                            padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, 
                            num_classes=total_words)
    return predictors, label, max_sequence_len

def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential() #sequential model, den tager ord for ord 
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words,  # creating embedding about each token, learned while training  
                        10,  #small embedding, every word is represented by a 10 dimensional vector (hvilke ord ligger tættest på et givent ord i modellen)
                        input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100)) #long short term model
    model.add(Dropout(0.1)) #during learning from the data and every iteration, remove 10% of the weights (90% of weights remains)  ### this is a finetuning parameter!!!!!!
    
    # Add Output Layer
    model.add(Dense(total_words, #Dense layer =  output layer
                    activation='softmax'))

    model.compile(loss='categorical_crossentropy', 
                    optimizer='adam')
    
    return model

def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0] #get vocav
        token_list = pad_sequences([token_list], 
                                    maxlen=max_sequence_len-1, 
                                    padding='pre') #pad them = overcome fixed dimensionality
        predicted = np.argmax(model.predict(token_list),
                                            axis=1) #
        
        output_word = "" #appending stuff and printing it to look nice
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title() 

## Load the data

In [3]:
data_dir = os.path.join("..", "..", "..", "431868", "news_data")

In [19]:
pd.read_csv(data_dir + "/"+ "ArticlesApril2017.csv")

Unnamed: 0,approveDate,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,parentID,...,userLocation,userTitle,userURL,inReplyTo,articleID,sectionName,newDesk,articleWordCount,printPage,typeOfMaterial
0,1491245186,This project makes me happy to be a 30+ year T...,22022598.0,22022598,<br/>,comment,1.491237e+09,1,False,0.0,...,"Riverside, CA",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
1,1491188619,Stunning photos and reportage. Infuriating tha...,22017350.0,22017350,,comment,1.491180e+09,1,False,0.0,...,<br/>,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
2,1491188617,Brilliant work from conception to execution. I...,22017334.0,22017334,<br/>,comment,1.491179e+09,1,False,0.0,...,Raleigh NC,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
3,1491167820,NYT reporters should provide a contributor's l...,22015913.0,22015913,<br/>,comment,1.491150e+09,1,False,0.0,...,"Missouri, USA",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
4,1491167815,Could only have been done in print. Stunning.,22015466.0,22015466,<br/>,comment,1.491147e+09,1,False,0.0,...,"Tucson, Arizona",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243827,1493061963,"Sorry, but pudding has nothing to do with it; ...",22257227.0,22257227,<br/>,userReply,1.493059e+09,2,False,22255279.0,...,"New York, NY",,,22255279,58fd5c3d7c459f24986dbac5,Unknown,Culture,981.0,2,Review
243828,1493060948,"While it would be quite punny to spell it ""des...",22257466.0,22257466,<br/>,userReply,1.493061e+09,2,False,22255279.0,...,New York City,,,22255279,58fd5c3d7c459f24986dbac5,Unknown,Culture,981.0,2,Review
243829,1493087619,"See above comments. ""deserts"" is the proper.",22259265.0,22259265,<br/>,userReply,1.493076e+09,2,False,22253014.0,...,Boston,,,22253014,58fd5c3d7c459f24986dbac5,Unknown,Culture,981.0,2,Review
243830,1493042801,John Rubinstein had two brief scenes with Joan...,22250099.0,22250099,<br/>,userReply,1.493019e+09,2,False,22249901.0,...,"New York, NY",,,22249901,58fd5c3d7c459f24986dbac5,Unknown,Culture,981.0,2,Review


We're then going to load the data one at a time and append *only* the headlines to our list of data.

In [None]:
all_headlines = []
for filename in os.listdir(data_dir):
    if 'Articles' in filename:
        article_df = pd.read_csv(data_dir + filename)
        all_headlines.extend(list(article_df["headline"].values))#keep headline

We then clean up a little bit and see how many data points we have.

In [None]:
all_headlines = [h for h in all_headlines if h != "Unknown"] #list comprehension syntax (if the headline is unknown, remove it)
len(all_headlines)

We call out ```clean_text()``` function and then inspect the first 10 texts.

In [None]:
corpus = [clean_text(x) for x in all_headlines]
corpus[:10]

## Tokenize

We're then going to tokenize our data, using the ```Tokenizer()``` class from ```TensorFlow```, about which you can read more [here](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer).

We then use the ```get_sequence_of_tokens()``` function we defined above, which turns every text into a sequence of tokens based on the vocabulary from the tokenizer.

In [None]:
# note: tensorflow also has a cleaning function--> it also can clean the data (u can use this instead of the predefined function)

tokenizer = Tokenizer() 
## tokenization
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [None]:
tokenizer.word_index #ordered after frequency, e.g. "the" is the most frequent word

In [None]:
inp_sequences = get_sequence_of_tokens(tokenizer, corpus)
inp_sequences[:10] #every headline is represented by the index of the word (sounds messy)
#ok
# 46, 1601,1,  are three words and their index e.g
#it then shows the "eskalering" af ord for ord indtil sætningen er constructed


(len(inp_sequences[1:10])) #maaaany input sequences

We then want to *pad* our input sequences to make them all the same length.
- Fordi vi er løbet ind i problemet med FIXED DIMENSIONALITY == how to fix, add a 0 :))

e.g.
sequence1 = my cat
sequence2 = my big cat
these are uneven in length, how to fix? add a 0

sequence1 = my cat 0
sequence2 = my big cat

.. always make sure to add so many 0's, to the longest sequence is covered
e.g. longest seq
sequence3= my big fat cat
then = my cat 0 0 

In [None]:
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences) 
#predictors = predictors of the next word in the sequence (its supervised, because we know the next word in the sequence)
#labels = 

## Create model

We then use the ```create_model()``` function created above to initialize a model, telling the model the length of sequences and the total size of the vocabulary.

In [None]:
model = create_model(max_sequence_len, total_words) #create_model == predefined function
model.summary()

#word embedding = 10 dimensional
# lstm = 100
#dropput = 100
# dense  =


#final prediction layer = 1137765

Model training is exactly the same as last week, but instead of document labels, we're fitting the model to predict next word.

*NB!* This will take some time to train! It took me 35 minutes on UCloud 32xCPU.

In [None]:
history = model.fit(predictors, #data (x)
                    label, #y
                    epochs=100, #arbitrarily, the more, the more accurate == timewise expensive 
                    batch_size=128, 
                    verbose=1) 

### NOTE!!! IF YOU RUN THIS AGAIN, IT HAS TRAINED ON 200 EPOCHS --> u must clear history or predefine model again (in the beginning of the script)

When the model has trained, we can then use this to generate *new text*.

In [None]:
print (generate_text("Russia", 5, model, max_sequence_len)) #predefined function
# the model has leaned form NY headlines, that what comes after "Russia" is probabilitywise closest to "Wanted" etc.
# 

## Using pre-trained word embeddings

Instead of having the embedding layer as a trainable parameter, we can instead using a *pretrained word embedding* model like ```word2vec```.

OTHER PPL HAVE TRAINED/CREATED BETTER WORD EMBEDDINGS, SO WE CAN ACTUALLT JUST IMPORT/TRANSFER/BORROW THESE AND USE THEM OUR SELVES.

In the following examples, we're using [GloVe embeddings](https://nlp.stanford.edu/projects/glove/). These are trained a little differently from ```word2vec``` but they behave in the same way.

In [None]:
path_to_glove_file = os.path.join("path/to/glove/vectors")

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

We can define some variables that we're going to use later.

With hits and misses, we're counting how many words in the corpus vocabulary have a corresponding GloVe embedding; misses are the words which appear in our vocabulary but which do not have a GloVe embedding.

In [None]:
num_tokens = total_words
embedding_dim = 100
hits = 0
misses = 0

In [None]:
# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer - notice that this is different
    model.add(Embedding(
            total_words,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
            input_length=input_len)
    )
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(500))
    model.add(Dropout(0.2))
    
    # Add Output Layer
    model.add(Dense(total_words, 
                    activation='softmax'))

    model.compile(loss='categorical_crossentropy', 
                    optimizer='adam')
    
    return model

In [None]:
model = create_model(max_sequence_len, total_words)
model.summary() #the word embeddings are so much more large, e.g we have over a million trainable parameters!!
#trainable param= 
#non-trainable param= 

In [None]:
history = model.fit(predictors, 
                    label, 
                    epochs=100,
                    batch_size=128, 
                    verbose=1)

In [None]:
print (generate_text("china", 30, model, max_sequence_len))