## Data PreProcessing (for an LSTM)

In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import math
import torch

In [2]:
df = pd.read_csv('wikipedia_data10K.csv')

Article length will be 10 words to begin with, there are no articles less than 10 words so this wont cause any problems. The title will be the first word in order to create an autoregressive model and accomadate the shortest titles.

In [3]:
sentences = []
titles = []

sentence_length = 10

for index, row in df.iterrows():
    sentences.append(re.split(r'\W+', row['Text'])[:sentence_length]) #Use regex to split according to non-ascii characters (letters and numbers only)
    titles.append(row['Title'].split()[0])

In [4]:
def process_sentence(sentence):
    return [word.lower() for word in sentence]

processed_sentences = []
for sent in sentences:
    processed_sentences.append(process_sentence(sent))
    
print(processed_sentences[:5])
    

[['2016', 'video', 'game', '2016', 'video', 'game', 'phoenix', 'wright', 'ace', 'attorney'], ['ammonium', 'sulfate', 'precipitation', 'is', 'one', 'of', 'the', 'most', 'commonly', 'used'], ['kinki', 'may', 'refer', 'to', 'kansai', 'region', 'japan', 'also', 'called', 'the'], ['british', 'television', 'drama', 'series', '1992', '2010', 'this', 'article', 'is', 'about'], ['selo', 'in', 'sakha', 'republic', 'russia', 'uolba', 'уолба', 'selo', '1', 'flag']]


In [5]:
def process_title(title):
    return [title.lower()]

processed_titles = []
for title in titles:
    processed_titles.append(process_title(title))
    
processed_words = processed_sentences + processed_titles
    
print(processed_titles[:10])
print(processed_words)

[['phoenix'], ['ammonium'], ['kinki'], ['heartbeat'], ['uolba'], ['nenthorn'], ['greater'], ['roseburg'], ['kalita'], ['bradford']]
[['2016', 'video', 'game', '2016', 'video', 'game', 'phoenix', 'wright', 'ace', 'attorney'], ['ammonium', 'sulfate', 'precipitation', 'is', 'one', 'of', 'the', 'most', 'commonly', 'used'], ['kinki', 'may', 'refer', 'to', 'kansai', 'region', 'japan', 'also', 'called', 'the'], ['british', 'television', 'drama', 'series', '1992', '2010', 'this', 'article', 'is', 'about'], ['selo', 'in', 'sakha', 'republic', 'russia', 'uolba', 'уолба', 'selo', '1', 'flag'], ['human', 'settlement', 'in', 'scotland', 'nenthorn', 'nenthorn', 'location', 'within', 'the', 'scottish'], ['politics', 'of', 'ghana', 'constitution', 'executive', 'president', 'list', 'nana', 'akufo', 'addo'], ['veterans', 'cemetery', 'in', 'douglas', 'county', 'oregon', 'roseburg', 'national', 'cemetery', 'roseburg'], ['look', 'up', 'kalita', 'in', 'wiktionary', 'the', 'free', 'dictionary', 'kalita', 'ma

In [6]:
vocab_frequency = defaultdict(lambda: 0)
for i, sentence in enumerate(processed_sentences):
    for j, word in enumerate(sentence):
        vocab_frequency[word] += 1  

for title in processed_titles:
    vocab_frequency[title[0]] +=1

tokens = sorted(list(vocab_frequency.keys()))


print(f'Tokens in wikipedia data: {tokens[:10]}')
print(f'Number of tokens in the wikipedia sentences: {len(tokens)}')

Tokens in wikipedia data: ['', '(322756)', '0', '00', '000', '001', '00556', '01', '01611', '02']
Number of tokens in the wikipedia sentences: 20210


Load in pretrained glove embeddings using gensim word2vec

In [8]:
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

#Loads in the pretrained GLoVE embeddings
glove_input_file = 'glove.840B.300d.txt'  # Path to the GloVe file
glove_model = KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True)

glove_vocab_size = len(list(glove_model.key_to_index))
vector_size = glove_model.vector_size
print(f'The size of the glove vocabulary is: {glove_vocab_size}')
print(f'The size of the glove vector embedding is: {vector_size}')

The size of the glove vocabulary is: 2196016
The size of the glove vector embedding is: 300


In [9]:
unique_tokens = set(tokens) - set(glove_model.key_to_index)
print(f'The number of tokens unique to the wikipedia data is: {len(unique_tokens)}')

The number of tokens unique to the wikipedia data is: 6875


In [10]:
try: 
    tuned_embedding_model = Word2Vec.load("wikipedia_glove_50d.model")
    embedding_wv = tuned_embedding_model.wv

except:
    # Initialize new vectors for unique tokens
    # Create a dictionary of tokens and their corresponding random vectors
    unique_token_vectors = {token: np.random.rand(vector_size) for token in unique_tokens}

    # Add all vectors at once
    glove_model.add_vectors(list(unique_token_vectors.keys()), list(unique_token_vectors.values()))
    print(f'The size of the glove vocabulary is now: {glove_vocab_size}')

    # Initialize a Word2Vec model with your specifications
    w2v_model = Word2Vec(vector_size=vector_size, window=5, min_count=1, workers=4)
    w2v_model.build_vocab(processed_words) 
    
    # Update the Word2Vec model's vectors with GloVe vectors
    for word in glove_model.key_to_index:
        if word in w2v_model.wv:
            w2v_model.wv[word] = glove_model[word]

    #train the model on the sentences
    w2v_model.train(processed_words, total_examples=len(processed_words), epochs=100)

    w2v_model.save("wikipedia_glove_300d.model")
    tuned_embedding_model = Word2Vec.load("wikipedia_glove_300d.model")
    embedding_wv = tuned_embedding_model.wv

In [11]:
print(embedding_wv['sowkiyama'])
print(embedding_wv.most_similar('sowkiyama', topn=10))

[0.45330867 1.0642928  0.84878963 0.51782614 0.2802927  0.70893234
 0.3160189  0.07146805 0.9692529  0.8647118  0.39021382 0.4791659
 0.7206728  0.7916864  0.3824404  0.9035414  0.4607044  0.89700276
 0.16420059 0.29969966 0.21200772 0.8530581  0.50389254 0.7935326
 0.95522624 0.21783412 0.5226377  0.32038403 0.37859178 0.5649425
 1.3317353  0.70004797 0.09979377 0.47274697 0.601713   0.03755258
 0.18549764 0.8392658  0.2532998  0.418207   0.70637095 0.36627308
 0.9587399  0.121926   0.72291315 1.0729222  0.43057355 0.11389321
 0.85766923 0.08021853]
[('muthalali', 0.901416540145874), ('tuyú', 0.8799759745597839), ('4everevolution', 0.8796672224998474), ('itaboraian', 0.8774718046188354), ('zekra', 0.877371072769165), ('helograpsus', 0.8769294023513794), ('twilightning', 0.8712912797927856), ('goua', 0.87016761302948), ('futbal', 0.8697834014892578), ('chariesthes', 0.869722843170166)]


In [12]:
embedded_sentences = [[embedding_wv[word] for word in sentence] for sentence in processed_sentences]
embedded_titles = [embedding_wv[title] for title in processed_titles]

print(f'The shape of the data is: {len(embedded_sentences)} *  {len(embedded_sentences[0])} * {len(embedded_sentences[0][0])}')
print('Shape: num_sentences * sentence_length * embedding_dimensions')

The shape of the data is: 10000 *  10 * 50
Shape: num_sentences * sentence_length * embedding_dimensions


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def find_closest_word(embedding, model):
    max_similarity = -1
    most_similar_word = None

    for word in model.key_to_index:
        sim = cosine_similarity([embedding], [model[word]])
        if sim > max_similarity:
            max_similarity = sim
            most_similar_word = word

    return most_similar_word

def translate_sentence(sentence, model):
    translation = ''
    for word in sentence:
        translation += find_closest_word(word, model) + ' '
    return translation


In [14]:
# Creating lists that will hold our input and target sample sequences
input_seq = []
target_seq = []

for sentence in embedded_sentences:
    target_seq.append(sentence)
for title in embedded_titles:
    input_seq.append(title)

test_num = 1
print(f'Input sequence length: {len(input_seq[test_num])}')
print(f'Target sequence length: {len(target_seq[test_num])}\n')
print(f'Input shape {np.array(input_seq).shape}')
print(f'Target shape {np.array(target_seq).shape}')
str = ' '.join(processed_sentences[test_num])
print(f'Original sentence: {str}')
print(f'Input sequence: {translate_sentence(input_seq[test_num], embedding_wv)}')
print(f'Translated sequence: {translate_sentence(target_seq[test_num], embedding_wv)}')

Input sequence length: 1
Target sequence length: 10

Input shape (10000, 1, 50)
Target shape (10000, 10, 50)
Original sentence: ammonium sulfate precipitation is one of the most commonly used
Input sequence: ammonium 
Translated sequence: ammonium sulfate precipitation is one of the most commonly used 


In [15]:
val_split = 0.1
num_sents = len(input_seq)

N = math.ceil(num_sents - (num_sents * val_split))

#Not random split, i want to see how the model predicts the end of the novel
train_x = np.array(input_seq[:N])
train_y = np.array(target_seq[:N])
val_x = np.array(input_seq[N:])
val_y = np.array(target_seq[N:])

print(f'Amount of training data: {len(train_x)}')
print(f'Amount of test data: {len(val_x)}')

Amount of training data: 9000
Amount of test data: 1000


In [16]:
train_input_seq = torch.Tensor(train_x).type(torch.DoubleTensor)
train_target_seq = torch.Tensor(train_y).type(torch.DoubleTensor)
val_input_seq = torch.Tensor(val_x).type(torch.DoubleTensor)
val_target_seq = torch.Tensor(val_y).type(torch.DoubleTensor)

In [17]:
torch.save(train_input_seq, 'train_input_seq_300d.pt')
torch.save(train_target_seq, 'train_target_seq_300d.pt')
torch.save(val_input_seq, 'val_input_seq_300d.pt')
torch.save(val_target_seq, 'val_target_seq_300d.pt')