In [None]:
from __future__ import division

import numpy as np

# Constants
EMBEDDING_FILE = 'glove.6B.50d.txt'  # path handed to load_word_embeddings below
UNKNOWN_TOKEN = '<UNK>'  # vocabulary entry word_to_vector falls back to for unknown words
END_TOKEN = '<EOS>'  # appended to the vocabulary as the end-of-sentence entry
PAD_TOKEN = '<PAD>'  # appended to the vocabulary as the padding entry
EPSILON = 0.0001  # offset added to the 'unk'/'eos' vectors to derive the special-token vectors

def load_word_embeddings(file_path):
    """Load pre-trained word embeddings from a whitespace-delimited text file.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the remaining fields as its vector components.

    Args:
        file_path (str): Path to the embedding text file.

    Returns:
        tuple: ``(vocabulary, embeddings)`` as two parallel lists, where
        ``embeddings[i]`` is the list of component strings (not yet converted
        to floats) belonging to ``vocabulary[i]``.
    """
    vocabulary = []
    embeddings = []

    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # which varies between operating systems and can fail on non-ASCII tokens.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing newline at end of file): nothing to read.
                continue
            vocabulary.append(parts[0])
            embeddings.append(parts[1:])

    print('Word vector embeddings loaded successfully.')
    return vocabulary, embeddings

# Load pre-trained word embeddings
vocabulary, embeddings = load_word_embeddings(EMBEDDING_FILE)
embedding_dim = len(embeddings[0])  # Dimension of each word vector

# Constant offset used to derive the special-token vectors from existing rows
small_value = np.zeros((embedding_dim,), dtype=np.float32) + EPSILON

# Special-token vectors: <UNK>/<EOS> are the pre-trained 'unk'/'eos' rows
# shifted by EPSILON; <PAD> is the all-zero vector.
unk_embedding = np.asarray(embeddings[vocabulary.index('unk')], dtype=np.float32) + small_value
eos_embedding = np.asarray(embeddings[vocabulary.index('eos')], dtype=np.float32) + small_value
pad_embedding = np.zeros(embedding_dim, dtype=np.float32)

# Parse the pre-trained rows to float32 once; reused for the duplicate check
# and for the final matrix instead of converting every row repeatedly.
pretrained_matrix = np.asarray(embeddings, dtype=np.float32)

# Informational check: does a pre-trained row already equal a special vector?
has_unk = bool(np.any(np.all(pretrained_matrix == unk_embedding, axis=1)))
has_eos = bool(np.any(np.all(pretrained_matrix == eos_embedding, axis=1)))
if has_unk:
    print("UNK token already exists in embeddings")
if has_eos:
    print("EOS token already exists in embeddings")

# Append each special token together with its vector. The token and the vector
# must always be added as a pair: adding the token but skipping the vector
# would shift every later row and break the vocabulary <-> matrix alignment
# that word_to_vector relies on.
for special_token, special_vector in (
    (UNKNOWN_TOKEN, unk_embedding),
    (END_TOKEN, eos_embedding),
    (PAD_TOKEN, pad_embedding),
):
    vocabulary.append(special_token)
    embeddings.append(special_vector)

# Final matrix: pre-trained rows followed by the <UNK>, <EOS>, <PAD> rows
embedding_matrix = np.vstack(
    [pretrained_matrix, unk_embedding, eos_embedding, pad_embedding]
).astype(np.float32, copy=False)

Word vector embeddings Loaded.


In [None]:
def word_to_vector(word, vocabulary, embedding_matrix):
    """Look up the embedding row for a word.

    Args:
        word (str): Word whose vector is requested
        vocabulary (list): Words, positionally aligned with the matrix rows
        embedding_matrix (np.ndarray): One embedding per row

    Returns:
        np.ndarray: The row for ``word``, or the row of the unknown token
        when ``word`` is not in ``vocabulary``
    """
    try:
        row = vocabulary.index(word)
    except ValueError:
        # Word is not in the vocabulary: fall back to the unknown token's row
        row = vocabulary.index(UNKNOWN_TOKEN)
    return embedding_matrix[row]


In [None]:
def find_most_similar_by_cosine(query_vector, embedding_matrix):
    """Find most similar words using cosine similarity.

    Cosine similarity is undefined for a zero-length vector. Rows with zero
    norm (for example an all-zero padding row) are therefore given a
    similarity of ``-inf`` so they rank last. Without this they would produce
    NaN, which sorts to the end and would come out first after the reversal.
    If the query itself has zero norm, every row gets ``-inf``.

    Args:
        query_vector (np.ndarray): The vector to compare against
        embedding_matrix (np.ndarray): Matrix containing all word embeddings,
            one embedding per row

    Returns:
        np.ndarray: Row indices sorted by cosine similarity (descending)
    """
    # Compute dot products of every row with the query
    dot_products = np.sum(np.multiply(embedding_matrix, query_vector), axis=1)

    # Compute vector magnitudes
    query_norm = np.sqrt(np.sum(np.square(query_vector)))
    embedding_norms = np.sqrt(np.sum(np.square(embedding_matrix), axis=1))
    denominators = np.multiply(query_norm, embedding_norms)

    # Divide only where the denominator is positive; all other entries keep
    # the -inf default so they cannot outrank a real similarity.
    cosine_similarities = np.full(denominators.shape, -np.inf, dtype=np.float64)
    np.divide(dot_products, denominators,
              out=cosine_similarities, where=denominators > 0)

    # Return indices sorted by similarity (highest first)
    return np.flip(np.argsort(cosine_similarities), axis=0)


def find_most_similar_by_euclidean(query_vector, embedding_matrix):
    """Rank embedding rows by their Euclidean distance to a query vector.

    Args:
        query_vector (np.ndarray): Reference vector
        embedding_matrix (np.ndarray): One embedding per row

    Returns:
        np.ndarray: Row indices ordered from nearest to farthest
    """
    offsets = embedding_matrix - query_vector
    distances = np.sqrt((offsets ** 2).sum(axis=1))
    return distances.argsort()


# Example usage: list the nearest neighbours of a single word
target_word = 'frog'
top_n = 10

# Embed the query word, then rank every vocabulary row by distance to it
word_vector = word_to_vector(target_word, vocabulary, embedding_matrix)
similar_indices = find_most_similar_by_euclidean(word_vector, embedding_matrix)

# Print a 1-based ranking of the closest rows
print(f"TOP {top_n} MOST SIMILAR WORDS TO '{target_word}':\n")
rank = 0
for index in similar_indices[:top_n]:
    rank += 1
    print(f"{rank}. {vocabulary[index]}")
    

TOP TEN MOST SIMILAR WORDS TO 'frog':

1. frog
2. snake
3. ape
4. toad
5. monkey
6. spider
7. lizard
8. tarantula
9. cat
10. spiny


In [None]:
def vector_to_word(query_vector, vocabulary, embedding_matrix):
    """Map a vector back to the vocabulary word whose embedding is nearest.

    Nearness is measured with Euclidean distance via
    ``find_most_similar_by_euclidean``.

    Args:
        query_vector (np.ndarray): Vector to decode
        vocabulary (list): Words, positionally aligned with the matrix rows
        embedding_matrix (np.ndarray): One embedding per row

    Returns:
        str: Vocabulary entry at the closest row
    """
    # Coerce to a float32 array before measuring distances
    as_float32 = np.asarray(query_vector, dtype=np.float32)

    # The ranking is ascending by distance, so its first entry is the nearest row
    ranking = find_most_similar_by_euclidean(as_float32, embedding_matrix)
    nearest_row = ranking[0]
    return vocabulary[nearest_row]

In [None]:
import csv
import string
from nltk.tokenize import word_tokenize

# Constants
MAX_DATA_POINTS = 100000  # cap on the number of CSV rows read by load_review_data
CSV_FILE_PATH = 'Reviews.csv'  # CSV file opened by load_review_data
TEXT_COLUMN = 'Text'  # CSV column read as the review text
SUMMARY_COLUMN = 'Summary'  # CSV column read as the review summary

def clean_text(text):
    """Normalize a string for tokenization.

    The text is lowercased, then every character that is either outside
    ``string.printable`` or listed in ``string.punctuation`` is dropped
    (deleted, not replaced with a space).

    Args:
        text (str): Raw input text

    Returns:
        str: Lowercased text without non-printable or punctuation characters
    """
    lowered = text.lower()
    kept_chars = [
        char for char in lowered
        if char in string.printable and char not in string.punctuation
    ]
    return ''.join(kept_chars)

def load_review_data(file_path, max_samples):
    """Load and preprocess review data from CSV file.

    Reads rows with ``csv.DictReader``, cleans the ``TEXT_COLUMN`` and
    ``SUMMARY_COLUMN`` fields with ``clean_text`` and tokenizes them with
    ``word_tokenize``. Reading stops after ``max_samples`` rows.

    Args:
        file_path (str): Path to CSV file
        max_samples (int): Maximum number of samples to load

    Returns:
        tuple: (list of tokenized texts, list of tokenized summaries),
        two parallel lists with one entry per row read

    Raises:
        KeyError: If the CSV header lacks ``TEXT_COLUMN`` or ``SUMMARY_COLUMN``
    """
    tokenized_texts = []
    tokenized_summaries = []

    # newline='' is what the csv module asks for: it lets the reader handle
    # line endings itself, so newlines inside quoted fields are parsed correctly.
    with open(file_path, 'r', encoding='utf-8', newline='') as csv_file:
        reader = csv.DictReader(csv_file)
        for count, row in enumerate(reader):
            if count >= max_samples:
                break

            # DictReader fills fields missing from a short row with None;
            # treat those as empty text instead of crashing in clean_text.
            cleaned_text = clean_text(row[TEXT_COLUMN] or '')
            cleaned_summary = clean_text(row[SUMMARY_COLUMN] or '')

            tokenized_texts.append(word_tokenize(cleaned_text))
            tokenized_summaries.append(word_tokenize(cleaned_summary))

    return tokenized_texts, tokenized_summaries

# Load and preprocess the data
# `texts` and `summaries` are parallel lists of token lists, one entry per CSV
# row read (at most MAX_DATA_POINTS rows).
# NOTE(review): a later cell rebinds both names to a truncated subset, so their
# meaning depends on how far the notebook has been executed.
texts, summaries = load_review_data(CSV_FILE_PATH, MAX_DATA_POINTS)

In [None]:
# Configuration constants
# Inclusive upper bounds, in tokens, passed to filter_data_by_length below.
MAX_TEXT_LENGTH = 80
MAX_SUMMARY_LENGTH = 4

def filter_data_by_length(texts, summaries, max_text_len, max_summary_len):
    """Keep only the text/summary pairs that fit both length limits.

    Args:
        texts (list): Tokenized texts
        summaries (list): Tokenized summaries, paired positionally with texts
        max_text_len (int): Largest text length kept (inclusive)
        max_summary_len (int): Largest summary length kept (inclusive)

    Returns:
        tuple: (filtered_texts, filtered_summaries)
    """
    kept_pairs = [
        (text, summary)
        for text, summary in zip(texts, summaries)
        if not (len(text) > max_text_len or len(summary) > max_summary_len)
    ]
    kept_texts = [pair[0] for pair in kept_pairs]
    kept_summaries = [pair[1] for pair in kept_pairs]
    return kept_texts, kept_summaries

# Drop every pair whose text or summary exceeds its length limit
filtered_texts, filtered_summaries = filter_data_by_length(
    texts,
    summaries,
    max_text_len=MAX_TEXT_LENGTH,
    max_summary_len=MAX_SUMMARY_LENGTH,
)

# Report how many pairs remain
print(f"Current size of filtered data: {len(filtered_texts)}")

Current size of data: 48478


In [None]:
def filter_by_vocabulary(texts, summaries, vocabulary):
    """Keep only the pairs whose summary uses vocabulary words exclusively.

    Only the summary is checked; the text may contain unknown words.

    Args:
        texts (list): Tokenized texts
        summaries (list): Tokenized summaries, paired positionally with texts
        vocabulary (list): Known words

    Returns:
        tuple: (filtered_texts, filtered_summaries)
    """
    known_words = set(vocabulary)  # hashed once so each membership test is cheap

    kept_texts = []
    kept_summaries = []
    for text, summary in zip(texts, summaries):
        if not known_words.issuperset(summary):
            # At least one summary word is unknown: discard the pair
            continue
        kept_texts.append(text)
        kept_summaries.append(summary)

    return kept_texts, kept_summaries

# Filter the data
# Input is the length-filtered pairs from the previous step and the embedding
# vocabulary. The old names `texts_v2`, `summaries_v2` and `vocab` are not
# defined by any cell and raised NameError.
filtered_texts_vocab, filtered_summaries_vocab = filter_by_vocabulary(
    filtered_texts, filtered_summaries, vocabulary
)

# Output results
print(f"Current size of vocabulary-filtered data: {len(filtered_texts_vocab)}")

Current size of data: 44413


In [9]:
#REDUCE DATA (FOR SPEEDING UP THE NEXT STEPS)

MAXIMUM_DATA_NUM = 20000

# Keep the first MAXIMUM_DATA_NUM vocabulary-filtered pairs. The old names
# `texts_v3` / `summaries_v3` are not defined by any cell and raised NameError.
# NOTE: this rebinds `texts` / `summaries`, which previously held the raw
# tokenized reviews; the cells below expect the reduced subset under these names.
texts = filtered_texts_vocab[0:MAXIMUM_DATA_NUM]
summaries = filtered_summaries_vocab[0:MAXIMUM_DATA_NUM]

In [10]:
import random

# Pick one pair at random to eyeball the cleaning/tokenization result
index = random.randint(0,len(texts)-1)

# print() calls: the Python 2 print statement is a SyntaxError under Python 3,
# which the f-strings in the earlier cells already require.
print("SAMPLE CLEANED & TOKENIZED TEXT: \n\n"+str(texts[index]))
print("\nSAMPLE CLEANED & TOKENIZED SUMMARY: \n\n"+str(summaries[index]))

SAMPLE CLEANED & TOKENIZED TEXT: 

['our', 'boston', 'terrier', 'loves', 'these', 'bones', 'we', 'give', 'them', 'to', 'her', 'as', 'a', 'treat', 'or', 'to', 'keep', 'her', 'busy', 'when', 'we', 'have', 'company', 'for', 'a', '16', 'lbs', 'dog', 'shes', 'a', 'mighty', 'chewer', 'and', 'these', 'last', 'her', 'a', 'couple', 'of', 'hours', 'with', 'breaks', 'to', 'investigate', 'if', 'shes', 'missing', 'anything', 'well', 'buy', 'more', 'of', 'these']

SAMPLE CLEANED & TOKENIZED SUMMARY: 

['chloe', 'loves', 'them']


In [11]:
# Reduced vocabulary: every embedding-vocabulary word that occurs in the texts,
# in order of first appearance.
vocab_limit = []

# Sets give constant-time membership tests; scanning the lists for every token
# made this loop quadratic. (`vocab` was undefined; the embedding vocabulary
# is named `vocabulary`.)
known_words = set(vocabulary)
seen_words = set()

for text in texts:
    for word in text:
        if word in known_words and word not in seen_words:
            seen_words.add(word)
            vocab_limit.append(word)

In [12]:
# Extend the reduced vocabulary with embedding-vocabulary words that occur only
# in the summaries, again in order of first appearance.
# Sets give constant-time membership tests. (`vocab` was undefined; the
# embedding vocabulary is named `vocabulary`.)
known_words = set(vocabulary)
seen_words = set(vocab_limit)

for summary in summaries:
    for word in summary:
        if word in known_words and word not in seen_words:
            seen_words.add(word)
            vocab_limit.append(word)

In [13]:
# Close the reduced vocabulary with the three special tokens, in this order
vocab_limit.extend(['<EOS>', '<UNK>', '<PAD>'])

In [14]:
# Token count of every text, in the same order as `texts`
lentexts = [len(text) for text in texts]

# Indices into `texts`, ordered from the shortest text to the longest, so that
# texts of similar length end up next to each other.
sortedindex = np.argsort(lentexts)

In [15]:
batch_size = 50

batches_x = []
batches_y = []

# Walk the length-sorted indices in consecutive windows of batch_size.
# Only complete batches are emitted; a trailing partial batch is dropped
# because the vectorisation step indexes batch[batch_size-1].
# The earlier loop flushed a batch only at the start of the next iteration, so
# the final complete batch was never appended. It also used `xrange`, which
# does not exist in Python 3.
num_full_batches = len(texts) // batch_size

for batch_number in range(num_full_batches):
    start = batch_number * batch_size
    batch_indices = sortedindex[start:start + batch_size]
    batches_x.append([texts[int(position)] for position in batch_indices])
    batches_y.append([summaries[int(position)] for position in batch_indices])
    

In [16]:
import math

# For every batch of texts build two float32 arrays:
#   vec_batches_x     - word vectors, padded to the batch's longest text
#   vec_batches_x_pe  - the same vectors with a positional encoding added
# The undefined names `word_vec_dim` and `word2vec` are replaced with
# `embedding_dim` and `word_to_vector`, and Python 2 `xrange` with numpy.
vec_batches_x = []
vec_batches_x_pe = []

# word_to_vector scans the whole vocabulary list on every call, so each
# distinct word is looked up once and then served from this cache.
vector_cache = {}
pad_vector = word_to_vector(PAD_TOKEN, vocabulary, embedding_matrix)

# Positional encoding: pe[pos, i] = sin(pos / 10000 ** (2 * i / embedding_dim)).
# The divisor depends only on the dimension index, so it is computed once.
pe_divisors = np.power(10000.0, 2.0 * np.arange(embedding_dim) / embedding_dim)

for batch in batches_x:

    # Longest text in the batch (batches are length-sorted, but max() does not
    # depend on that ordering)
    max_len_x = max(len(text) for text in batch)

    # Encoding for every position used in this batch, shape (max_len_x, embedding_dim)
    positions = np.arange(max_len_x).reshape(-1, 1)
    pe_table = np.sin(positions / pe_divisors).astype(np.float32)

    vec_texts = []
    vec_texts_pe = []

    for text in batch:

        # Start from all-padding rows; padding positions get no positional encoding
        vec_text = np.tile(pad_vector, (max_len_x, 1))
        vec_text_pe = vec_text.copy()

        if text:
            for word in text:
                if word not in vector_cache:
                    vector_cache[word] = word_to_vector(word, vocabulary, embedding_matrix)
            word_vectors = np.stack([vector_cache[word] for word in text])

            vec_text[:len(text)] = word_vectors
            vec_text_pe[:len(text)] = word_vectors + pe_table[:len(text)]

        vec_texts.append(vec_text)
        vec_texts_pe.append(vec_text_pe)

    vec_batches_x.append(np.asarray(vec_texts, np.float32))
    vec_batches_x_pe.append(np.asarray(vec_texts_pe, np.float32))
    

In [17]:
# For every batch of summaries build one float32 array of word vectors, each
# summary terminated by the <EOS> vector and padded to a fixed length.
# The undefined names `max_len_sum` and `word2vec` are replaced with
# `MAX_SUMMARY_LENGTH` and `word_to_vector`.
vec_batches_y = []

# Fixed target length: the longest allowed summary plus one slot for <EOS>
max_len_y = MAX_SUMMARY_LENGTH + 1

# word_to_vector scans the whole vocabulary list on every call, so each
# distinct word is looked up once and then served from this cache.
summary_vector_cache = {}
eos_vector = word_to_vector(END_TOKEN, vocabulary, embedding_matrix)
pad_vector = word_to_vector(PAD_TOKEN, vocabulary, embedding_matrix)

for batch in batches_y:

    vec_summaries = []

    for summary in batch:

        vec_summary = []
        for word in summary:
            if word not in summary_vector_cache:
                summary_vector_cache[word] = word_to_vector(word, vocabulary, embedding_matrix)
            vec_summary.append(summary_vector_cache[word])

        vec_summary.append(eos_vector)

        # Right-pad up to max_len_y (adds nothing when already long enough)
        vec_summary.extend([pad_vector] * (max_len_y - len(vec_summary)))

        vec_summaries.append(vec_summary)

    vec_batches_y.append(np.asarray(vec_summaries, np.float32))

In [18]:
#Saving processed data in another file.

import pickle

# Everything to persist is bundled into one list; the element order here is the
# layout a reader of the file has to unpack.
# NOTE(review): `vec` is not defined in any cell visible here, so this line
# raises NameError unless it is defined elsewhere. Confirm what it should be
# (or drop it) before relying on this cell.
PICK = [vocab_limit,batch_size,vec_batches_x,vec_batches_y,vec_batches_x_pe,vec]

# Serialize the bundle to the file 'AmazonPICKLE' (opened in binary write mode).
with open('AmazonPICKLE', 'wb') as fp:
    pickle.dump(PICK, fp)