# General preparation


In [1]:
import pickle

def save_pickle(obj, path):
    """
    Serialize an object with pickle and write it to a file.
    :param obj: the object to serialize
    :param path: destination file path; opened in binary write mode, so an existing file is replaced
    """
    with open(path, "wb") as out_file:
        pickle.dump(obj, file=out_file)


def load_pickle(path):
    """
    Read a pickled object back from a file.
    :param path: path of the pickle file; opened in binary read mode
    :return: the object stored in the file
    """
    # NOTE: unpickling can execute arbitrary code, so only use this on files you trust.
    with open(path, "rb") as in_file:
        loaded = pickle.load(in_file)
    return loaded

# dataLoader
We will first present some examples of how to work with the classes in the dataLoader file

In [2]:
from processSST import Sentence, SentimentTreeBank, SentimentTreeNode
from processSST import NEUTRAL_SENTIMENT, POSITIVE_SENTIMENT, NEGATIVE_SENTIMENT
import random

# Build the treebank object from the project's SentimentTreeBank class.
dataset = SentimentTreeBank()
# get_train_set() returns a list of Sentence objects; each Sentence holds a tree of
# SentimentTreeNode objects. Only the first 1000 sentences are kept for the examples below.
sentences = dataset.get_train_set()[:1000]

In [3]:
# Show the sentiment value stored on the first loaded sentence.
print(sentences[0].sentiment_val)

0.29167


In [4]:

def get_negated_polarity_examples(sentences_list, num_examples=None, choose_random=False):
    """
    Find the sentences whose root has a direct child with the opposite sentiment polarity.
    Sentences whose own class is NEUTRAL_SENTIMENT never qualify.
    :param sentences_list: list of Sentence objects
    :param num_examples: maximal number of indices to return; None means return all matches
    :param choose_random: only relevant when there are more matches than num_examples; if True the
    matching indices are shuffled before truncation, otherwise the first ones (in input order) are kept
    :return: list of indices (into sentences_list) of the matching sentences
    """
    limit = len(sentences_list) if num_examples is None else num_examples

    def is_polarized(sent: Sentence):
        root_polarity = sent.sentiment_class
        if root_polarity == NEUTRAL_SENTIMENT:
            return False
        # the opposite class is computed as 1 - polarity, i.e. the two polar classes are
        # assumed to be encoded as 0 and 1
        return any(child.sentiment_class == 1 - root_polarity
                   for child in sent.root.children)

    matching_indices = [index for index, sentence in enumerate(sentences_list)
                        if is_polarized(sentence)]

    # shuffle only when we actually have to drop some of the matches
    if len(matching_indices) > limit and choose_random:
        random.shuffle(matching_indices)
    return matching_indices[:limit]
# Demo: ask for at most five negated-polarity indices from the loaded training sentences.
get_negated_polarity_examples(sentences, num_examples=5)

[0, 14, 54, 109, 113]

In [5]:
def get_sentiment_words(sent: Sentence):
    """
    Collect the leaves of a sentence that carry the same sentiment class as the sentence itself.
    :param sent: a Sentence object
    :return: list of leaf nodes (in the order given by sent.get_leaves()) whose sentiment_class
    equals sent.sentiment_class
    """
    target_class = sent.sentiment_class
    matching_leaves = []
    for leaf in sent.get_leaves():
        if leaf.sentiment_class == target_class:
            matching_leaves.append(leaf)
    return matching_leaves
# Demo on sentence 54 (one of the negated-polarity indices shown earlier): list the text of
# every leaf that shares the sentence's sentiment class.
sent = sentences[54]
nodes = get_sentiment_words(sent)
[node.text for node in nodes]

[['charms'], ['co'], ['stars']]

In [6]:
def get_rare_words_examples(sentences_list, dataset: SentimentTreeBank,
                            num_sentences=50):
    """
    Rank sentences by how rare their sentiment words are in the training data.
    For each sentence we take its sentiment words as selected by get_sentiment_words (the leaves
    whose sentiment class equals the sentence's class), look up the train count of each, and score
    the sentence by the maximal count. The indices of the <num_sentences> sentences with the lowest
    score are returned. Sentences without any sentiment word are skipped.
    :param sentences_list: list of Sentence objects
    :param dataset: the SentimentTreeBank dataset object, used for its train word counts
    :param num_sentences: number of sentences to return
    :return: list of ints representing the indices of the chosen sentences out of the input
    sentences_list, ordered from the lowest score upwards (ties keep their input order)
    """
    word_counts = dataset.get_train_word_counts()

    def get_count(word_node: SentimentTreeNode):
        # a leaf's text is a list holding its single word; unseen words count as 0
        word_text = word_node.text[0]
        return word_counts[word_text] if word_text in word_counts else 0

    # compute the sentiment words once per sentence and keep (score, index) pairs
    scored_indices = []
    for index, sentence in enumerate(sentences_list):
        sentiment_words = get_sentiment_words(sentence)
        if sentiment_words:
            score = max(get_count(node) for node in sentiment_words)
            scored_indices.append((score, index))

    # sort on the score only, so that the (stable) sort preserves input order between ties
    scored_indices.sort(key=lambda pair: pair[0])
    return [index for _, index in scored_indices[:num_sentences]]

# Demo: indices of the five loaded sentences with the lowest maximal sentiment-word train count.
get_rare_words_examples(sentences, dataset, num_sentences=5)


[13, 32, 37, 76, 198]

# dataManager
Let's give some examples of how to use the dataManager.

## encode sentence as one hot
The idea is to get an ordered list of all the words, and then encode each word as a one-hot vector whose size is the length of that list.  
Then we encode the sentence as the average of the one-hot vectors of its words.

In [7]:
import numpy as np
from dataManager import DataManager
from processSST import Sentence
def get_one_hot(size, ind):
    """
    Build a one-hot vector: all zeros except for a single 1 at position ind.
    The caller is expected to pass ind < size.
    :param size: the length of the vector
    :param ind: the position that is set to 1
    :return: numpy ndarray of shape (size,) holding the one-hot vector
    """
    vector = np.zeros(shape=(size,))
    vector[ind] = 1
    return vector

def get_word_to_ind(words_list):
    """
    Map every word to the position of its first occurrence in words_list.
    Words may repeat: a repeated word keeps the index of its first occurrence, so the indices of
    the returned mapping are not necessarily contiguous when words_list contains duplicates.
    :param words_list: an iterable of words (typically a list)
    :return: the dictionary mapping words to their index
    """
    word_to_index = {}
    for position, word in enumerate(words_list):
        # setdefault leaves an already registered word untouched (first occurrence wins)
        word_to_index.setdefault(word, position)
    return word_to_index

def average_one_hots(sent: "Sentence", word_to_ind: dict):
    """
    Return the average one-hot embedding of the tokens in a sentence.
    Entry i of the result is the fraction of the sentence's tokens whose index is i. The vector
    length is the largest index in word_to_ind plus one.
    Every word in sent.text is assumed to be a key of word_to_ind.
    :param sent: a sentence object; only its text attribute (the sequence of tokens) is used
    :param word_to_ind: a mapping between words to indices
    :return: numpy ndarray with the averaged one-hot vector; all zeros for a sentence without tokens
    :raises KeyError: if a token of the sentence is missing from word_to_ind
    :raises ValueError: if word_to_ind is empty
    """
    text = sent.text
    size_of_one_hot = max(word_to_ind.values()) + 1
    token_counts = np.zeros(size_of_one_hot)
    if len(text) == 0:
        # nothing to average: return zeros instead of dividing by zero (which would give NaNs)
        return token_counts
    for word in text:
        # summing one-hot vectors is the same as counting tokens per index, without
        # allocating a vocabulary-sized vector for every token
        token_counts[word_to_ind[word]] += 1
    return token_counts / len(text)



In [8]:
from dataManager import DataManager
# load the dataset
dataset = SentimentTreeBank()
# the sentence embedding function is the average of one-hot vectors
sent_func = average_one_hots
# build the dictionary that maps each word of the dataset's word counts to an index
words_list = list(dataset.get_word_counts().keys())
word_to_ind = get_word_to_ind(words_list)
# keyword arguments for the embedding function (everything except the Sentence itself)
sent_func_kwargs = {"word_to_ind":word_to_ind }
# hand the dataset, the embedding function and its keyword arguments to the DataManager
data_manager = DataManager(use_sub_phrases=False, 
                                       sentiment_dataset=dataset, 
                                       sent_func=sent_func, sent_func_kwargs=sent_func_kwargs, 
                                       batch_size=50)
data_manager

<dataManager.DataManager at 0x1179b2ef0>

In [9]:
# get the PyTorch DataLoader for the "train" subset from the DataManager
train_set_dataloader = data_manager.get_torch_iterator(data_subset="train")
train_set_dataloader

<torch.utils.data.dataloader.DataLoader at 0x1278a38e0>

## Encode sentence by Word2Vec
The idea is similar to the above: we first embed each word into a vector, and then compute the embedding of the sentence as the average of the embeddings of its words.

First we do some preparation: if the Word2Vec file already exists, we load it; otherwise we download it. 

gensim 4.3.2 (the current version) executes `from scipy.linalg import get_blas_funcs, triu`, but `triu` was removed in scipy 1.12, and we couldn't install scipy 1.11 in the existing environment. I tried to download the vectors file and write a load function for it myself, but there is little information on the internet; everyone is using gensim.  
So my eventual solution is to use gensim 4.3.2 with Python 3.10: scipy 1.11.0 can be installed with Python 3.10, and that solves the problem. Besides, `gensim.downloader.load("word2vec-google-news-300")` seems to have stopped working, so we have to download the file from
 https://code.google.com/archive/p/word2vec/    
 (1.5 GB), unzip it (not sure if that is necessary), and then load it with `from gensim.models import KeyedVectors`.

In [17]:
import gensim.downloader
from gensim.models import KeyedVectors

def load_word2vec(word2vec_file='TempFiles/GoogleNews-vectors-negative300.bin'):
    """
    Load the full word2vec model from a local file in the binary word2vec format.
    :param word2vec_file: local path of the binary word2vec file; defaults to the manually
    downloaded GoogleNews vectors under TempFiles
    :return: the object returned by KeyedVectors.load_word2vec_format for that file
    :raises FileNotFoundError: if word2vec_file does not exist
    """
    import os  # local import: this cell can run before the notebook imports os

    # NOTE: gensim.downloader.load("word2vec-google-news-300") did not work, so the vectors
    # are read from a manually downloaded file instead.
    if not os.path.exists(word2vec_file):
        raise FileNotFoundError(
            f"Word2Vec file not found: {word2vec_file!r}. Download the GoogleNews vectors from "
            "https://code.google.com/archive/p/word2vec/ and place them at this path."
        )
    return KeyedVectors.load_word2vec_format(word2vec_file, binary=True)

In [18]:
import os


def create_or_load_slim_w2v(words_list, cache_w2v=True, w2v_path="TempFiles/w2v_dict.pkl"):
    """
    Get a smaller word2vec dictionary that only holds the words of words_list.
    If a cache file already exists at w2v_path it is loaded and returned as is. In that case
    words_list is NOT consulted, so delete the cache file when the vocabulary changes.
    Otherwise the full word2vec model is loaded, restricted to the words of words_list it knows,
    and optionally cached.
    :param words_list: list of words to use for the w2v dict
    :param cache_w2v: whether to save locally the small w2v dictionary
    :param w2v_path: path of the pickle file used as cache
    :return: dictionary which maps the known words to their vectors
    """
    if os.path.exists(w2v_path):
        # NOTE: unpickling can execute arbitrary code; only keep cache files you created yourself
        return load_pickle(w2v_path)

    full_w2v = load_word2vec()
    w2v_emb_dict = {k: full_w2v[k] for k in words_list if k in full_w2v}
    if cache_w2v:
        # make sure the cache directory exists, otherwise opening the file for writing fails
        cache_dir = os.path.dirname(w2v_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        save_pickle(w2v_emb_dict, w2v_path)
    return w2v_emb_dict


In [19]:
def get_w2v_average(sent, word_to_vec, embedding_dim):
    """
    Return the average word embedding of the words of a sentence.
    Words without an entry in word_to_vec are skipped, i.e. the average is taken over the known
    words only. Only a missing word (KeyError) is tolerated; any other error, such as passing an
    object that is not a mapping, is propagated instead of being silently ignored.
    :param sent: the sentence object; only its text attribute (the sequence of words) is used
    :param word_to_vec: a dictionary mapping words to their vector embeddings
    :param embedding_dim: the dimension of the word embedding vectors; needed for the case where
    none of the words of the sentence has an embedding, in which case an all-zero vector of this
    size is returned
    :return: the average embedding vector as numpy ndarray
    """
    word_embeddings = []
    for word in sent.text:
        try:
            word_embedding = word_to_vec[word]
        except KeyError:
            # unknown word: leave it out of the average
            continue
        # we assume word_embedding is of dimension embedding_dim
        word_embeddings.append(word_embedding)

    if not word_embeddings:
        return np.zeros(embedding_dim)
    return np.mean(word_embeddings, axis=0)

In [20]:
from dataManager import DataManager
# load the dataset
dataset = SentimentTreeBank()
# the function that maps a sentence to a vector is get_w2v_average
sent_func = get_w2v_average
# Besides the Sentence object it takes two parameters: word_to_vec and embedding_dim
# build (or load from cache) the dictionary that maps a word to its Word2Vec vector
words_list = list(dataset.get_word_counts().keys())
word2Vec_dic = create_or_load_slim_w2v(words_list)
# The embedding size of the Word2Vec vectors used here is taken to be 300
W2V_EMBEDDING_DIM = 300
sent_func_kwargs = {"word_to_vec": word2Vec_dic, "embedding_dim": W2V_EMBEDDING_DIM}
# hand the dataset, the embedding function and its keyword arguments to the DataManager
data_manager = DataManager(use_sub_phrases=False, 
                                       sentiment_dataset=dataset, 
                                       sent_func=sent_func, sent_func_kwargs=sent_func_kwargs, 
                                       batch_size=50)

In [21]:
# then, as above, let's look at the DataLoader for the training set
train_set_dataloader = data_manager.get_torch_iterator(data_subset="train")
train_set_dataloader

<torch.utils.data.dataloader.DataLoader at 0x3a011b790>

## sequence of embeddings
We will try another way to embed a sentence. Fix a length SEQ_LEN = 52. We embed every word in the sentence with word2vec and represent the sentence as the sequence of these word2vec embeddings. If the sentence is longer than SEQ_LEN, we crop it; if it is shorter than SEQ_LEN, we pad the rest with all-zero embeddings. So the embedding of a sentence has shape (SEQ_LEN = 52, len_word2vec = 300).

In [16]:
SEQ_LEN = 52


def sentence_to_embedding(sent, word_to_vec, seq_len=SEQ_LEN, embedding_dim=300):
    """
    Embed a sentence as a fixed-length sequence of word embeddings.
    Row i of the result holds the embedding of the i-th word of the sentence. Sentences longer
    than seq_len are cropped, shorter ones are padded with all-zero rows, and words without an
    entry in word_to_vec are represented by an all-zero row as well. Only a missing word
    (KeyError) is tolerated; other errors, such as an embedding of the wrong dimension or a
    word_to_vec that is not a mapping, are propagated instead of being silently ignored.
    :param sent: a sentence object; only its text attribute (the sequence of words) is used
    :param word_to_vec: a word to vector mapping.
    :param seq_len: the fixed length for which the sentence will be mapped to.
    :param embedding_dim: the dimension of the w2v embedding
    :return: numpy ndarray of shape (seq_len, embedding_dim) with the representation of the sentence
    """
    sentence_embedding = np.zeros((seq_len, embedding_dim))
    for i, word in enumerate(sent.text[:seq_len]):
        try:
            word_embedding = word_to_vec[word]
        except KeyError:
            # unknown word: keep the all-zero row
            continue
        sentence_embedding[i] = word_embedding
    return sentence_embedding

In [22]:
from dataManager import DataManager
# load the dataset
dataset = SentimentTreeBank()
# the function that maps a sentence to its embedding is sentence_to_embedding
sent_func = sentence_to_embedding
# Besides the Sentence object it takes: seq_len, word_to_vec and embedding_dim
# build (or load from cache) the dictionary that maps a word to its Word2Vec vector
words_list = list(dataset.get_word_counts().keys())
word2Vec_dic = create_or_load_slim_w2v(words_list)
# The embedding size of the Word2Vec vectors used here is taken to be 300
W2V_EMBEDDING_DIM = 300
SEQ_LEN = 52
sent_func_kwargs = {"seq_len": SEQ_LEN, "word_to_vec": word2Vec_dic, "embedding_dim": W2V_EMBEDDING_DIM}
# hand the dataset, the embedding function and its keyword arguments to the DataManager
data_manager = DataManager(use_sub_phrases=False, 
                                       sentiment_dataset=dataset, 
                                       sent_func=sent_func, sent_func_kwargs=sent_func_kwargs, 
                                       batch_size=50)
# then, as above, let's look at the DataLoader for the training set
train_set_dataloader = data_manager.get_torch_iterator(data_subset="train")
train_set_dataloader

<torch.utils.data.dataloader.DataLoader at 0x2a3e122f0>