In [2]:
# codes to mount your google drive folder
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/Othercomputers/My Computer (1)/CS605_NLP_for_Smart_Assistants/Project/NLP-Lyric-Generator/src/bin

## How to use Sentiment class to compute sentiment scores of original and generated lyrics 

In [3]:
import sys
import os
import re
import numpy as np

from gensim import downloader
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TeYan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils
from sentiment import Sentiment

In [5]:
PATH = '../../data'

In [6]:
dataset_file_names = os.listdir(PATH)
dataset_file_names[:5]

['Because it_s Singapore.txt',
 'City for the World.txt',
 'Count On Me Singapore.txt',
 'Everybody Is Special.txt',
 'Everything I Am.txt']

In [7]:
corpus = ''
for i,file in enumerate(dataset_file_names):
    text = open(PATH + '/' + file, mode='r').read()
    if i == 0:
        corpus += text
    else:
        corpus = corpus + '\n\n' + text

In [8]:
print(list(downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [9]:
glove_vectors = downloader.load('glove-twitter-25')

In [10]:
clean_corpus = re.sub(r'<[A-Z]+>|', '', corpus)
clean_corpus = re.sub(r'\n', ' ', clean_corpus)
clean_corpus = re.sub('\s+', ' ', clean_corpus)
clean_corpus = clean_corpus.strip()

stop_words = set(stopwords.words("english"))

In [19]:
# create Sentiment object
sentiment = Sentiment()

In [20]:
# clean and tokenize text
sentiment.clean_text(
    original_text = clean_corpus, 
    generated_text = 'I love Singapore', 
    remove_stopwords=True, 
    stop_words=stop_words
)
print(sentiment.original_tokens[:10])
print(sentiment.generated_tokens)

# alternatively, if text has already been cleaned and tokenized, 
# this step can be skipped by just initializing with the tokenized texts (uncomment next chunk)
# sentiment = Sentiment(
#     original_tokens=['step', 'step', 'together', 'build', 'dreams', 'heart', 'heart', 'together', 'stay', 'one'],
#     generated_tokens=['love', 'singapore']
# )

# OR, if only either original or generated has been cleaned & tokenized, can just pass in one of them. But do perform
# clean_text() on the untokenized one afterwards
# sentiment = Sentiment(
#     original_tokens=['step', 'step', 'together', 'build', 'dreams', 'heart', 'heart', 'together', 'stay', 'one'],
# )
# sentiment.clean_text(
#     generated_text = 'I love Singapore', 
#     remove_stopwords=True, 
#     stop_words=stop_words
# )

# sentiment = Sentiment(
#     generated_tokens=['love', 'singapore'],
# )
# sentiment.clean_text(
#     original_text = clean_corpus,  
#     remove_stopwords=True, 
#     stop_words=stop_words
# )

['step', 'step', 'together', 'build', 'dreams', 'heart', 'heart', 'together', 'stay', 'one']
['love', 'singapore']


In [21]:
# this step scores the vader sentiment of the original and generated text 
sentiment.score_vader_sentiment()  
print(sentiment.vader_sentiment_scores)

{'original': {'neg': 0.015, 'neu': 0.613, 'pos': 0.372, 'compound': 1.0}, 'generated': {'neg': 0.0, 'neu': 0.192, 'pos': 0.808, 'compound': 0.6369}}


In [22]:
# this step extracts the top n similar word vectors to each theme
# and get their average word vector
sentiment.get_theme_vector(
    sentiment_themes=['patriot', 'love'], 
    embedding=glove_vectors, 
    topn=10
)
print(sentiment.sentiment_themes)
print(sentiment.all_theme_vectors)

['patriot', 'love']
{'patriot': array([-0.3555834 ,  0.33084852, -0.712854  , -0.77890414, -0.13798162,
       -0.89783   ,  0.35511184, -0.5382708 ,  0.7230024 , -0.20963497,
        0.224243  ,  0.0498352 , -2.29359   ,  0.602347  ,  0.24020371,
       -0.43554345,  0.10030401,  0.0480234 ,  0.829517  ,  0.55805457,
       -0.427166  ,  0.5162381 , -0.63825256, -1.70959   , -0.0421189 ],
      dtype=float32), 'love': array([-0.8218921 ,  0.0957543 ,  0.1357924 ,  0.24185178, -0.52851546,
       -0.10268509,  1.8338101 ,  0.6263027 , -0.955307  , -0.2120904 ,
       -0.71401703,  0.29729578, -4.3994102 , -0.22828679, -0.4999215 ,
       -0.0293273 ,  0.16161294, -0.7575851 , -0.20355968,  0.04964399,
        0.0214574 ,  0.1089946 , -0.302731  ,  0.5473858 , -0.32484618],
      dtype=float32)}


In [23]:
# this step scores the sentiment of the original and generated text 
# by comparing them against the theme word vectors using cos sim 
sentiment.score_word_vector_sentiment()  
print(sentiment.word_vector_sentiment_scores)

{'original': {'patriot': 0.61356354, 'love': 0.9379613}, 'generated': {'patriot': 0.6044109, 'love': 0.92513263}}


In [24]:
# look at sentiment class methods and attributes
dir(sentiment)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'all_theme_vectors',
 'clean_text',
 'embedding',
 'generated_tokens',
 'get_theme_vector',
 'original_tokens',
 'score_vader_sentiment',
 'score_word_vector_sentiment',
 'sentiment_themes',
 'topn',
 'vader_sentiment_scores',
 'word_vector_sentiment_scores']

## Below codes are for testing (can ignore)

In [61]:


# arguments
# remove_stopwords = True
stop_words = set(stopwords.words("english"))

all_vectors = np.empty(0)

# preprocess, tokenize and remove stopwords
new_text = utils.preprocess_text(clean_corpus)
tokens = new_text.split(" ")
if remove_stopwords:
    tokens = [t for t in tokens if t not in stop_words]

# get word vector of each token
all_vectors = [glove_vectors[t] for t in tokens if t in glove_vectors]

all_vectors_avg = np.mean(all_vectors, axis=0)


def clean_text(text, **kwargs):
    """Clean text by standardized preprocessing, tokenize, and remove stopwords if specified

    Args:
      text (str): text to be cleaned
      remove_stopwords (bool): whether to remove stopwords or not
      stop_words (list): list of stopwords to be removed

    Returns:
      tokens (list): tokenized cleaned text
    """
    
    new_text = utils.preprocess_text(text)
    tokens = new_text.split(" ")
    if remove_stopwords:
        tokens = [t for t in tokens if t not in stop_words]

    return tokens


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TeYang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [225]:
all_vectors_avg

array([-2.9950729e-01,  2.5934017e-01,  1.2592184e-02, -2.1609175e-03,
       -3.3929065e-01,  1.2586825e-02,  1.2069796e+00, -2.1457371e-01,
       -1.2859058e-01, -4.3410923e-02, -3.0766147e-01,  2.8144166e-01,
       -3.8353212e+00,  1.9896312e-01, -7.4839495e-02,  2.0032370e-01,
        1.2519662e-01, -3.4012094e-01,  4.4953417e-02, -2.1795860e-01,
       -1.9243003e-01,  7.5987183e-02, -3.2589704e-02,  4.1797020e-02,
       -1.4282893e-01], dtype=float32)

In [62]:
clean_text(clean_corpus, remove_stopwords=True, stop_words=stop_words)

['step',
 'step',
 'together',
 'build',
 'dreams',
 'heart',
 'heart',
 'together',
 'stay',
 'one',
 'nation',
 'undivided',
 'back',
 'back',
 'together',
 'brave',
 'heat',
 'cold',
 'storms',
 'hand',
 'hand',
 'together',
 'grow',
 'land',
 'call',
 'home',
 'nothing',
 'world',
 'compares',
 'singaporean',
 'life',
 'everyone',
 'family',
 'friend',
 'neighbour',
 'living',
 'harmony',
 'nothing',
 'world',
 'compares',
 'island',
 'home',
 'love',
 'know',
 'never',
 'alone',
 'singapore',
 'step',
 'step',
 'together',
 'build',
 'dreams',
 'heart',
 'heart',
 'together',
 'stay',
 'one',
 'nation',
 'undivided',
 'back',
 'back',
 'together',
 'brave',
 'heat',
 'cold',
 'storms',
 'hand',
 'hand',
 'together',
 'grow',
 'land',
 'call',
 'home',
 'nothing',
 'world',
 'compares',
 'singaporean',
 'life',
 'everyone',
 'family',
 'friend',
 'neighbour',
 'living',
 'harmony',
 'nothing',
 'world',
 'compares',
 'island',
 'home',
 'love',
 'know',
 'never',
 'alone',
 'nothin

In [44]:
# arguments
sentiment_themes = ["patriot", "love"]
all_theme_vector = {}

for theme in sentiment_themes:
    most_sim = [x[0] for x in glove_vectors.most_similar(theme, topn=10)]
    theme_vector = np.mean(glove_vectors[most_sim], axis=0)
    all_theme_vector[theme] = theme_vector


def get_theme_vector(sentiment_themes, embedding):
    """Compute the average vector for each given theme based on a specified word embedding

    Args:
      sentiment_themes (list): list of strings of the sentiment themes to compute
      embedding (gensim Word2VecKeyedVectors): the word embedding for extracting theme vectors

    Returns:
      all_theme_vector (dict): contains the theme & average word vector pairs
    """

    # initialize
    all_theme_vector = {}

    for theme in sentiment_themes:
        most_sim = [x[0] for x in embedding.most_similar(theme, topn=10)]
        theme_vector = np.mean(embedding[most_sim], axis=0)
        all_theme_vector[theme] = theme_vector

    return all_theme_vector


In [40]:
all_theme_vector

{'patriot': array([-0.3555834 ,  0.33084852, -0.712854  , -0.77890414, -0.13798162,
        -0.89783   ,  0.35511184, -0.5382708 ,  0.7230024 , -0.20963497,
         0.224243  ,  0.0498352 , -2.29359   ,  0.602347  ,  0.24020371,
        -0.43554345,  0.10030401,  0.0480234 ,  0.829517  ,  0.55805457,
        -0.427166  ,  0.5162381 , -0.63825256, -1.70959   , -0.0421189 ],
       dtype=float32),
 'love': array([-0.8218921 ,  0.0957543 ,  0.1357924 ,  0.24185178, -0.52851546,
        -0.10268509,  1.8338101 ,  0.6263027 , -0.955307  , -0.2120904 ,
        -0.71401703,  0.29729578, -4.3994102 , -0.22828679, -0.4999215 ,
        -0.0293273 ,  0.16161294, -0.7575851 , -0.20355968,  0.04964399,
         0.0214574 ,  0.1089946 , -0.302731  ,  0.5473858 , -0.32484618],
       dtype=float32)}

In [37]:
np.mean(glove_vectors[['you', 'haha']], axis=0).shape

(25,)

In [22]:
glove_vectors.similar_by_vector(all_vectors_avg)

[('there', 0.9834133386611938),
 ('way', 0.9821949601173401),
 ('it', 0.9725627899169922),
 ('every', 0.9713938236236572),
 ('all', 0.9710874557495117),
 ('have', 0.9680160284042358),
 ('this', 0.9659813642501831),
 ('where', 0.9656959772109985),
 ('only', 0.9645785093307495),
 ('and', 0.9639434218406677)]

In [17]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\TeYan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
sid = SentimentIntensityAnalyzer()
sid.polarity_scores('hi how are you happy')

{'neg': 0.0, 'neu': 0.519, 'pos': 0.481, 'compound': 0.5719}

In [22]:
sid.polarity_scores('hi happy')

{'neg': 0.0, 'neu': 0.213, 'pos': 0.787, 'compound': 0.5719}

In [39]:
class Sentiment:
    """
    A class for comparing sentiment scores of original text and generated text

    ...
    Attributes
    ----------
    original_tokens : list
        list of tokens for original text (defaults to None)
    generated_tokens : list
        list of tokens for generated text (defaults to None)
    ----------------------------------------------------------------------
    """

    def __init__(self, original_tokens=None, generated_tokens=None):

        self.original_tokens = original_tokens
        self.generated_tokens = generated_tokens

    def clean_text(self, **kwargs):
        """Clean text by standardized preprocessing, tokenize, and remove stopwords if specified

        Args:
        original_text (str, optional): original text to be cleaned
        generated_text (str, optional): generated text to be cleaned
        remove_stopwords (bool): whether to remove stopwords or not
        stop_words (list): list of stopwords to be removed. Required if remove_stopwords is True
        """
        
        if kwargs.get('original_text'):
            original_new_text = utils.preprocess_text(kwargs['original_text'])
            original_tokens = original_new_text.split(" ")
            if kwargs.get('remove_stopwords'):
                original_tokens = [t for t in original_tokens if t not in kwargs['stop_words']]
            self.original_tokens = original_tokens

        if kwargs.get('generated_text'):
            generated_new_text = utils.preprocess_text(kwargs['generated_text'])
            generated_tokens = generated_new_text.split(" ")
            if kwargs.get('remove_stopwords'):
                generated_tokens = [t for t in generated_tokens if t not in kwargs['stop_words']]
            self.generated_tokens = generated_tokens


    @staticmethod
    def get_theme_vector(sentiment_themes, embedding, topn=10):
        """Compute the average vector for each given theme based on a specified word embedding

        Args:
        sentiment_themes (list): list of strings of the sentiment themes to compute
        embedding (gensim Word2VecKeyedVectors): the word embedding for extracting theme vectors
        topn (int): determine the top n similar words to the theme for extraction (defaults to 10)
        """

        # initialize
        all_theme_vectors = {}

        sentiment_themes = [theme.lower() for theme in sentiment_themes]
        for theme in sentiment_themes:
            most_sim = [x[0] for x in embedding.most_similar(theme, topn=topn)]
            theme_vector = np.mean(embedding[most_sim], axis=0)
            all_theme_vectors[theme] = theme_vector

        Sentiment.sentiment_themes = sentiment_themes
        Sentiment.all_theme_vectors = all_theme_vectors
        Sentiment.embedding = embedding
        Sentiment.topn = topn


    def score_word_vector_sentiment(self):
        """Compute the cosine similarity score of each text with each theme
        """
        
        # check if original & generated tokens exist
        if not self.original_tokens or not self.generated_tokens:
            error_text = """ORIGINAL or GENERATED tokens does NOT exist.
            Either pass it into the class using Sentimentality(original_text=tokens, generated_text=tokens)
            or use the clean_text method. """
            raise AttributeError(error_text)

        # check if theme average vectors exist
        try: self.all_theme_vectors
        except AttributeError as error:
            # print(error)

            error_text = """No theme vector exists. Run get_theme_vector() to compute them.
            For more info, refer to help(Sentimentality.get_theme_vector)"""
            raise AttributeError(error_text)

        # initialize
        sentiment_scores = {'original':{}, 'generated':{}}

        # get mean word vector of original and generated text
        original_vectors = [self.embedding[t] for t in self.original_tokens if t in self.embedding]
        original_vectors_avg = np.mean(original_vectors, axis=0)
        generated_vectors = [self.embedding[t] for t in self.generated_tokens if t in self.embedding]
        generated_vectors_avg = np.mean(generated_vectors, axis=0)

        for theme, vector in self.all_theme_vectors.items():
            original_cossim = np.dot(original_vectors_avg,vector)/(norm(original_vectors_avg)*norm(vector))
            sentiment_scores['original'][theme] = original_cossim
            generated_cossim = np.dot(generated_vectors_avg,vector)/(norm(generated_vectors_avg)*norm(vector))
            sentiment_scores['generated'][theme] = generated_cossim

        self.word_vector_sentiment_scores = sentiment_scores

    def score_vader_sentiment(self):
        """Compute the vader sentiment score of original & generated text 
        """
        
        # check if original & generated tokens exist
        if not self.original_tokens or not self.generated_tokens:
            error_text = """ORIGINAL or GENERATED tokens does NOT exist.
            Either pass it into the class using Sentimentality(original_text=tokens, generated_text=tokens)
            or use the clean_text method. """
            raise AttributeError(error_text)

        # initialize
        sentiment_scores = {'original':{}, 'generated':{}}

        # get mean vader sentiment score for original & generated text
        sid = SentimentIntensityAnalyzer()

        original_scores = sid.polarity_scores(' '.join(self.original_tokens))
        generated_scores = sid.polarity_scores(' '.join(self.generated_tokens))
        sentiment_scores['original'] = original_scores
        sentiment_scores['generated'] = generated_scores

        self.vader_sentiment_scores = sentiment_scores

In [40]:
stop_words = set(stopwords.words("english"))
from numpy.linalg import norm

In [41]:
sentiment = Sentiment()

In [43]:
sentiment.clean_text(
    original_text = clean_corpus, 
    generated_text = 'I love Singapore', 
    remove_stopwords=True, 
    stop_words=stop_words
)
sentiment.get_theme_vector(['patriot', 'love'], glove_vectors)
sentiment.score_word_vector_sentiment()
sentiment.score_vader_sentiment()

In [45]:
sentiment.vader_sentiment_scores

{'original': {'neg': 0.015, 'neu': 0.613, 'pos': 0.372, 'compound': 1.0},
 'generated': {'neg': 0.0, 'neu': 0.192, 'pos': 0.808, 'compound': 0.6369}}

In [44]:
dir(sentiment)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'all_theme_vectors',
 'clean_text',
 'embedding',
 'generated_tokens',
 'get_theme_vector',
 'original_tokens',
 'score_vader_sentiment',
 'score_word_vector_sentiment',
 'sentiment_themes',
 'topn',
 'vader_sentiment_scores',
 'word_vector_sentiment_scores']

In [245]:
sent.sentiment_scores

{'original': {'patriot': 0.6135636, 'love': 0.9379613},
 'generated': {'patriot': 0.6044109, 'love': 0.92513263}}