# Setup

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

from pandas import read_excel

# Directory (relative to the notebook's working directory) that holds the
# lexicon files loaded in the "Import data" cells below.
data_path = './sentiment_analysis_data'

## Import data

### Hu & Liu Lexicon
This cell loads a simple two-category lexicon: about 6,800 words, each labeled positive or negative, from a study by Minqing Hu and Bing Liu.

In [42]:
# Load the Hu & Liu lexicon: one set of words per sentiment, each read from a
# plain-text file containing one entry per line.
sentiments = ['Positive', 'Negative']
sentiment_path_map = {
    sentiments[0]: data_path + '/hu_positive_lexicon.txt',
    sentiments[1]: data_path + '/hu_negative_lexicon.txt',
}
sentiment_lexicon_map = {s: set() for s in sentiments}

for sentiment, path in sentiment_path_map.items():
    lexicon = sentiment_lexicon_map[sentiment]
    with open(path) as file:
        for line in file:
            # Strip both ends so stray leading whitespace cannot produce an
            # entry that would never match a token.
            word = line.strip()
            # Skip blank lines and ';'-prefixed comment/header lines so they
            # do not end up in the lexicon as bogus "words".
            # NOTE(review): the upstream Hu & Liu files start with such a
            # header - confirm whether these local copies still carry it.
            if not word or word.startswith(';'):
                continue
            lexicon.add(word)

### Harvard Lexicon
Running this cell is a good bit spicier: we get nearly 12,000 words, each tagged with a number of more complex sentiment categories. A very long word list follows, courtesy of Harvard (2000); RIP Philip Stone.

In [46]:
# Load the Harvard lexicon: for every sentiment category, collect the
# lower-cased, purely alphabetic entries tagged with that category.
sentiments = ['Positiv', 'Negativ', 'Strong', 'Power', 'Weak', 'Submit', 'Active', 'Passive', 'Pleasur', 'Pain', 'Feel', 'Arousal', 'EMOT', 'Virtue', 'Vice', 'Ovrst', 'Undrst']

lexicon_df = read_excel(data_path + '/harvard_lexicon.xls')

# Normalise the entries once, column-wise, instead of walking the frame row by
# row with iterrows() (slow, and `for _, row in ...` overwrites IPython's `_`).
entries = lexicon_df['Entry'].map(lambda entry: str(entry).lower())

# Only purely alphabetic entries are kept; anything containing digits,
# punctuation or whitespace is dropped.
is_alpha = entries.map(str.isalpha).astype(bool)

# An entry belongs to a category when the cell in that category's column
# holds the category name itself.
sentiment_lexicon_map = {
    sentiment: set(entries[is_alpha & (lexicon_df[sentiment] == sentiment)])
    for sentiment in sentiments
}

sentiment_lexicon_map

{'Positiv': {'elate',
  'obtain',
  'chic',
  'mellow',
  'awareness',
  'forgiven',
  'empower',
  'marital',
  'brilliance',
  'compensation',
  'noble',
  'boldness',
  'consensus',
  'celebration',
  'refine',
  'proprietary',
  'faithfulness',
  'accolade',
  'immaculate',
  'backbone',
  'proficient',
  'comely',
  'endow',
  'admiration',
  'glee',
  'fidelity',
  'reinforcement',
  'enchantment',
  'infallible',
  'flattery',
  'chivalrous',
  'paradise',
  'amazing',
  'affiliate',
  'comestible',
  'constructive',
  'bravery',
  'independence',
  'commodious',
  'flatter',
  'romantic',
  'advocacy',
  'unforgettable',
  'fresh',
  'accrue',
  'softness',
  'coordination',
  'salute',
  'courteous',
  'resourceful',
  'prosper',
  'gorgeous',
  'flair',
  'beneficial',
  'ingenuity',
  'protective',
  'propitious',
  'absorption',
  'justify',
  'potency',
  'eminence',
  'mighty',
  'affirm',
  'ethical',
  'myriad',
  'tempt',
  'prosecute',
  'conquer',
  'tactics',
  'eff

### Import and clean song lyrics

In [17]:
# Read the lyrics into a single lower-cased string: every line is
# right-stripped and followed by exactly one space.
with open("noneshallpass.txt", "r") as lyrics_file:
    song = ''.join(raw_line.rstrip().lower() + ' ' for raw_line in lyrics_file)

# Stop-word set: NLTK's English stop words plus a few extra tokens.
extra_stop_words = ["n't", "'s", "'m", "``", "'", '"', '.', ","]
stop_words = set(stopwords.words('english'))
stop_words.update(extra_stop_words)

# Tokenize, then keep only alphabetic tokens that are not stop words.
song_words = [
    token
    for token in word_tokenize(song)
    if token.isalpha() and token not in stop_words
]

song_words

['flash',
 'buttery',
 'gold',
 'jittery',
 'zeitgeist',
 'wither',
 'watering',
 'hole',
 'patrol',
 'heart',
 'huckabee',
 'art',
 'fuckery',
 'suddenly',
 'enough',
 'young',
 'lung',
 'water',
 'wings',
 'colorfully',
 'vulgar',
 'poacher',
 'mulch',
 'like',
 'pull',
 'pulse',
 'soldier',
 'bolt',
 'fine',
 'sign',
 'time',
 'elapse',
 'primate',
 'climb',
 'spine',
 'attach',
 'eye',
 'eye',
 'bog',
 'life',
 'swamps',
 'vines',
 'get',
 'rise',
 'frogs',
 'flies',
 'dogfight',
 'prize',
 'sort',
 'costs',
 'life',
 'mouths',
 'water',
 'fork',
 'knife',
 'allure',
 'right',
 'score',
 'beach',
 'cash',
 'cow',
 'actually',
 'beef',
 'blood',
 'turns',
 'wine',
 'leak',
 'police',
 'like',
 'riot',
 'feast',
 'let',
 'eat',
 'remember',
 'name',
 'face',
 'day',
 'judged',
 'funhouse',
 'cast',
 'rejoice',
 'fall',
 'grace',
 'cane',
 'sky',
 'like',
 'none',
 'shall',
 'pass',
 'none',
 'shall',
 'pass',
 'none',
 'shall',
 'pass',
 'never',
 'day',
 'snow',
 'cone',
 'could',
 

## Bag of Words Analysis

In [47]:
# Bag-of-words scoring: for each sentiment, count how many song words occur
# in that sentiment's lexicon (a repeated word counts once per occurrence).
scores = {
    sentiment: sum(
        1 for word in song_words if word in sentiment_lexicon_map[sentiment]
    )
    for sentiment in sentiments
}

# Normalise each count by the number of song words. An empty word list would
# otherwise raise ZeroDivisionError, so it yields 0.0 for every sentiment.
total_words = len(song_words)
scaled_scores = {
    key: (val / total_words if total_words else 0.0)
    for key, val in scores.items()
}

scores

{'Positiv': 12,
 'Negativ': 15,
 'Strong': 17,
 'Power': 4,
 'Weak': 5,
 'Submit': 3,
 'Active': 14,
 'Passive': 9,
 'Pleasur': 3,
 'Pain': 1,
 'Feel': 0,
 'Arousal': 0,
 'EMOT': 1,
 'Virtue': 6,
 'Vice': 4,
 'Ovrst': 4,
 'Undrst': 2}