# Setup

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

from pandas import read_excel

# Directory containing the lexicon and lyric files that the cells below read.
data_path = './sentiment_analysis_data'

## Import data

### Hu & Liu Lexicon
This cell loads a simple two-category lexicon: about 6,800 words, each labelled either positive or negative, from a study by Minqing Hu and Bing Liu.

In [13]:
# Load the Hu & Liu lexicon into one set of words per sentiment label.
# After this cell, `sentiment_lexicon_map[label]` is the set of words read
# from that label's text file (one word per line).
sentiments = ['Positive', 'Negative']
sentiment_path_map = {
    sentiments[0]: data_path + '/hu_positive_lexicon.txt',
    sentiments[1]: data_path + '/hu_negative_lexicon.txt',
}
sentiment_lexicon_map = {s: set() for s in sentiments}

for sentiment in sentiments:
    path = sentiment_path_map[sentiment]
    lexicon = sentiment_lexicon_map[sentiment]
    with open(path) as file:
        for line in file:
            # Strip both ends so stray leading whitespace cannot produce a
            # word that never matches a token.
            word = line.strip()
            # Skip blank lines so the empty string never enters the lexicon.
            # Lines starting with ';' are skipped as well: the upstream
            # Hu & Liu files presumably carry ';'-prefixed header comments
            # (TODO confirm for this copy); no real word starts with ';'.
            if not word or word.startswith(';'):
                continue
            lexicon.add(word)

### Harvard Lexicon
This lexicon is a good bit spicier: we get nearly 12,000 words, each tagged with any of a number of more complex sentiment categories. The mega-long word list comes courtesy of Harvard (2000); RIP Philip Stone.

In [4]:
# Build one word set per Harvard sentiment category.
# NOTE: this cell rebinds `sentiments` and `sentiment_lexicon_map`, which the
# Hu & Liu cell also assigns; later cells use whichever lexicon cell ran last.
sentiments = [
    'Positiv', 'Negativ', 'Strong', 'Power', 'Weak', 'Submit',
    'Active', 'Passive', 'Pleasur', 'Pain', 'Feel', 'Arousal',
    'EMOT', 'Virtue', 'Vice', 'Ovrst', 'Undrst',
]
sentiment_lexicon_map = {s: set() for s in sentiments}

lexicon_df = read_excel(data_path + '/harvard_lexicon.xls')

# Lower-cased text of every 'Entry' cell, in row order.
entries = [str(entry).lower() for entry in lexicon_df['Entry']]

for category in sentiments:
    tagged_words = sentiment_lexicon_map[category]
    # A row belongs to a category when the cell in that category's column
    # equals the column name itself. Entries that are not purely alphabetic
    # are left out of every category.
    for entry, tag in zip(entries, lexicon_df[category]):
        if tag == category and entry.isalpha():
            tagged_words.add(entry)

# Uncomment to inspect the populated map:
# sentiment_lexicon_map

### Import and clean song lyrics

In [15]:
# Read the lyrics: each line is right-stripped, lower-cased and followed by a
# single space, then everything is concatenated into one string.
path = data_path + "/who_shot_ya.txt"
# path = data_path + "/juicy.txt"
with open(path, "r") as f:
    song = ''.join(line.rstrip().lower() + ' ' for line in f)

# Remove stop words and non-alphabetical tokens
extra_stop_words = ["n't", "'s", "'m", "``", "'", '"', '.', ","]
stop_words = set(stopwords.words('english')).union(extra_stop_words)
tokens = word_tokenize(song)
song_words = [t for t in tokens if t.isalpha() and t not in stop_words]

song_words

['proceed',
 'give',
 'need',
 'uh',
 'motherfuckers',
 'get',
 'live',
 'motherfuckers',
 'proceed',
 'give',
 'need',
 'motherfuckers',
 'get',
 'live',
 'motherfuckers',
 'turn',
 'mics',
 'proceed',
 'give',
 'need',
 'turn',
 'mic',
 'yeah',
 'beat',
 'knocking',
 'need',
 'mic',
 'though',
 'turn',
 'shit',
 'fuck',
 'east',
 'coast',
 'motherfuckers',
 'uh',
 'bad',
 'boy',
 'motherfuckers',
 'turn',
 'louder',
 'yeah',
 'uh',
 'proceed',
 'give',
 'need',
 'motherfuckers',
 'motherfuckers',
 'uh',
 'motherfuckers',
 'uh',
 'verse',
 'notorious',
 'shot',
 'ya',
 'separate',
 'weak',
 'obsolete',
 'hard',
 'creep',
 'brooklyn',
 'streets',
 'nigga',
 'fuck',
 'bickerin',
 'beef',
 'hear',
 'sweat',
 'tricklin',
 'cheek',
 'heartbeat',
 'sound',
 'like',
 'sasquatch',
 'feet',
 'thunderin',
 'shakin',
 'concrete',
 'shit',
 'stop',
 'foil',
 'plot',
 'neighbors',
 'call',
 'cops',
 'said',
 'heard',
 'mad',
 'shots',
 'saw',
 'drop',
 'three',
 'quarter',
 'slaughter',
 'electric

## Bag of Words Analysis

In [16]:
# Bag-of-words scoring: for each sentiment, count how many song tokens
# (repeats included) appear in that sentiment's lexicon.
scores = {s: 0 for s in sentiments}

for sentiment in sentiments:
    # Look the lexicon up once per sentiment rather than once per token.
    lexicon = sentiment_lexicon_map[sentiment]
    for word in song_words:
        if word in lexicon:
            scores[sentiment] += 1

# Share of song tokens matched per sentiment. The guard avoids a
# ZeroDivisionError when no tokens survived cleaning; all shares are then 0.0.
word_count = len(song_words)
scaled_scores = {
    key: (val / word_count if word_count else 0.0)
    for key, val in scores.items()
}

scores

{'Positive': 7, 'Negative': 27}