In [1]:
# GetInTheMood 1.0
# Project by Ishan Khurjekar and Radhika Saraf

In [2]:
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import*
from bs4 import BeautifulSoup
from nltk.tag import CRFTagger
from sklearn.metrics import classification_report

In [3]:
# To eliminate stop words 

stopWords = set(stopwords.words('english'))

In [4]:
## Extracting data from .htm webpages 

def lyrics(input_files):

    song_lyrics = []
    
    for fileno in input_files:
        with open(fileno) as html_doc:
            soup = BeautifulSoup(html_doc, 'html.parser')
        texts = []
    
        # Look  for the 'p' element - this will change depending on the format of the website 
        for elem in soup.find_all("p"):
        
            # This label is particular to the chosen website
            if elem.attrs == {u'class': [u'verse']}:
                lyrics = elem.text.lower()
                tokens = word_tokenize(lyrics)
                for word in tokens:
                    texts.append(word)
        song_lyrics.append(texts)
    
    return song_lyrics

In [5]:
def dictionary(train_lyrics):
    
## Parts of speech tagging 
    tagged = []
    
    for doc in train_lyrics:
        tagged.append(nltk.pos_tag(doc,tagset='universal'))
        
## Tag the training data with the two labels : 'happy' and 'sad'
# This is done based on the fact that the author perceives the chosen songs as either 'happy' or 'sad'

    docs = []

    for doc in range(4,8):
        texts = []
        for (word,label) in tagged[doc]:
            if word not in stopWords:
                if label != '.' and label != 'PRT' and label != 'X':
                    texts.append((word,'happy'))
        docs.append(texts)

    for doc in range(4):
        texts = []
        for (word,label) in tagged[doc]:
            if word not in stopWords:
                if label != '.' and label != 'PRT' and label != 'X':
                    texts.append((word,'sad'))
        docs.append(texts)
        
## Create a mood dictionary from the most frequently occuring words in the happy/sad songs

    fdist = FreqDist(docs[0])
    for doc in docs:
        fdist_temp = FreqDist(doc)
        fdist |= fdist_temp
    mood = fdist.most_common(80)

    happy = []
    sad = []

    for ((word,label),count) in mood:
        if word not in stopWords:
            if label == 'happy':
                happy.append(word)
            if label == 'sad':
                sad.append(word)
                
# Eliminating labeling bias

    if len(happy) > len(sad):
        happy = happy[0:len(sad)]
    if len(sad) > len(happy):
        sad = sad[0:len(happy)]
        
    return happy,sad

In [6]:
def brute_force_label(docs,happy,sad):
    
    labelled_docs = []
    for doc in docs:
        texts = []
        for word in doc:
            if word in happy:
                label = 'happy'
                couplet = (word,label)
            elif word in sad:
                label = 'sad'
                couplet = (word,label)
            else:
                label = 'I'
                couplet = (word,label)
            texts.append(couplet)
        labelled_docs.append(texts)
        
    return labelled_docs

In [7]:
def evaluation(docs,test_truths):
    tagger_out = []

    for doc in docs:
    
# Using the CRF for labelling
        tagger_out.append(ct.tag(doc))

## Creating the inputs to run the classification report
    
    y_pred = []
    for doc in test_truths:
        for (word,label) in doc:
            y_pred.append(label)
    y_test = []
    for doc in tagger_out:
        for (word,label) in doc:
            y_test.append(label)
            
# Create a mapping of labels to indices
    labels = {"happy" : 0, "sad" : 1, "I" : 2}

# Convert the sequences of tags into a 1-dimensional array
    predictions = np.array([labels[tag] for tag in y_pred])
    truths = np.array([labels[tag] for tag in y_test])

# Print out the classification report
    print(classification_report(
        truths, predictions,
        target_names=["happy","sad",'I']))

    return tagger_out

In [8]:
def mood_calculator(song_lyrics):

    happy_count = 0
    sad_count = 0
    which_list = []
    
    for doc in song_lyrics:
        for (word,label) in doc:
            if label == 'happy':
                happy_count = happy_count + 1
            elif label == 'sad':
                sad_count = sad_count + 1
        mood_ratio = sad_count/happy_count
        
#Predicting if the song should go to a happy list or a sad list 
    
        if mood_ratio < 1 :
            which_list.append('happy')
        else:
            which_list.append('sad')
            
    return which_list

In [9]:
# Webpages stored as HTML only : from metrolyrics.com
train_files = ['myself.htm','summer.htm','cry.htm','stop.htm','celebrate.htm','happy.htm','world.htm','doremi.htm']

train_lyrics = lyrics(train_files)

[happy,sad] = dictionary(train_lyrics)

## Using the dictionary as reference label each word in the song again as either happy or sad

training_docs = brute_force_label(train_lyrics,happy,sad)

## Train the CRF using this data

ct = CRFTagger()
train_data = training_docs
ct.train(train_data,'model.crf.tagger')
ct.set_model_file('model.crf.tagger')

## Extract data for test files

test_files = ['rolling.htm']

test_lyrics = lyrics(test_files)
test_truth = brute_force_label(test_lyrics,happy,sad)

tagged_lyrics = evaluation(test_lyrics,test_truth)

             precision    recall  f1-score   support

      happy       1.00      0.50      0.67         4
        sad       0.96      1.00      0.98        22
          I       1.00      1.00      1.00       634

avg / total       1.00      1.00      0.99       660



In [10]:
# Songs classified as happy or sad
print(zip(test_files,mood_calculator(tagged_lyrics)))

[('rolling.htm', 'sad')]


In [11]:
# Confirm the accuracy by using the CRF tagger to relabel the training documents

test_files = ['myself.htm','summer.htm','cry.htm','stop.htm','celebrate.htm','happy.htm','world.htm','doremi.htm']

test_lyrics = lyrics(test_files)
test_truth = brute_force_label(test_lyrics,happy,sad)

tagged_lyrics = evaluation(test_lyrics,test_truth)

print(zip(test_files,mood_calculator(tagged_lyrics)))

             precision    recall  f1-score   support

      happy       1.00      0.99      1.00       437
        sad       0.92      0.99      0.96       182
          I       1.00      0.99      1.00      2511

avg / total       0.99      0.99      0.99      3130

[('myself.htm', 'sad'), ('summer.htm', 'sad'), ('cry.htm', 'sad'), ('stop.htm', 'sad'), ('celebrate.htm', 'happy'), ('happy.htm', 'happy'), ('world.htm', 'happy'), ('doremi.htm', 'happy')]
