# Song Lyric Topic Modelling and Summarization
Source: https://towardsdatascience.com/nlp-for-topic-modeling-summarization-of-legal-documents-8c89393b1534

In [None]:
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd
import re


from os import listdir, path
from PIL import Image
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from wordcloud import WordCloud,STOPWORDS

%pylab
%matplotlib inline

## Training Parameters

In [None]:
# Directories holding the lyric text files, one directory per chart year.
# Every file found in a directory is read by getLyricData().
paths = ['data/lyrics/2002',
         'data/lyrics/2003',
         'data/lyrics/2004',
         'data/lyrics/2005',
         #'data/lyrics/2006',
         'data/lyrics/2007',
         'data/lyrics/2008',
         'data/lyrics/2009',
         'data/lyrics/2010',
         'data/lyrics/2011',
         'data/lyrics/2012',
         'data/lyrics/2013',
         'data/lyrics/2014',
         'data/lyrics/2015',
         'data/lyrics/2016',
         'data/lyrics/2017']

# Characters stripped from every lyric line before it is split into words.
stop_chars = ['(', ')', '{', '}', '.', '\'', ',', '-']

# Words (compared in lower case) dropped from every lyric line.
# NOTE: the extra candidates used to be wrapped in a triple-quoted string,
# which Python silently concatenated with 'yeah' into a single bogus entry,
# so 'yeah' was never filtered. They are kept here as a real comment instead.
stop_words = ['la', 'ah', 'da', 'ye', 'oh', 'ba', 'na', 'ooh', 'aah', 'dont', 'yeah',
              # disabled: 'im', 'gonna', 'cause', 'know'
              ]

# LDA settings: number of topics to fit, words printed per topic, and
# example lyric lines printed per topic.
number_of_topics = 2
number_of_top_words = 2
number_of_top_sentences = 2

# Word cloud rendering settings passed to displayWordCloud().
word_cloud_max_words = 100
word_cloud_mask_filename = 'word_cloud_mask.png'
word_cloud_background_color = 'black'

## Method Definitions

In [None]:
def getLyricData(path, stop_chars, stop_words, debug=False):
    """Load, clean and de-duplicate the lyric lines stored in a directory.

    Every regular file directly inside ``path`` is read as UTF-8 and split
    into lines. For each line, commas and hyphens are followed by a space
    (so joined words separate), all ``stop_chars`` are removed, and every
    word whose lower-cased form is in ``stop_words`` is dropped.

    Args:
        path: Directory containing the lyric text files.
        stop_chars: Iterable of single characters to strip from each line.
        stop_words: Iterable of words to remove (matched against the
            lower-cased word).
        debug: When true, print the file list, the raw lines and the
            cleaned lines.

    Returns:
        A list of unique, non-empty cleaned lines, in first-seen order
        (files are processed in sorted name order). An empty directory
        yields an empty list.
    """
    # The `path` parameter shadows the module-level `os.path` import, so the
    # helpers needed here are imported locally under their own names.
    from os.path import isfile, join

    # Sorted so the result does not depend on the OS directory order.
    files = sorted(listdir(path))
    if debug:
        print(files, '\n\n')

    lyrics = []
    for file in files:
        full_name = join(path, file)
        if not isfile(full_name):
            continue  # skip sub-directories instead of failing in open()
        with open(full_name, encoding='utf-8') as f:
            lyrics.extend(f.read().splitlines())

    if debug:
        print(lyrics, '\n\n')

    # An empty character class ("[]") is not a valid regex, so only build
    # the pattern when there is something to strip.
    joined_chars = ''.join(stop_chars)
    strip_re = re.compile('[' + re.escape(joined_chars) + ']') if joined_chars else None
    blocked_words = set(stop_words)

    cleaned_lines = []
    for line in lyrics:
        spaced = line.replace(',', ', ').replace('-', '- ')
        if strip_re is not None:
            spaced = strip_re.sub('', spaced)
        kept = [w for w in spaced.split() if w.lower() not in blocked_words]
        cleaned_lines.append(' '.join(kept))

    if debug:
        print(cleaned_lines, '\n\n')

    # dict.fromkeys removes duplicates while keeping a reproducible order
    # (a plain set() would shuffle the lines from run to run).
    return list(dict.fromkeys(c for c in cleaned_lines if c))

In [None]:
def runTopicModelling_LDA(inputData, number_of_topics, random_state=None):
    """Fit an LDA topic model on a collection of text documents.

    The documents are turned into a unigram bag-of-words matrix (English
    stop words removed) and a LatentDirichletAllocation model with
    ``number_of_topics`` components is fitted on it.

    Args:
        inputData: Iterable of text documents (here: cleaned lyric lines).
        number_of_topics: Number of LDA components to fit.
        random_state: Optional seed forwarded to LatentDirichletAllocation
            so that runs can be reproduced. Defaults to ``None``.

    Returns:
        A tuple ``(lda_dtf, feature_names, sorting)``:
        ``lda_dtf`` is the result of ``lda.fit_transform`` on the
        document-term matrix, ``feature_names`` is an array of the
        vectorizer's vocabulary terms, and ``sorting`` holds, per topic,
        the feature indices ordered by descending component weight.
    """
    vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
    dtm = vect.fit_transform(inputData)

    lda = LatentDirichletAllocation(n_components=number_of_topics,
                                    random_state=random_state)
    lda_dtf = lda.fit_transform(dtm)

    # argsort is ascending; reverse each row to get the heaviest terms first.
    sorting = np.argsort(lda.components_)[:, ::-1]

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old name on older installations.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    feature_names = np.array(get_names())

    return lda_dtf, feature_names, sorting

In [None]:
def getTopicTopSentences(data, predictions, predictionIdx, sentencesCnt):
    """Return the documents that score highest for one topic.

    Args:
        data: Sequence of text documents, indexed like the rows of
            ``predictions``.
        predictions: 2-D array indexed as ``[document, topic]``.
        predictionIdx: Column (topic) of ``predictions`` to rank by.
        sentencesCnt: Maximum number of documents to return.

    Returns:
        A list of at most ``sentencesCnt`` strings, ordered from the
        highest-scoring document down; each string is cut to the text
        before its second period.
    """
    # Ascending argsort reversed -> document indices, best score first.
    ranked_docs = np.argsort(predictions[:, predictionIdx])[::-1]
    best_docs = ranked_docs[:sentencesCnt]
    return [".".join(data[doc].split(".")[:2]) for doc in best_docs]

In [None]:
def displayWordCloud(inputData, maskFileName, maxWords, backgroundColor):
    """Render a masked word cloud of the given texts with matplotlib.

    Args:
        inputData: Iterable of strings; they are joined with spaces into
            the text the cloud is generated from.
        maskFileName: Path of the image file used as the cloud's mask.
        maxWords: Value passed to WordCloud as ``max_words``.
        backgroundColor: Value passed to WordCloud as ``background_color``.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    text = ' '.join(inputData)

    # Convert to an array inside the context manager so the image file is
    # closed as soon as its pixels have been read.
    with Image.open(maskFileName) as mask_image:
        mask = np.array(mask_image)

    wc = WordCloud(background_color=backgroundColor, max_words=maxWords,
                   mask=mask, stopwords=set(STOPWORDS))
    wc.generate(text)

    # A single figure only: the previous extra plt.figure() call opened a
    # second, empty figure on every invocation.
    plt.figure(figsize=(16, 13))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()

## Execute Model

In [None]:
# Run the whole pipeline once per lyric directory: load and clean the lines,
# fit the LDA model, print the top words and example lines per topic, then
# draw a word cloud of the cleaned lines.
# The loop variable is deliberately not called `path`, which would overwrite
# the module-level `from os import path` import.
for lyrics_dir in paths:
    print('\n\nPATH:', lyrics_dir)
    print('==================================================')
    lyricData = getLyricData(lyrics_dir, stop_chars, stop_words)

    # The vectorizer cannot build a vocabulary from an empty corpus, so skip
    # directories that yield no usable lines instead of aborting the run.
    if not lyricData:
        print('No lyric lines found; skipping.')
        continue

    predictions, feature_names, sorting = runTopicModelling_LDA(lyricData, number_of_topics)

    print('')
    mglearn.tools.print_topics(topics=range(number_of_topics), feature_names=feature_names,
                               sorting=sorting, topics_per_chunk=number_of_topics,
                               n_words=number_of_top_words)

    for i in range(number_of_topics):
        topSentences = getTopicTopSentences(lyricData, predictions, i, number_of_top_sentences)
        print('Topic', i)
        for s in topSentences:
            print('    ', s)
        print('')

    displayWordCloud(lyricData, word_cloud_mask_filename, word_cloud_max_words, word_cloud_background_color)