<h4> Function to get a list of random values between an interval </h4>

In [18]:
from random import randint

def randomValues (begin, end, length, allowedToRepeat = False):
    if end < begin: 
        return []
    
    values = []
    if allowedToRepeat:
        while (len(values) < length):
            values.append(randint(begin,end))
    else:
        if (end-begin+1) >= length:
            while (len(values) < length):
                i = randint(begin, end)
                while i in values:
                    i = randint(begin,end)
                values.append(i)
    
    return values

<h4> Functions to stem and remove stop words. </h4>

In [19]:
from nltk.stem.porter import PorterStemmer

def stemSentence(sentence):
    
    if not sentence: 
        return sentence
    
    stemmer = PorterStemmer(mode=PorterStemmer.ORIGINAL_ALGORITHM)
    stemmedWords = list(map(stemmer.stem, sentence.split(' ')))
    stemmedSentence = stemmedWords[0]
    
    for i in range(1, len(stemmedWords)):
        stemmedSentence += ' ' + stemmedWords[i]
        
    return stemmedSentence

def removeStopwordsFromSentence(sentence, stopwords = []):
    
    if not sentence or not stopwords:
        return sentence
    
    sentence = sentence.lower()
    
    newString = ''
    for c in sentence:
        newString += c if c.isalnum() else ' '
    sentence = newString
    
    words = sentence.split(' ')
    clearSentence = ''
    
    for i in range(0, len(words)):
        
        word = words[i]
        if clearSentence and clearSentence[-1] != ' ':
            clearSentence += ' '
        clearSentence += word if word not in stopwords else ''
        
    return clearSentence

<h4> Opening general log file </h4>

In [20]:
generalLogFile = open('general_log_file.txt', 'w')
generalLogFile.write('GENERAL LOG FILE\n\n')

18

<h4> Reading music lyrics files </h4>

In [21]:
from pathlib import Path

p = Path('./')
allMusicFiles = list(p.glob('**/*.txt'))
print('Total of music files: ', len(allMusicFiles))

generalLogFile.write('Total of music files: '+str(len(allMusicFiles))+'\n\n')

p = Path('./2 - English')
allMusicLyrics = []
allMusicFiles = list(p.glob('**/*.txt'))

print('Total of music files in English: ',len(allMusicFiles))

generalLogFile.write('Total of music files in English: '+str(len(allMusicFiles))+'\n\n')

sampleSize = 1000

generalLogFile.write('Sample size: '+str(sampleSize)+'\n\n')

if sampleSize != -1:
    randomIndexes = randomValues(0,len(allMusicFiles)-1,sampleSize)
    allMusicFiles = itemgetter(*randomIndexes)(allMusicFiles)
    sampleSize = len(allMusicFiles)

print('Amount of music files: ', len(allMusicFiles))

for file in allMusicFiles:
    with file.open() as f:
        allMusicLyrics.append(f.read().replace('\n', ' '))
        f.close()
        
print('Amount of lyrics: ', len(allMusicLyrics))

Total of music files:  52671
Total of music files in English:  25693
Amount of music files:  1000
Amount of lyrics:  1000


<h4> Reading stop words dictionary file. </h4> 

In [22]:
dictionaryFile = open('dictionary.txt','r')
dictionaryContent = dictionaryFile.read()
dictionaryFile.close()
dictionaryList = dictionaryContent.replace('\'','').split(',')

generalLogFile.write('Amount of dictionary words: '+str(len(dictionaryList))+'\n\n')

print('Amount of dictionary words: ', len(dictionaryList))

Amount of dictionary words:  1434


<h4> Removing stop words and stemming music files. </h4>

In [23]:
totalOfWords = 0
totalOfRemovedWords = 0
clearedMusics = []

for music in allMusicLyrics:
    
    # Getting and acumulating amount of words in original music file
    
    amountOfWords = len(music.split(' '))
    totalOfWords += amountOfWords
    
    # Removing stop words and stemming music lyrics
    
    clearMusic = removeStopwordsFromSentence(music, dictionaryList)
    stemmedMusic = stemSentence(clearMusic)
    
    # Getting and acumulating amount of reduced words 
    
    newAmountOfWords = len(stemmedMusic.split(' '))
    reducedWords = amountOfWords - newAmountOfWords
    totalOfRemovedWords += reducedWords
    
    # Storing the new cleared music file 
    
    clearedMusics.append(stemmedMusic)
    
generalLogFile.write('Amount of words in original music files: '+str(totalOfWords)+'\n\n')
generalLogFile.write('Amount of removed words: '+str(totalOfRemovedWords)+'\n\n')

print('Amount of words in original music files: ', totalOfWords)
print('Amount of removed words: ', totalOfRemovedWords)

Amount of words in original music files:  265454
Amount of removed words:  108579


<h4> Bag of Words </h4> 

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bagOfWordsResult = vectorizer.fit_transform(clearedMusics)

print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)

['ab', 'abaku', 'abandon', 'abdomen', 'abdomin', 'abduct', 'abductor', 'aberu', 'abhor', 'abil', 'abl', 'ablaz', 'abolit', 'abort', 'abound', 'about', 'abov', 'abraham', 'abras', 'abruptli', 'absenc', 'absent', 'absolut', 'absorb', 'abstin', 'abstract', 'absurd', 'abunai', 'abund', 'abus', 'abyss', 'ac', 'acaba', 'acceler', 'accept', 'access', 'acci', 'accident', 'acclaim', 'accomplish', 'accord', 'account', 'accur', 'accus', 'accustom', 'ach', 'achiev', 'achil', 'achimbuteo', 'achimul', 'achimy', 'achin', 'acid', 'ackinickul', 'acknowledg', 'acorn', 'acoust', 'acquir', 'across', 'act', 'actin', 'action', 'activ', 'actor', 'actual', 'ad', 'ada', 'adam', 'add', 'addict', 'addit', 'addl', 'address', 'adjourn', 'adjust', 'admir', 'admit', 'ado', 'adolesc', 'ador', 'adorn', 'adrenalin', 'adri', 'adult', 'advanc', 'advantag', 'adventur', 'advers', 'advic', 'advil', 'advisori', 'advoc', 'aeba', 'aeon', 'aereul', 'aerob', 'aeroplan', 'aetag', 'afair', 'afer', 'affect', 'affianc', 'affili', 'a

<h4> TF-IDF </h4>

In [25]:
from sklearn.feature_extraction.text import *

transformer = TfidfTransformer()

tfidf = transformer.fit_transform(bagOfWordsResult)
normalizedResult = tfidf.toarray()

print(transformer.idf_) ### Feature weitghts
print(tfidf.toarray()) ### Normalized values

[7.2156076  6.81014249 5.42384813 ... 7.2156076  7.2156076  7.2156076 ]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [26]:
def keyWordsOfEachGroup(musics, groups, amountOfGroups, amountOfKeyWords = 30):
    
    musicsOfEachGroup = []
    keyWrodsOfEachGroup = []
    for i in range(0, amountOfGroups):
        musicsOfEachGroup.append('')
        
    for i in range(0, len(groups)):
        group = groups[i]
        musicsOfEachGroup[group] += ' ' + musics[i]
        
    for moeg in musicsOfEachGroup:
        
        allWords = sorted(moeg.split(' '))
        filtered = filter(lambda a: a != '', allWords)
        counter = collections.Counter(filtered)
        counter = sorted(counter.items(), key=operator.itemgetter(1))
        amountOfWords = amountOfKeyWords if len(allWords) >= amountOfKeyWords else len(allWords)
        mostFrequentWords = []
        lessFrequentWords = []
        j = len(counter)-1
        for i in range(0, amountOfWords):
            mostFrequentWords.append(counter[j])
            lessFrequentWords.append(counter[i])
            j-=1
        keyWrodsOfEachGroup.append([mostFrequentWords, lessFrequentWords])
        
    return keyWrodsOfEachGroup

<h4> Agrupamento KMeans </h4> 

In [37]:
from sklearn.cluster import KMeans
import collections, numpy, operator
from operator import itemgetter
    
centroids = [2, 4, 8, 16] #[2, 4, 8, 16]

for c in centroids:
    kmeans = KMeans(n_clusters = c).fit(normalizedResult)
    results = kmeans.labels_
    
    print(results)

    # OPENING LOG FILE

    groupResultFile = open('kmeans_result_'+str(c)+'_centroids.txt', 'w')
    groupResultFile.write('KMeans result with '+str(c)+' centroids.\n\n')

    # DECLARING HELP VARIABLES

    amountOfMusicsOfEachGroup = collections.Counter(results)

    lastEnd = -1
    for k, v in sorted(amountOfMusicsOfEachGroup.items()):

        # SAVING AMOUNT OF REGISTER OF EACH GROUP IN LOG FILE

        groupResultFile.write('Grupo '+str(k+1)+': '+str(v)+' registros.\n')
        start = -1
        end = -1
        if k > 0:
            start = lastEnd
            end = lastEnd + v - 1
            lastEnd = lastEnd + v
        else:
            start = 0 
            end = v-1 
            lastEnd = v
            
    groupResultFile.write('\nKEY WORDS \n')
            
    amountOfKeyWords = 5
    groupsKeyWords = keyWordsOfEachGroup(clearedMusics, results, c, amountOfKeyWords)
    
    for i in range(0,len(groupsKeyWords)):
        
        keyWords = groupsKeyWords[i]
    
        groupResultFile.write('\n  Group '+str(i)+' \n')
    
        groupResultFile.write('\n    '+str(amountOfKeyWords)+' most frequent words: ')
        
        print('      ')

        for w, f in keyWords[0]:
            groupResultFile.write(str(w)+' ('+str(f)+'); ')

        groupResultFile.write('\n\n    '+str(amountOfKeyWords)+' less frequent words: ')
        
        print('      ')

        for w, f in keyWords[1]:
            groupResultFile.write(str(w)+' ('+str(f)+'); ')
            
        groupResultFile.write('\n')
        
    # STORING THE NAME OF EACH FILE IN AN ARRAY TO BE MANIPULATED LATER

    # The name pattern for each music file is: g - entire_document_name 
    # where g = the group of the file and 
    # entire_document_name = is the intire path plus the name of music file

    musicNames = [''] * sampleSize
    for i in range(0, sampleSize):
        musicNames[i] = str(results[i])+' - '+str(allMusicFiles[i])+'\n\n'
    musicNames.sort()
    
    analizedMusicNamesOfEachGroup = []
    start = 0
    end = 0
    for g in range(0, c):

        # CHOOSING AND STORING n RANDOM MUSICS OF EACH GROUP TO BE ANALIZED MANUALLY
        amountOfAnalizedMusicsOfEachGroup = 5
        
        start = end
        end = start + amountOfMusicsOfEachGroup[g]

        randomIndexes = []
        if amountOfMusicsOfEachGroup[g] <= amountOfAnalizedMusicsOfEachGroup:
            randomIndexes = range(start, end)
        else:
            randomIndexes = randomValues(start, end-1, amountOfAnalizedMusicsOfEachGroup)
            
        print(randomIndexes)

        analizedMusicNamesOfEachGroup.append(itemgetter(*randomIndexes)(musicNames))

    # SAVING THE MOST AND THE LESS FREQUENT WORDS OF EACH GROUP

    groupResultFile.write('\nFiles to be analized for each group\n')
    for i in range(0, c):
        groupResultFile.write('\nGroup '+str(i+1)+': \n\n')
        for musicName in analizedMusicNamesOfEachGroup[i]:
            groupResultFile.write(musicName+'\n')

    groupResultFile.close()

[0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0
 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0
 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 1 1 0 1
 0 0 1 1 1 0 0 0 1 1 1 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 0 1 0
 1 0 1 0 0 0 0 1 1 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 1 1 0 0 0 1 1 0 0 1 0
 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0
 1 0 1 1 1 1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 1 0 1 1 1 1 0 1 1 0 0 0 1 0 0 1 0
 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 0 1
 0 1 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 1 1 0
 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 0
 1 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1
 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 1 0 0
 0 0 1 0 1 0 0 0 0 0 0 1 

<h4> Closing general log file. </h4>

In [None]:
generalLogFile.close()