<h4> Function to get a list of random values between an interval </h4>

In [34]:
from random import randint

def randomValues (begin, end, length, allowedToRepeat = False):
    if end < begin: 
        return []
    
    values = []
    if allowedToRepeat:
        while (len(values) < length):
            values.append(randint(begin,end))
    else:
        if (end-begin+1) >= length:
            while (len(values) < length):
                i = randint(begin, end)
                while i in values:
                    i = randint(begin,end)
                values.append(i)
    
    return values

<h4> Functions to stem and remove stop words. </h4>

In [35]:
from nltk.stem.porter import PorterStemmer

def stemSentence(sentence):
    
    if not sentence: 
        return sentence
    
    stemmer = PorterStemmer(mode=PorterStemmer.ORIGINAL_ALGORITHM)
    stemmedWords = list(map(stemmer.stem, sentence.split(' ')))
    stemmedSentence = stemmedWords[0]
    
    for i in range(1, len(stemmedWords)):
        stemmedSentence += ' ' + stemmedWords[i]
        
    return stemmedSentence

def removeStopwordsFromSentence(sentence, stopwords = []):
    
    if not sentence or not stopwords:
        return sentence
    
    sentence = sentence.lower()
    
    newString = ''
    for c in sentence:
        newString += c if c.isalnum() else ' '
    sentence = newString
    
    words = sentence.split(' ')
    clearSentence = ''
    
    for i in range(0, len(words)):
        
        word = words[i]
        if clearSentence and clearSentence[-1] != ' ':
            clearSentence += ' '
        clearSentence += word if word not in stopwords else ''
        
    return clearSentence

<h4> Opening general log file </h4>

In [36]:
generalLogFile = open('general_log_file.txt', 'w')
generalLogFile.write('GENERAL LOG FILE\n\n')

18

<h4> Reading music lyrics files </h4>

In [37]:
from pathlib import Path
import collections, numpy, operator
from operator import itemgetter

p = Path('./')
allMusicFiles = list(p.glob('**/*.txt'))
print('Total of music files: ', len(allMusicFiles))

generalLogFile.write('Total of music files: '+str(len(allMusicFiles))+'\n\n')

p = Path('./2 - English')
allMusicLyrics = []
allMusicFiles = list(p.glob('**/*.txt'))

print('Total of music files in English: ',len(allMusicFiles))

generalLogFile.write('Total of music files in English: '+str(len(allMusicFiles))+'\n\n')

sampleSize = 5000
# sampleSize = -1

generalLogFile.write('Sample size: '+str(sampleSize)+'\n\n')

if sampleSize != -1:
    randomIndexes = randomValues(0,len(allMusicFiles)-1,sampleSize)
    allMusicFiles = itemgetter(*randomIndexes)(allMusicFiles)
    sampleSize = len(allMusicFiles)

print('Amount of music files: ', len(allMusicFiles))

for file in allMusicFiles:
    with file.open() as f:
        allMusicLyrics.append(f.read().replace('\n', ' '))
        f.close()
        
print('Amount of lyrics: ', len(allMusicLyrics))

Total of music files:  52683
Total of music files in English:  25693
Amount of music files:  5000
Amount of lyrics:  5000


<h4> Reading stop words dictionary file. </h4> 

In [38]:
dictionaryFile = open('NewDictionary.txt','r')
dictionaryContent = dictionaryFile.read()
dictionaryFile.close()
dictionaryList = []
for word in dictionaryContent.split(','):
    if '"' in word:
        dictionaryList.append(word.replace('"',''))
    else:
        dictionaryList.append(word.replace('\'',''))

generalLogFile.write('Amount of dictionary words: '+str(len(dictionaryList))+'\n\n')

print('Amount of dictionary words: ', len(dictionaryList))
print(dictionaryList)

Amount of dictionary words:  2696
['a', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'an', 'and', 'another', 'any', 'anybody', 'anyone', 'anything', 'anywhere', 'are', 'area', 'areas', "aren't", 'around', 'as', 'ask', 'asked', 'asking', 'asks', 'at', 'away', 'b', 'back', 'backed', 'backing', 'backs', 'be', 'became', 'because', 'become', 'becomes', 'been', 'before', 'began', 'behind', 'being', 'beings', 'below', 'best', 'better', 'between', 'big', 'both', 'but', 'by', 'c', 'came', 'can', 'cannot', "can't", 'case', 'cases', 'certain', 'certainly', 'clear', 'clearly', 'come', 'could', "couldn't", 'd', 'did', "didn't", 'differ', 'different', 'differently', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downed', 'downing', 'downs', 'during', 'e', 'each', 'early', 'either', 'end', 'ended', 'ending', 'ends', 'enough', 'even', 'evenly', 'ever', 'every', 'everybody', 'everyone', '

<h4> Removing stop words and stemming music files. </h4>

In [39]:
totalOfWords = 0
totalOfRemovedWords = 0
clearedMusics = []

for music in allMusicLyrics:
    
    # Getting and acumulating amount of words in original music file
    
    amountOfWords = len(music.split(' '))
    totalOfWords += amountOfWords
    
    # Removing stop words and stemming music lyrics
    
    clearMusic = removeStopwordsFromSentence(music, dictionaryList)
    stemmedMusic = stemSentence(clearMusic)
    
    # Getting and acumulating amount of reduced words 
    
    newAmountOfWords = len(stemmedMusic.split(' '))
    reducedWords = amountOfWords - newAmountOfWords
    totalOfRemovedWords += reducedWords
    
    # Storing the new cleared music file 
    
    clearedMusics.append(stemmedMusic)
    
generalLogFile.write('Amount of words in original music files: '+str(totalOfWords)+'\n\n')
generalLogFile.write('Amount of removed words: '+str(totalOfRemovedWords)+'\n\n')

print('Amount of words in original music files: ', totalOfWords)
print('Amount of removed words: ', totalOfRemovedWords)

Amount of words in original music files:  1305004
Amount of removed words:  801727


<h4> Bag of Words </h4> 

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bagOfWordsResult = vectorizer.fit_transform(clearedMusics)

print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)

['aa', 'aah', 'aahh', 'aahhaahh', 'aai', 'aakitekushaa', 'aaliyah', 'aanashi', 'aand', 'aankhiya', 'aankhiyaan', 'aankhiyo', 'aaron', 'aasu', 'aaw', 'aay', 'aayliah', 'ab', 'abaddon', 'abadi', 'abadikan', 'abaku', 'abandon', 'abaredasu', 'abat', 'abbastanza', 'abbei', 'abbracciar', 'abbras', 'abc', 'abdomen', 'abdomin', 'abduct', 'abed', 'abel', 'abellida', 'abertura', 'abhor', 'abhorr', 'abid', 'abierta', 'abigail', 'abil', 'abit', 'abitta', 'abl', 'ablaz', 'ablov', 'abnorm', 'abo', 'aboard', 'abod', 'abolish', 'abomin', 'abord', 'abort', 'abound', 'abra', 'abraham', 'abras', 'abraçado', 'abraçar', 'abridg', 'abro', 'abroad', 'abrumadora', 'abruptli', 'absalom', 'abscenc', 'abscess', 'absenc', 'absent', 'absinth', 'absolut', 'absorb', 'abstain', 'abstin', 'abstract', 'abstrair', 'absu', 'absurd', 'abund', 'abus', 'abusin', 'abut', 'abysm', 'abyss', 'ac', 'aca', 'acab', 'acaba', 'academi', 'academia', 'acapella', 'acapulco', 'acceler', 'accent', 'accept', 'accesori', 'access', 'accesso

<h4> TF-IDF </h4>

In [41]:
from sklearn.feature_extraction.text import *

transformer = TfidfTransformer()

tfidf = transformer.fit_transform(bagOfWordsResult)
normalizedResult = tfidf.toarray()

print(transformer.idf_) ### Feature weitghts
print(tfidf.toarray()) ### Normalized values

[6.95244381 7.57148302 8.82424599 ... 8.82424599 8.82424599 8.82424599]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
def keyWordsOfEachGroup(musics, groups, amountOfGroups, amountOfKeyWords = 30):
    
    musicsOfEachGroup = []
    keyWrodsOfEachGroup = []
    for i in range(0, amountOfGroups):
        musicsOfEachGroup.append('')
        
    for i in range(0, len(groups)):
        group = groups[i]
        musicsOfEachGroup[group] += ' ' + musics[i]
        
    for moeg in musicsOfEachGroup:
        
        allWords = sorted(moeg.split(' '))
        filtered = filter(lambda a: a != '', allWords)
        counter = collections.Counter(filtered)
        counter = sorted(counter.items(), key=operator.itemgetter(1))
        amountOfWords = amountOfKeyWords if len(allWords) >= amountOfKeyWords else len(allWords)
        mostFrequentWords = []
        lessFrequentWords = []
        j = len(counter)-1
        for i in range(0, amountOfWords):
            if i >= len(counter):
                break
            mostFrequentWords.append(counter[j])
            lessFrequentWords.append(counter[i])
            j-=1
        keyWrodsOfEachGroup.append([mostFrequentWords, lessFrequentWords])
        
    return keyWrodsOfEachGroup

<h4> Agrupamento KMeans </h4> 

In [None]:
from sklearn.cluster import KMeans
import collections, numpy, operator
from operator import itemgetter
    
centroids = [2, 4, 8, 16] #[2, 4, 8, 16]

for c in centroids:
    
    kmeans = KMeans(n_clusters = c).fit(normalizedResult)
    results = kmeans.labels_
    
    print(results)

#     OPENING LOG FILE

    groupResultFile = open('kmeans_result_'+str(c)+'_centroids.txt', 'w')
    groupResultFile.write('KMeans result with '+str(c)+' centroids.\n\n')

#     DECLARING HELP VARIABLES

    amountOfMusicsOfEachGroup = collections.Counter(results)

    lastEnd = -1
    for k, v in sorted(amountOfMusicsOfEachGroup.items()):

        # SAVING AMOUNT OF REGISTER OF EACH GROUP IN LOG FILE

        groupResultFile.write('Grupo '+str(k+1)+': '+str(v)+' registros.\n')
        start = -1
        end = -1
        if k > 0:
            start = lastEnd
            end = lastEnd + v - 1
            lastEnd = lastEnd + v
        else:
            start = 0 
            end = v-1 
            lastEnd = v
            
    groupResultFile.write('\nKEY WORDS \n')
            
    amountOfKeyWords = 30
    groupsKeyWords = keyWordsOfEachGroup(clearedMusics, results, c, amountOfKeyWords)
    
    for i in range(0,len(groupsKeyWords)):
        
        keyWords = groupsKeyWords[i]
    
        groupResultFile.write('\n  Group '+str(i)+' \n')
    
        groupResultFile.write('\n    '+str(amountOfKeyWords)+' most frequent words: ')
        
        print('      ')

        for w, f in keyWords[0]:
            groupResultFile.write(str(w)+' ('+str(f)+'); ')

        groupResultFile.write('\n\n    '+str(amountOfKeyWords)+' less frequent words: ')
        
        print('      ')

        for w, f in keyWords[1]:
            groupResultFile.write(str(w)+' ('+str(f)+'); ')
            
        groupResultFile.write('\n')
        
#     STORING THE NAME OF EACH FILE IN AN ARRAY TO BE MANIPULATED LATER

#     The name pattern for each music file is: g - entire_document_name 
#     where g = the group of the file and 
#     entire_document_name = is the intire path plus the name of music file

    musicNames = [''] * sampleSize
    for i in range(0, sampleSize):
        musicNames[i] = str(results[i])+' - '+str(allMusicFiles[i])+'\n\n'
    musicNames.sort()
    
    analizedMusicNamesOfEachGroup = []
    start = 0
    end = 0
    for g in range(0, c):

        # CHOOSING AND STORING n RANDOM MUSICS OF EACH GROUP TO BE ANALIZED MANUALLY
        amountOfAnalizedMusicsOfEachGroup = 5
        
        start = end
        end = start + amountOfMusicsOfEachGroup[g]

        randomIndexes = []
        if amountOfMusicsOfEachGroup[g] <= amountOfAnalizedMusicsOfEachGroup:
            randomIndexes = range(start, end)
        else:
            randomIndexes = randomValues(start, end-1, amountOfAnalizedMusicsOfEachGroup)
            
        print(randomIndexes)

        analizedMusicNamesOfEachGroup.append(itemgetter(*randomIndexes)(musicNames))

#     SAVING THE MOST AND THE LESS FREQUENT WORDS OF EACH GROUP

    groupResultFile.write('\nFiles to be analized for each group\n')
    for i in range(0, c):
        groupResultFile.write('\nGroup '+str(i+1)+': \n\n')
        for musicName in analizedMusicNamesOfEachGroup[i]:
            groupResultFile.write(musicName+'\n')

    groupResultFile.close()

[0 1 0 ... 0 1 1]
      
      
      
      
[733, 466, 1159, 160, 183]
[2837, 3204, 4797, 2279, 1719]
[1 1 1 ... 1 1 3]
      
      
      
      
      
      
      
      
[240, 262, 139, 210, 284]
[1438, 1449, 1729, 988, 1344]
[1837, 1980, 1939, 2109, 1965]
[2493, 3654, 2857, 3688, 4802]
[3 4 3 ... 3 4 2]
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
[33, 38, 48, 23, 52]
[240, 211, 61, 229, 301]
[514, 2351, 2591, 480, 1214]
[3063, 2763, 3555, 2934, 3501]
[4093, 4279, 3801, 4231, 3771]
[4497, 4509, 4431, 4469, 4414]
[4692, 4712, 4654, 4695, 4664]
[4967, 4866, 4924, 4767, 4909]


<h4> Closing general log file. </h4>

In [None]:
generalLogFile.close()