In [342]:
from random import randint

## Function to get a list of random values between a interval

def randomValues (begin, end, length, allowedToRepeat = False):
    if end < begin: 
        return []
    
    values = []
    if allowedToRepeat:
        while (len(values) < length):
            values.append(randint(begin,end))
    else:
        if (end-begin+1) >= length:
            while (len(values) < length):
                i = randint(begin, end)
                while i in values:
                    i = randint(begin,end)
                values.append(i)
    
    return values
            

In [343]:
from pathlib import Path

### READING FILES
p = Path('./2 - English')

allMusicLyrics = []

allMusicFiles = list(p.glob('**/*.txt'))

print('Total of music files: ',len(allMusicFiles))

Total of music files:  25693


In [344]:
from random import randint
from operator import itemgetter

## Getting a sample with 1000 musics (randomly)
randomIndexes = randomValues(0,len(allMusicFiles)-1,1000)

allMusicFiles = itemgetter(*randomIndexes)(allMusicFiles)

print('Amount of music files: ', len(allMusicFiles))

for file in allMusicFiles:
    with file.open() as f:
        allMusicLyrics.append(f.read().replace('\n', ' '))
        f.close()
        
print('Amount of lyrics: ', len(allMusicLyrics))

Amount of music files:  1000
Amount of lyrics:  1000


In [345]:
## Reading dictionary file with stop words - All of them came from the feature_names bag of words result
dictionaryFile = open('dictionary.txt','r')
dictionaryContent = dictionaryFile.read()
dictionaryFile.close()
dictionaryList = dictionaryContent.replace('\'','').split(',')

print('Amount of stopwords: ',len(dictionaryList))

Amount of stopwords:  1342


In [346]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *

### REMOVING STOPWORDS AND DOING STEMMING

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(stop_words=dictionaryList)
vectorizer.fit_transform(allMusicLyrics)

# Getting feature_names
featureNames = vectorizer.get_feature_names()
print("Amount of featurenames before stemming: ", len(featureNames))

# Applying Porter Stemming
stemmer = PorterStemmer() 
stemmedWords = list(set(map(stemmer.stem, featureNames)))
print("Amount of featurenames after stemming: ", len(stemmedWords))

print("Reduced words: ", len(featureNames)-len(stemmedWords))

Amount of featurenames before stemming:  14282
Amount of featurenames after stemming:  11184
Reduced words:  3098


In [347]:
### DOING BAG OF WORDS PROCESS

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(vocabulary=stemmedWords)
vectorizer.fit_transform(allMusicLyrics)

bagOfWordsResult = vectorizer.fit_transform(allMusicLyrics)

print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)
print(bagOfWordsResult.toarray())

['kako', 'foamin', 'taco', 'wayn', 'celib', 'daedabeun', 'dorian', 'shin', 'charit', 'abacu', 'oar', 'tough', 'alexand', 'wu', 'kie', 'antonym', 'piano', 'damnit', 'affatu', 'benefit', 'ddo', 'wowo', 'lisa', 'oreno', 'sbcglobal', 'sashita', 'warnin', 'superpow', 'buddha', 'loss', 'smack', 'anbona', 'moss', 'fairli', 'raptur', 'nothinh', 'steadili', 'loot', 'gab', 'xo', 'checker', 'yurameiteita', 'juda', 'natt', 'wakaremo', 'jeen', 'cue', 'ouhgt', 'cafe', 'cujo', 'nightli', 'circumcis', 'leash', 'defibul', 'cuff', 'bare', 'propos', 'melon', 'filter', 'test', 'shell', 'while', 'sydney', 'dummi', 'intellectu', 'payin', 'nanja', 'illegit', 'shitfac', 'diplomat', 'hajiman', 'kawaranai', 'knicker', 'entrench', 'sometim', 'winter', 'comprehend', 'hypnotis', 'hajikidashit', 'internet', 'pet', 'uncut', 'atarashiku', 'burst', 'crueler', 'yard', 'subett', 'ker', 'bozo', 'pour', 'ought', 'ei', 'tryna', 'jamaican', 'emphasi', 'freeli', 'away', 'infiltr', 'daiji', 'stretch', 'dammit', 'lame', 'usa',

In [348]:
from sklearn.feature_extraction.text import TfidfTransformer

### TF TF-IDF
transformer = TfidfTransformer()

tfidf = transformer.fit_transform(bagOfWordsResult)
normalizedResult = tfidf.toarray()

print(transformer.idf_) ### Feature weitghts
print(tfidf.toarray()) ### Normalized values

[7.2156076  7.2156076  7.2156076  ... 7.90875478 7.2156076  7.90875478]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


<h4> Automatizar o processo de agrupamento para diferentes valores de centroids. </h4>
<ol>
    <li> Criar uma array contendo os diferentes valores de centroides. </li> <br/>
    <li> Para cada quantidade de centroide: <br/> <br/>
        <ol>
            <li> Fazer o agrupamento </li> <br/>
            <li> Realizar a predição para todos os registros </li> <br/>
            <li> Contar quantos registros de cada grupo existem </li> <br/>
            <li> Sortear 10 músicas de cada grupo gerado </li> <br/>
            <li> Salvar em um arquivo: <br/>
                <ul>
                    <li> Quantidade de registros de cada grupo </li> <br/>
                    <li> Nome dos registros sorteados de cada grupo </li> <br/>
                </ul>
            </li>
        </ol>
     </li>
</ol>

In [349]:
from sklearn.cluster import KMeans

centroid = [2,4,8,16]

### GROUPING (KMEANS)
kmeans = KMeans(n_clusters = 2).fit(normalizedResult)
results = kmeans.predict(normalizedResult)

#Fazendo agrupamento com todos os números de centroides
# for c in centroid:
#     kmeans = KMeans(n_clusters = c).fit(normalizedResult)
#     results = kmeans.predict(normalizedResult)
#     print('Centroides: ', kmeans.cluster_centers_)
#     print('Classificações: ', results)


print('Centroides: ', kmeans.cluster_centers_)
print('Classificações: ', results)

In [350]:
## Writing music classification - file name in the txt file
f = open('./kmeans_result_2_centroids.txt', 'w')

st = [''] * 1000

for i in range(0, 1000):
    st[i] = str(results[i])+' - '+str(allMusicFiles[i])+'\n'

st.sort()
    
for s in st:
    f.write(s)
    
f.close()

# Contando quantos registros existem em cada grupo
# contents = ""
# count0 = 0
# count1 = 0
# f = open('./kmeans_result_2_centroids.txt', 'r')
# for line in f.readlines():
#     contents += line
#     if '0 - 2 - ' in line:
#         count0 = count0 + 1
#     if '1 - 2 - ' in line:
#         count1 = count1 + 1
# f.close()
# print("Quantidade de músicas com centroide 0 = ",count0)
# print("Quantidade de músicas com centroide 1 = ",count1)