In [67]:
from random import randint

## Function to get a list of random values between a interval

def randomValues (begin, end, length, allowedToRepeat = False):
    if end < begin: 
        return []
    
    values = []
    if allowedToRepeat:
        while (len(values) < length):
            values.append(randint(begin,end))
    else:
        if (end-begin+1) >= length:
            while (len(values) < length):
                i = randint(begin, end)
                while i in values:
                    i = randint(begin,end)
                values.append(i)
    
    return values
            

In [68]:
from pathlib import Path

### READING FILES
p = Path('./2 - English')

allMusicLyrics = []

allMusicFiles = list(p.glob('**/*.txt'))

print('Total of music files: ',len(allMusicFiles))

Total of music files:  25693


In [69]:
from random import randint
from operator import itemgetter

## Getting a sample with 1000 musics (randomly)
randomIndexes = randomValues(0,len(allMusicFiles)-1,1000)

allMusicFiles = itemgetter(*randomIndexes)(allMusicFiles)

print('Amount of music files: ', len(allMusicFiles))

for file in allMusicFiles:
    with file.open() as f:
        allMusicLyrics.append(f.read().replace('\n', ' '))
        f.close()
        
print('Amount of lyrics: ', len(allMusicLyrics))

Amount of music files:  1000
Amount of lyrics:  1000


In [70]:
## Reading dictionary file with stop words - All of them came from the feature_names bag of words result
dictionaryFile = open('dictionary.txt','r')
dictionaryContent = dictionaryFile.read()
dictionaryFile.close()
dictionaryList = dictionaryContent.replace('\'','').split(',')

print('Amount of stopwords: ',len(dictionaryList))

Amount of stopwords:  1342


In [71]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *

### REMOVING STOPWORDS AND DOING STEMMING

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(stop_words=dictionaryList)
vectorizer.fit_transform(allMusicLyrics)

# Getting feature_names
featureNames = vectorizer.get_feature_names()
print("Amount of featurenames before stemming: ", len(featureNames))

# Applying Porter Stemming
stemmer = PorterStemmer() 
stemmedWords = list(set(map(stemmer.stem, featureNames)))
print("Amount of featurenames after stemming: ", len(stemmedWords))

print("Reduced words: ", len(featureNames)-len(stemmedWords))

Amount of featurenames before stemming:  14868
Amount of featurenames after stemming:  11644
Reduced words:  3224


In [72]:
### DOING BAG OF WORDS PROCESS

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(vocabulary=stemmedWords)
vectorizer.fit_transform(allMusicLyrics)

bagOfWordsResult = vectorizer.fit_transform(allMusicLyrics)

print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)
print(bagOfWordsResult.toarray())

['ureodo', 'hola', 'kookoo', 'normal', 'tiam', 'empir', 'gaseou', 'yakkiti', 'keyboard', 'digniti', 'stagnant', 'silkblack', 'militari', 'plu', 'stil', 'haymak', 'kein', 'skybolt', 'epic', 'warikir', 'bulleo', 'critter', 'quitter', 'ingana', 'conserv', 'jeoreon', 'rodent', 'brilhando', 'blous', 'scent', 'channel', 'cad', 'conciou', 'cal', 'monkii', 'patrol', 'crackin', 'denni', 'mianha', 'alarma', 'demis', 'hanaman', 'hontou', 'enabl', 'podré', 'maldad', 'boot', 'anway', 'advantag', 'tableau', 'antoin', 'raven', 'custom', 'benefit', 'dunn', 'problem', 'ficar', 'discret', 'dominican', 'gemini', 'size', 'extrem', 'veteran', 'vs', 'llame', 'montagu', 'amnesti', 'lorenzo', 'buzzin', 'ondoga', 'bobbi', 'cird', 'glock', 'backlund', 'ijeneun', 'drip', 'itchey', 'messiah', 'hitin', 'closer', 'termin', 'soljikhi', 'pajama', 'mango', 'abraham', 'cabbi', 'mud', 'kkeuteomneun', 'jigglin', 'juxtapoz', 'mile', 'sulphur', 'reaper', 'x7', 'atsumet', 'shore', 'trampo', 'mourner', 'unfortun', 'moder', '

In [73]:
from sklearn.feature_extraction.text import TfidfTransformer

### TF TF-IDF
transformer = TfidfTransformer()

tfidf = transformer.fit_transform(bagOfWordsResult)
normalizedResult = tfidf.toarray()

print(transformer.idf_) ### Feature weitghts
print(tfidf.toarray()) ### Normalized values

[7.2156076  7.2156076  7.2156076  ... 7.2156076  6.11699531 7.2156076 ]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


<h4> Automatizar o processo de agrupamento para diferentes valores de centroids. </h4>
<ol>
    <li> Criar uma array contendo os diferentes valores de centroides. </li> <br/>
    <li> Para cada quantidade de centroide: <br/> <br/>
        <ol>
            <li> Fazer o agrupamento </li> <br/>
            <li> Realizar a predição para todos os registros </li> <br/>
            <li> Contar quantos registros de cada grupo existem </li> <br/>
            <li> Sortear 10 músicas de cada grupo gerado </li> <br/>
            <li> Salvar em um arquivo: <br/>
                <ul>
                    <li> Quantidade de registros de cada grupo </li> <br/>
                    <li> Nome dos registros sorteados de cada grupo </li> <br/>
                </ul>
            </li>
        </ol>
     </li>
</ol>

In [74]:
from sklearn.cluster import KMeans

centroid = [2,4,8,16]

### GROUPING (KMEANS)
kmeans = KMeans(n_clusters = 4).fit(normalizedResult)
results = kmeans.predict(normalizedResult)

print('Centroides: ', kmeans.cluster_centers_)
print('Classificações: ', results)

Centroides:  [[0.         0.         0.0004865  ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.00206678 0.        ]
 [0.         0.         0.         ... 0.00016022 0.00047427 0.        ]
 [0.00017665 0.00056609 0.         ... 0.         0.0009491  0.00062813]]
Classificações:  [1 3 2 3 1 3 3 3 3 2 3 3 1 2 3 0 0 2 0 0 2 1 2 2 3 3 1 3 2 1 0 3 3 3 1 0 3
 2 3 1 2 2 1 3 2 2 2 3 2 1 1 3 3 1 2 3 3 2 2 2 0 2 3 2 3 3 2 2 3 3 3 3 1 3
 3 3 2 2 0 3 1 3 0 3 2 3 0 1 0 0 3 3 3 3 3 0 3 0 3 2 2 1 3 2 3 2 1 0 2 3 1
 1 0 1 2 2 3 1 3 3 3 2 2 1 0 2 2 1 2 2 2 1 0 1 1 3 2 1 2 2 3 2 3 2 3 3 2 3
 2 2 1 3 2 3 1 1 2 1 2 2 3 1 3 0 3 2 3 3 2 3 1 3 1 2 1 3 2 3 2 3 2 2 3 3 2
 1 1 2 0 3 2 1 3 2 0 1 2 1 2 0 3 2 1 3 1 2 2 3 0 0 1 0 2 2 3 3 1 1 0 2 3 2
 1 3 2 0 2 1 2 0 2 1 3 3 2 2 1 1 3 0 2 3 3 3 1 3 3 1 3 3 3 1 3 1 2 3 3 2 3
 2 3 3 3 1 2 1 2 1 2 1 1 2 2 2 1 1 1 2 2 1 0 3 3 1 2 0 3 2 2 2 2 3 3 2 2 2
 1 2 3 2 2 2 2 2 1 0 1 1 3 3 1 1 2 2 3 1 1 0 1 0 3 3 2 1 2 3 2 3 1 2 1 0 3
 1

In [76]:
## Writing music classification - file name in the txt file
f = open('./kmeans_result_4_centroids.txt', 'w')

st = [''] * 1000

for i in range(0, 1000):
    st[i] = str(results[i])+' - '+str(allMusicFiles[i])+'\n'

st.sort()
    
for s in st:
    f.write(s)
    
f.close()