In [1]:
from random import randint

## Function to get a list of random values between a interval

def randomValues (begin, end, length, allowedToRepeat = False):
    if end < begin: 
        return []
    
    values = []
    if allowedToRepeat:
        while (len(values) < length):
            values.append(randint(begin,end))
    else:
        if (end-begin+1) >= length:
            while (len(values) < length):
                i = randint(begin, end)
                while i in values:
                    i = randint(begin,end)
                values.append(i)
    
    return values
            

In [2]:
from pathlib import Path

### READING FILES
p = Path('./2 - English')

allMusicLyrics = []

allMusicFiles = list(p.glob('**/*.txt'))

print('Total of music files: ',len(allMusicFiles))

Total of music files:  25693


In [3]:
from random import randint
from operator import itemgetter

## Getting a sample with 1000 musics (randomly)
randomIndexes = randomValues(0,len(allMusicFiles)-1,1000)

allMusicFiles = itemgetter(*randomIndexes)(allMusicFiles)

print('Amount of music files: ', len(allMusicFiles))

for file in allMusicFiles:
    with file.open() as f:
        allMusicLyrics.append(f.read().replace('\n', ' '))
        f.close()
        
print('Amount of lyrics: ', len(allMusicLyrics))

Amount of music files:  1000
Amount of lyrics:  1000


In [4]:
## Reading dictionary file with stop words - All of them came from the feature_names bag of words result
dictionaryFile = open('dictionary.txt','r')
dictionaryContent = dictionaryFile.read()
dictionaryFile.close()
dictionaryList = dictionaryContent.replace('\'','').split(',')

print('Amount of stopwords: ',len(dictionaryList))

Amount of stopwords:  1342


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *

### REMOVING STOPWORDS AND DOING STEMMING

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(stop_words=dictionaryList)
vectorizer.fit_transform(allMusicLyrics)

# Getting feature_names
featureNames = vectorizer.get_feature_names()
print("Amount of featurenames before stemming: ", len(featureNames))

# Applying Porter Stemming
stemmer = PorterStemmer() 
stemmedWords = list(set(map(stemmer.stem, featureNames)))
print("Amount of featurenames after stemming: ", len(stemmedWords))

print("Reduced words: ", len(featureNames)-len(stemmedWords))

Amount of featurenames before stemming:  13970
Amount of featurenames after stemming:  10930
Reduced words:  3040


In [6]:
### DOING BAG OF WORDS PROCESS

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(vocabulary=stemmedWords)
vectorizer.fit_transform(allMusicLyrics)

bagOfWordsResult = vectorizer.fit_transform(allMusicLyrics)

print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)
print(bagOfWordsResult.toarray())

['looosse', 'shoppin', 'hicki', 'encod', 'moter', 'tricio', 'nowwhen', 'lyre', 'term', 'singl', 'excel', 'healthier', 'afar', 'rubbin', 'guarda', 'mend', 'tower', 'independ', 'oohh', 'defend', 'dish', 'enkindl', 'jan', 'crimin', 'exot', 'gravi', 'fate', 'stink', 'sea', 'id', 'camera', 'concernin', 'prowl', 'desmond', 'meomchwosseo', 'dillo', 'della', 'miron', 'custom', 'mount', 'ruger', 'hood', 'reciev', 'championship', 'nectar', 'salvag', 'black', 'mollatdeon', 'muneul', 'ladda', 'eui', 'ouça', 'namgin', 'discoveri', 'backstrok', 'pitch', 'saviour', 'sake', 'feen', 'nneun', 'plesir', 'sixteen', 'petey', 'tameiki', 'herb', 'plenti', 'drought', 'rebound', 'konichiwa', 'strap', 'stylin', 'emphas', 'sweeti', 'bucket', 'even', 'instantli', 'guy', 'mmmmm', 'dekiru', 'bedroom', 'difficult', 'everyboy', 'marit', 'ana', 'jam', 'pratum', 'tinggalkan', 'wooo', 'noth', 'chopper', 'balloon', 'patrol', 'unfair', 'mention', 'tatakattekita', 'whi', 'ugh', 'badli', 'planta', 'speakerbox', 'fai', 'web'

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

### TF TF-IDF
transformer = TfidfTransformer()

tfidf = transformer.fit_transform(bagOfWordsResult)
normalizedResult = tfidf.toarray()

print(transformer.idf_) ### Feature weitghts
print(tfidf.toarray()) ### Normalized values

[7.90875478 6.81014249 7.90875478 ... 6.81014249 7.2156076  7.2156076 ]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


<h4> Automatizar o processo de agrupamento para diferentes valores de centroids. </h4>
<ol>
    <li> Criar uma array contendo os diferentes valores de centroides. </li> <br/>
    <li> Para cada quantidade de centroide: <br/> <br/>
        <ol>
            <li> Fazer o agrupamento </li> <br/>
            <li> Realizar a predição para todos os registros </li> <br/>
            <li> Contar quantos registros de cada grupo existem </li> <br/>
            <li> Sortear 10 músicas de cada grupo gerado </li> <br/>
            <li> Salvar em um arquivo: <br/>
                <ul>
                    <li> Quantidade de registros de cada grupo </li> <br/>
                    <li> Nome dos registros sorteados de cada grupo </li> <br/>
                </ul>
            </li>
        </ol>
     </li>
</ol>

In [17]:
from sklearn.cluster import KMeans
import collections, numpy

centroid = [2,4,8,16]

### GROUPING (KMEANS)
kmeans = KMeans(n_clusters = 2).fit(normalizedResult)
results = kmeans.predict(normalizedResult)

#Fazendo agrupamento com todos os números de centroides
# for c in centroid:
#     kmeans = KMeans(n_clusters = c).fit(normalizedResult)
#     results = kmeans.predict(normalizedResult)
#     print('Centroides: ', kmeans.cluster_centers_)
#     print('Classificações: ', results)


print('Centroides: ', kmeans.cluster_centers_)
print('Classificações: ', results)

for k, v in collections.Counter(results).items():
    print(k,' - ', v)

Centroides:  [[0.00000000e+00 6.38891370e-05 0.00000000e+00 ... 6.94939801e-04
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.39224176e-04 0.00000000e+00 ... 0.00000000e+00
  1.45523948e-04 1.83306939e-04]]
Classificações:  [0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 0 0 0 1 1
 0 1 0 1 0 0 1 1 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 1 1
 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 1 0 1 0
 1 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0
 1 1 0 1 0 0 1 0 1 1 0 0 0 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 1 1 1 1 0 0 1 0 0
 1 0 1 1 0 0 0 1 0 1 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1
 0 1 1 0 1 0 1 1 0 0 1 0 0 0 0 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 0 0 0 0 0
 0 0 0 1 0 1 0 0 0 0 1 1 1 0 1 1 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1
 0 1 1 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 0 0 1 1
 0 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0
 0 1 1 1 0 0 0 0 1 1 

In [9]:
## Writing music classification - file name in the txt file
f = open('./kmeans_result_2_centroids.txt', 'w')

st = [''] * 1000

for i in range(0, 1000):
    st[i] = str(results[i])+' - '+str(allMusicFiles[i])+'\n'

st.sort()
    
for s in st:
    f.write(s)
    
f.close()

# Contando quantos registros existem em cada grupo
# contents = ""
# count0 = 0
# count1 = 0
# f = open('./kmeans_result_2_centroids.txt', 'r')
# for line in f.readlines():
#     contents += line
#     if '0 - 2 - ' in line:
#         count0 = count0 + 1
#     if '1 - 2 - ' in line:
#         count1 = count1 + 1
# f.close()
# print("Quantidade de músicas com centroide 0 = ",count0)
# print("Quantidade de músicas com centroide 1 = ",count1)