In [104]:
from pathlib import Path

### READING FILES
p = Path('./2 - English')

allMusicLyrics = []

allMusicFiles = list(p.glob('**/*.txt'))

print('Total of music files: ',len(allMusicFiles))

Total of music files:  25693


In [105]:
from random import randint
from operator import itemgetter

## Getting a sample with 1000 musics (randomly)
randomIndexes = []
maxRandomIndex = (len(allMusicFiles)-1)

while len(randomIndexes) < 1000:
    i = randint(0, maxRandomIndex)
    while i in randomIndexes:
        i = randint(0,maxRandomIndex)
    randomIndexes.append(i)

allMusicFiles = itemgetter(*randomIndexes)(allMusicFiles)

print('Amount of music files: ', len(allMusicFiles))

for file in allMusicFiles:
    with file.open() as f:
        allMusicLyrics.append(f.read().replace('\n', ' '))
        f.close()
        
print('Amount of lyrics: ', len(allMusicLyrics))

Amount of music files:  1000
Amount of lyrics:  1000


In [106]:
## Reading dictionary file with stop words - All of them came from the feature_names bag of words result
dictionaryFile = open('dictionary.txt','r')
dictionaryContent = dictionaryFile.read()
dictionaryFile.close()
dictionaryList = dictionaryContent.replace('\'','').split(',')

print('Amount of stopwords: ',len(dictionaryList))

Amount of stopwords:  1305


In [112]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *

### REMOVING STOPWORDS AND DOING STEMMING

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(stop_words=dictionaryList)
vectorizer.fit_transform(allMusicLyrics)

# Getting feature_names
featureNames = vectorizer.get_feature_names()
print("Amount of featurenames before stemming: ", len(featureNames))

# Applying Porter Stemming
stemmer = PorterStemmer() 
stemmedWords = list(set(map(stemmer.stem, featureNames)))
print("Amount of featurenames after stemming: ", len(stemmedWords))

print("Reduced words: ", len(featureNames)-len(stemmedWords))

Amount of featurenames before stemming:  14995
Amount of featurenames after stemming:  11778
Reduced words:  3217


In [108]:
### DOING BAG OF WORDS PROCESS

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(vocabulary=stemmedWords)
vectorizer.fit_transform(allMusicLyrics)

bagOfWordsResult = vectorizer.fit_transform(allMusicLyrics)

print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)
print(bagOfWordsResult.toarray())

['locust', 'medit', 'knuckl', 'stepper', 'axestrik', 'loveli', 'spasmod', 'tsuyogatt', 'winnin', 'tie', 'hajimemash', 'empala', 'ador', 'lui', 'oke', 'dust', 'raibu', 'social', 'worst', 'chrou', 'gown', 'realiti', 'twinki', 'python', 'spiller', 'ie', 'tsuiteta', 'temperatur', 'drone', 'matou', 'warrant', 'rael', 'america', 'niggaz', 'owareteru', 'furikaetta', 'mirand', 'cube', 'mind', 'restor', 'tide', 'sugar', 'keikaku', 'muy', 'pacif', 'youngin', 'rattl', 'doogi', 'parchment', 'met', 'nurs', 'resist', 'dickin', 'umely', 'arigat', 'tabenu', 'caca', 'hitomi', 'independ', 'buyin', 'gari', 'furifuri', 'prodigi', 'narrow', 'bog', 'her', 'oooo', 'dwe', 'buildin', 'jeok', 'bakuhatsu', 'nada', 'malibu', 'musiqu', 'shit', 'charon', 'toke', 'your', 'furugo', 'dont', 'wasteland', 'sympath', 'bimbo', 'devast', 'happili', 'refug', 'rica', 'unpunish', 'muevo', 'vai', 'fuck', 'someday', 'peanut', 'worsen', 'quicker', 'afterworld', 'ked', 'loudest', 'meantim', 'stipe', 'secret', 'motorway', 'sentime

In [109]:
from sklearn.feature_extraction.text import TfidfTransformer

### TF TF-IDF
transformer = TfidfTransformer()

tfidf = transformer.fit_transform(bagOfWordsResult)
normalizedResult = tfidf.toarray()

print(transformer.idf_) ### Feature weitghts
print(tfidf.toarray()) ### Normalized values

[7.2156076  7.90875478 7.90875478 ... 7.2156076  7.90875478 7.90875478]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [110]:
from sklearn.cluster import KMeans

### GROUPING (KMEANS)
kmeans = KMeans(n_clusters = 4).fit(normalizedResult)
results = kmeans.predict(normalizedResult)

print('Centroides: ', kmeans.cluster_centers_)
print('Classificações: ', results)

Centroides:  [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.00025427 0.         0.         ... 0.00048225 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
Classificações:  [2 2 3 0 2 3 1 0 0 3 0 3 1 2 2 0 3 0 2 0 3 3 2 3 3 1 0 0 2 1 2 3 3 3 3 2 0
 0 2 0 2 0 2 2 0 0 2 3 2 1 3 2 3 0 0 3 2 2 0 0 0 3 2 3 2 0 0 0 2 2 2 0 2 3
 2 3 2 2 2 3 2 3 2 3 0 2 0 0 2 0 3 2 3 0 3 0 2 3 0 3 2 2 2 3 3 2 0 2 0 3 2
 0 2 2 2 1 2 0 0 3 2 2 3 0 0 0 2 3 3 3 0 0 2 0 2 1 2 2 2 0 0 3 3 0 1 2 0 2
 3 3 2 2 3 3 2 2 2 0 2 0 3 3 2 2 3 3 1 2 2 1 1 3 3 0 1 3 0 2 0 3 3 2 2 2 0
 2 2 3 2 3 3 0 3 0 2 2 3 3 2 2 2 2 2 0 0 3 3 2 3 0 2 3 2 3 0 0 3 2 2 2 0 3
 0 2 0 0 2 2 2 2 0 3 2 2 2 0 3 1 2 2 3 2 2 0 3 3 2 2 2 0 2 2 2 3 3 3 1 2 3
 2 3 0 3 2 3 2 2 2 0 0 3 2 2 3 3 2 3 0 2 2 0 2 3 0 2 3 0 3 3 3 0 3 2 2 0 2
 3 2 2 2 2 2 2 2 3 2 2 0 3 2 2 0 2 3 3 2 2 0 0 1 0 3 2 2 0 3 2 3 2 3 3 0 2
 0

In [111]:
## Writing music classification - file name in the txt file
f = open('./kmeans_result.txt', 'w')

st = [''] * 1000

for i in range(0, 1000):
    st[i] = str(results[i])+' - '+str(allMusicFiles[i])+'\n'

st.sort()
    
for s in st:
    f.write(s)
    
f.close()