In [35]:
from pathlib import Path

### READING FILES
p = Path('./2 - English')

allMusicLyrics = []

allMusicFiles = list(p.glob('**/*.txt'))

print('Total of music files: ',len(allMusicFiles))

Total of music files:  25693


In [36]:
from random import randint
from operator import itemgetter

## Getting a sample with 1000 musics (randomly)
randomIndexes = []
maxRandomIndex = (len(allMusicFiles)-1)

while len(randomIndexes) < 1000:
    i = randint(0, maxRandomIndex)
    while i in randomIndexes:
        i = randint(0,maxRandomIndex)
    randomIndexes.append(i)

allMusicFiles = itemgetter(*randomIndexes)(allMusicFiles)

print('Amount of music files: ', len(allMusicFiles))

for file in allMusicFiles:
    with file.open() as f:
        allMusicLyrics.append(f.read().replace('\n', ' '))
        f.close()
        
print('Amount of lyrics: ', len(allMusicLyrics))

Amount of music files:  1000
Amount of lyrics:  1000


In [37]:
## Reading dictionary file with stop words - All of them came from the feature_names bag of words result
dictionaryFile = open('dictionary.txt','r')
dictionaryContent = dictionaryFile.read()
dictionaryFile.close()
dictionaryList = dictionaryContent.replace('\'','').split(',')

print('Amount of stopwords: ',len(dictionaryList))

Amount of stopwords:  1342


In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *

### REMOVING STOPWORDS AND DOING STEMMING

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(stop_words=dictionaryList)
vectorizer.fit_transform(allMusicLyrics)

# Getting feature_names
featureNames = vectorizer.get_feature_names()
print("Amount of featurenames before stemming: ", len(featureNames))

# Applying Porter Stemming
stemmer = PorterStemmer() 
stemmedWords = list(set(map(stemmer.stem, featureNames)))
print("Amount of featurenames after stemming: ", len(stemmedWords))

print("Reduced words: ", len(featureNames)-len(stemmedWords))

Amount of featurenames before stemming:  15927
Amount of featurenames after stemming:  12593
Reduced words:  3334


In [39]:
### DOING BAG OF WORDS PROCESS

# CountVectorizer removing stop_words
vectorizer = CountVectorizer(vocabulary=stemmedWords)
vectorizer.fit_transform(allMusicLyrics)

bagOfWordsResult = vectorizer.fit_transform(allMusicLyrics)

print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)
print(bagOfWordsResult.toarray())

['graphic', 'lifer', 'sever', 'meltdown', 'oshietekureta', 'investig', 'puffi', 'yatsu', 'hygien', 'forev', 'glitz', 'strummin', 'egocentr', 'hontou', 'wetbomb', 'seventeen', 'millitain', 'york', 'lief', 'runway', 'bridg', 'guitar', 'cartoon', 'scottish', 'unemploy', 'mediterainian', 'cartita', 'kumul', 'pari', 'inhal', 'kasanaru', 'smelt', 'jashin', 'roachclip', 'jock', 'ura', 'seersuck', 'selautan', 'papal', 'fremdel', 'luther', 'blow', 'blindfold', 'tellin', 'hat', 'prosecutor', 'warm', 'racer', 'dim', 'boredom', 'done', 'sashichai', 'measurin', 'ought', 'geund', 'sueño', 'scatter', 'armageddon', 'altitud', 'boomarang', 'steep', 'redwood', 'coedin', 'hajimari', 'glo', 'cone', 'swig', 'dane', 'richard', 'system', 'terlempar', 'canâ', 'une', 'bih', 'ze', 'suteppu', 'thick', 'termani', 'tick', 'ender', 'jukebox', 'upsettor', 'lent', 'said', 'unseal', 'itali', 'midaret', 'grave', 'clyde', 'bi', 'ree', 'wifey', 'bowl', 'teach', 'chewin', 'handshak', 'kyoushoku', 'intro', 'maleun', 'best'

In [40]:
from sklearn.feature_extraction.text import TfidfTransformer

### TF TF-IDF
transformer = TfidfTransformer()

tfidf = transformer.fit_transform(bagOfWordsResult)
normalizedResult = tfidf.toarray()

print(transformer.idf_) ### Feature weitghts
print(tfidf.toarray()) ### Normalized values

[7.2156076  6.81014249 6.52246042 ... 6.52246042 7.2156076  3.07247287]
[[0.         0.         0.         ... 0.         0.         0.07670255]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.00668134]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
from sklearn.cluster import KMeans

### GROUPING (KMEANS)
kmeans = KMeans(n_clusters = 4).fit(normalizedResult)
results = kmeans.predict(normalizedResult)

print('Centroides: ', kmeans.cluster_centers_)
print('Classificações: ', results)

In [None]:
## Writing music classification - file name in the txt file
f = open('./kmeans_result_4_centroids.txt', 'w')

st = [''] * 1000

for i in range(0, 1000):
    st[i] = str(results[i])+' - '+str(allMusicFiles[i])+'\n'

st.sort()
    
for s in st:
    f.write(s)
    
f.close()