In [None]:
# let's import the libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

import pandas as pd
from sklearn.feature_extraction import stop_words
import re
import string

In [None]:
# Load the raw lyrics table from "lyrics.csv" (path is relative to the working directory).
lyrics_file = pd.read_csv(filepath_or_buffer="lyrics.csv")

In [None]:
# Clean-up pass: drop rows whose genre is "Not Available" or "Other",
# and rows where either the lyrics or the song value is missing.
lyrics_file = lyrics_file[
    ~lyrics_file["genre"].isin(["Not Available", "Other"])
    & lyrics_file["lyrics"].notna()
    & lyrics_file["song"].notna()
]

In [None]:
# function to tokenize the lyrics

# Characters treated as token separators: ASCII punctuation, digits and the
# control characters \r, \t and \n.  Compiled once here rather than on every
# call, since the tokenizer runs once per document.
_SEPARATOR_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def words(text):
    """Tokenize one lyrics string into lowercase content words.

    Punctuation, digits and line breaks are turned into spaces, the text is
    split on whitespace, tokens of two characters or fewer are dropped, the
    remaining tokens are lowercased, and English stop words are removed.

    Parameters
    ----------
    text : str
        Raw lyrics of a single song.

    Returns
    -------
    list of str
        The surviving tokens, in their original order (duplicates kept).
    """
    # Imported from the public ``text`` module: the standalone
    # ``sklearn.feature_extraction.stop_words`` module was deprecated in
    # scikit-learn 0.22 and removed in 0.24, while this import path is
    # available in both old and new releases.
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    # Substitute a space instead of deleting, so neighbouring words are not
    # glued together.
    cleaned = _SEPARATOR_RE.sub(" ", text)

    tokens = []
    # split() without an argument splits on any run of whitespace, so
    # characters the regex does not cover (e.g. form feed, vertical tab)
    # still separate words and no empty strings are produced.
    for raw in cleaned.split():
        if len(raw) <= 2:  # ignore a, an, to, at, be, ...
            continue
        token = raw.lower()
        if token not in ENGLISH_STOP_WORDS:
            tokens.append(token)

    return tokens

In [None]:
# Split the data into train and test sets.
# Double brackets keep the target and the features as one-column DataFrames,
# which is what allows the `.lyrics` attribute access on the resulting splits.
target_ly = lyrics_file[["genre"]]
feature_ly = lyrics_file[["lyrics"]]

# A fixed random_state makes the shuffle, and therefore every downstream
# result, reproducible across kernel restarts.  The split proportions are left
# at scikit-learn's defaults.
train_data, test_data, train_target, test_target = train_test_split(
    feature_ly,
    target_ly,
    random_state=42,
)

In [None]:
# Build the TF-IDF representation of the lyrics.
# The vectorizer tokenizes with the custom `words` function and is also given
# scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(tokenizer=words, stop_words='english')

# Learn the vocabulary and IDF weights from the training lyrics and produce
# the training feature matrix in the same step ...
tf_features = tfidf.fit_transform(train_data["lyrics"])

# ... then encode the test lyrics with that already-fitted vectorizer.
test_features = tfidf.transform(test_data["lyrics"])

In [None]:
# Cluster the TF-IDF vectors of the training lyrics with k-means.
true_k = 11
# random_state pins the k-means++ initialisation.  With n_init=1 there is only
# a single run, so without a seed the clusters differ on every execution.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
               random_state=42)
model.fit(tf_features)

print("Top terms per cluster:")
# For every centroid: feature indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
terms = (tfidf.get_feature_names_out()
         if hasattr(tfidf, "get_feature_names_out")
         else tfidf.get_feature_names())
for i in range(true_k):
    # Assemble the whole line and print it once.  The previous `print(...),`
    # and bare `print` were Python 2 idioms: on Python 3 they put every term on
    # its own line and the bare `print` is a no-op expression.
    top_terms = "".join(' %s' % terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d:%s" % (i, top_terms))

print("\n")
print("Prediction")

### Because k-means is unsupervised and is not told what to look for, it groups the songs by whatever feature dominates the data; here the most important feature appears to be the language of the lyrics.