# NLP: Song Lyrics Classification

## Goal

Classify song lyrics by artist: given the lyrics of a song, predict which artist performed it.

In [1]:
import json

In [2]:
import pandas as pd
import numpy as np

In [22]:
from sklearn.model_selection import train_test_split#, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

## Load Data

In [4]:
# Lyrics dumps to load, one JSON file per artist.
json_files = [
    "data/lyrics_zero_7.json",
    "data/lyrics_credence_clearwater_revival.json",
]

In [5]:
# Collect the song records from every per-artist JSON dump into one flat list.
# Each file presumably holds a JSON array of song records (verify against the
# scraper that produced the dumps).
lyrics = []
for json_file in json_files:
    # Decode as UTF-8 (the standard JSON text encoding and a superset of ASCII)
    # so lyrics containing accented letters or typographic quotes do not raise
    # UnicodeDecodeError; pure-ASCII files load exactly as before.
    with open(json_file, encoding='utf-8') as file:
        lyrics.extend(json.load(file))

In [6]:
# Optional sanity check (disabled): uncomment to print the first song's title
# followed by the first 300 characters of its lyrics.
#print(lyrics[0]['song_title'], '', lyrics[0]['lyrics'][:300]+'...', sep='\n')

In [7]:
# Build the song table and discard the search-term column in one chained
# expression, then preview the first rows.
df = pd.DataFrame(data=lyrics).drop(columns=['artist_searched'])
df.head()

Unnamed: 0,artist_found,song_title,lyrics
0,Zero 7,Witness (One Hope) [Walworth RD Rockers Dub],Well well well \nTaskmaster burst\nThe bionic ...
1,Zero 7,I Have Seen,Old man there people stare\nThinking back to G...
2,Zero 7,Futures [Album Version],Made a note of it\nDid you write it on your ha...
3,Zero 7,Everything Up (Zizou),Mauritania to Wellsdon Green\nTruth is faster ...
4,Zero 7,You're My Flame,You take a stroll into the morning sun\nYou ma...


## Transform: Term Frequency-Inverse Document Frequency (TF-IDF)

In [8]:
# Fit a default-configured TF-IDF vectorizer on every lyric and keep the
# resulting document-term matrix: one row per song, one column per token.
# NOTE(review): the vocabulary and IDF weights are learned from all songs before
# the train/test split further down, so held-out songs influence the features —
# confirm this is acceptable for the evaluation.
vectorizer = TfidfVectorizer()
corpus = df['lyrics']
X = vectorizer.fit_transform(corpus)

In [9]:
# First 15 vocabulary terms, in feature (column) order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
list(feature_names[:15])

['15',
 '45s',
 '4x',
 '55',
 '5x',
 '911',
 'aaaah',
 'aah',
 'abbreviate',
 'abide',
 'aboard',
 'about',
 'above',
 'acallin',
 'accident']

In [10]:
# Densify the first row of X (the TF-IDF weights of the first lyric) to inspect it.
X[0].todense()

matrix([[0.03760994, 0.        , 0.        , ..., 0.        , 0.03760994,
         0.03760994]])

In [11]:
# Dense TF-IDF table: one row per song, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    _term_columns = vectorizer.get_feature_names_out()
else:
    _term_columns = vectorizer.get_feature_names()
# toarray() yields a plain ndarray rather than the discouraged np.matrix that
# todense() returns; the resulting frame holds the same values.
df_bow = pd.DataFrame(data=X.toarray(), columns=_term_columns)

In [12]:
# Preview the first five rows of the dense TF-IDF table.
df_bow.head(n=5)

Unnamed: 0,15,45s,4x,55,5x,911,aaaah,aah,abbreviate,abide,...,young,your,yours,yourself,yow,zhis,zidane,zinedine,zing,zit
0,0.03761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030899,0.012459,0.0,0.0,0.03761,0.0,0.0,0.0,0.03761,0.03761
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.093074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00246,0.0,0.0,0.0,0.0,0.014851,0.014851,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.061782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train/Test split

Define X and y:

* X : Training data
* y : Target values

In [37]:
# X is already the TF-IDF feature matrix produced by vectorizer.fit_transform,
# so the former no-op `X = X` is dropped; only the target needs defining.
y = df['artist_found']  # artist label for each song, row-aligned with X

In [38]:
# Hold out 25% of the songs for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# Shapes of the full, training and test feature matrices.
X.shape, X_train.shape, X_test.shape

((124, 2175), (93, 2175), (31, 2175))