In [98]:
# Initial Imports
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy

In [99]:
# Read in lyrics data
lyrics = pd.read_csv("data/songdata.csv")

In [100]:
# Required columns
cols=['song', 'artist', 'trackid', 'danceability', 'energy', 'key', 'mode', 
      'speechiness', 'acousticness', 'instrumentalness', 'liveness', 
      'valence', 'tempo', 'duration_ms', 'time_signature',
     'duration', 'analysis_sample_rate', 'analysis_channels', 
      'end_of_fade_in', 'start_of_fade_out', 'loudness', 'tempo', 'tempo_confidence', 
      'time_signature', 'time_signature_confidence', 'key', 'key_confidence', 'mode', 'mode_confidence']

In [101]:
# Creating dataframe for audio features
features=pd.DataFrame(columns=cols)

In [102]:
artists = ['Eminem', 'Beatles', 'Radiohead', 'Taylor Swift', 'Aerosmith','Ed Sheeran', 'Drake', 'Pink Floyd', 'Ella Fitzgerald', 'Queen', 'Billy Joel', 'Black Sabbath','Green Day' ]

In [107]:
songs = ['Just Lose It', 'Let It Be', 'Norwegian Wood', 'Imagine', 'Creep', 'Karma Police', 'I Knew You Were Trouble', 'Love Story', 'Dream On',  'A Team', 'Sing', 'Best I Ever Had', 'Eclipse', 'Dogs', 'At Last', 'What A Wonderful World', 'Bohemian Rhapsody', 'Uptown Girl', 'Piano Man', 'Electric Funeral', 'Valhalla', 'American Idiot']

In [108]:
len(songs)

22

In [110]:
# Select songs based on the list of songs or a list of artists or both
songs_selected = lyrics[lyrics.song.isin(songs)]

In [111]:
# sample dataframe is used for all analysis here on
sample = songs_selected
sample.reset_index(inplace=True)

In [113]:
# Get audio features for the songs in the sample dataframe
# Client Credentials
client_id = "5fc0da4106eb48e1913868f4950828fb"
client_secret = "5a549bc4674e47dfaf3b06da4500a44d"

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

for index,row in sample.iterrows():
    song = row['song']
    
    temp = pd.DataFrame(columns = cols)
    temp['song'] = [song]
    temp['artist'] = [row['artist']]
    
    try:
        result = sp.search(song, type='track', limit=1)

    except error as e:
        print(e)
        continue
        
    if len(result['tracks']['items']) == 0:
        temp['trackid'] = ['NA']
        features = features.append(temp, ignore_index=True)
        continue
    
    trackid=result['tracks']['items'][0]['id']
    temp['trackid'] = [trackid]
    
    try:
        feat=sp.audio_features(trackid)[0]
        analysis=sp.audio_analysis(trackid)['track']
    except error as e:
        print(e)
        continue
    
    combined = {**feat, **analysis}
    
    for col in cols:
        if col in ['song', 'artist', 'trackid']:
            continue
        temp[col]= combined[col]
        
    features = features.append(temp, ignore_index=True)

In [108]:
# Dataframe containing features for the songs in the df sample
features.to_csv('feautures200.csv', index=None)

In [114]:
features.head()

Unnamed: 0,song,artist,trackid,danceability,energy,key,mode,speechiness,acousticness,instrumentalness,...,start_of_fade_out,loudness,tempo,tempo_confidence,time_signature,time_signature_confidence,key.1,key_confidence,mode.1,mode_confidence
0,Uptown Girl,Billy Joel,5zA8vzDGqPl2AzZkEYQGKh,0.701,0.943,4,1,0.0456,0.0747,0.0,...,182.29406,-2.986,128.992,0.838,4,1.0,4,0.671,1,0.453
1,Electric Funeral,Black Sabbath,7As8h8LJTMIritB8QwSmqr,0.364,0.488,4,1,0.0461,0.434,1.23e-06,...,270.39637,-12.888,125.271,0.534,4,1.0,4,0.429,1,0.467
2,Best I Ever Had,Drake,3fyMH1t6UPeR5croea9PrR,0.427,0.861,1,1,0.376,0.219,0.0,...,245.7658,-4.874,161.776,0.425,4,1.0,1,0.0,1,0.298
3,At Last,Ella Fitzgerald,4Hhv2vrOTy89HFRcjU3QOx,0.171,0.33,5,1,0.0329,0.707,0.00381,...,172.77968,-9.699,174.431,0.157,3,0.937,5,0.35,1,0.451
4,Just Lose It,Eminem,0qcjuYtMWhBjXg0Xwt5SzS,0.949,0.646,5,0,0.0484,0.0705,1.14e-05,...,227.77034,-3.728,121.007,0.875,4,1.0,5,0.318,0,0.215


In [115]:
# Getting word vectors
import nltk
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer

# Getting all text for training own word vectors
all_text = ' '.join(lyrics.text)
all_text = all_text.replace('\n', '')
sent_text = nltk.sent_tokenize(all_text)

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [116]:
# Training and saving the word vec model
wv_dimension = 20

tokenizer = RegexpTokenizer(r'\w+').tokenize

sent_tokenized = []
for sent in sent_text:
    sent_tokenized.append(tokenizer(sent))
    
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile

path = get_tmpfile("word2vec.model")
model = Word2Vec(sent_tokenized, size=wv_dimension, window=5, min_count=3, workers=4)

model.save("word2vec.model")

In [159]:
# Finding wv representation for the lyrics in the dataframe sample
# Creating a dataframe wv with the word vectors
wv = []
for lyr in sample.text:
    vec = [0] * 200
    c = 0
    words = tokenizer(lyr)
    for word in words:
        try:
            vec = vec + model[word]
            c = c + 1
        except:
            vec = vec + [0] * 200
    vec = np.array(vec) / c
    wv.append(vec)

  
  if sys.path[0] == '':


In [160]:
# Naimg the columns in wv to 1-wv_dimension
req_cols = list(range(wv_dimension))
wv_df = pd.DataFrame(wv[:1000])[req_cols]
wv_df['song'] = sample['song']

In [161]:
wv_df.to_csv('word_vectors_songs.csv', index=None)

In [164]:
# Making a copy for preservation
features_copy = features.copy()

In [166]:
# Merging features and wv to get dataframe with audio features and the word vectors
df_all = pd.merge(features, wv_df, how='left', on='song')
# Removing empty rows
df_all = df_all[df_all['trackid'] != 'NA']

In [167]:
# Get a list of column in the consolidated dataframe
cols = list(df_all.columns)

In [169]:
# Remove columns not required for clustering
cols.remove('song')
cols.remove('artist')
cols.remove('trackid')

In [173]:
# Drop rows with NA values
df_all = df_all.dropna()

In [174]:
df_all.shape

(0, 229)

In [None]:
# Scaling the dataframe 
from sklearn import preprocessing

x = df_all[cols].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)

# Adding song and artist to the scaled dataframe
df['song'] = df_all['song']
df['artist'] = df_all['artist']

In [None]:
# Removing unnecessary columns from the list of columns for clustering
dfcols = list(df.columns)
dfcols.remove('song')
dfcols.remove('artist')

In [None]:
# Perform Kmeans on the consolidated scaled dataset
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=0).fit(df[dfcols])
df['label'] = kmeans.labels_

In [None]:
df.label.unique()

In [None]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')  
cluster.fit_predict(df[dfcols])

df.label=cluster.labels_

In [None]:
df[df.label==0]

In [144]:
# Getting sentiment for the sample
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentiment_data = pd.DataFrame(columns=['positive', 'negative', 'neutral', 'compound', 'song'])
for index, row in sample.iterrows():
    pos = []
    neg = []
    neu = []
    compound = []
    sentences=nltk.sent_tokenize(row.text)
    analyzer = SentimentIntensityAnalyzer()
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        pos.append(vs['pos'])
        neg.append(vs['neg'])
        neu.append(vs['neu'])
        compound.append(vs['compound'])
    tmp = pd.DataFrame([[np.average(pos), np.average(neg), np.average(neu), np.average(compound), row['song']]], columns=['positive', 'negative', 'neutral', 'compound', 'song'])
    sentiment_data = pd.concat([sentiment_data, tmp], ignore_index=True)

In [145]:
# Merge the dataset to add sentiment calues
df_with_sentiment = pd.merge(df_all, sentiment_data, how='left', on='song') 

In [148]:
# Scaling the dataframe
x = df_with_sentiment[dfcols + ['positive', 'negative', 'neutral', 'compound']].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_with_sentiment = pd.DataFrame(x_scaled, columns=dfcols + ['positive', 'negative', 'neutral', 'compound'])
df_with_sentiment['song'] = df_all['song']
df_with_sentiment['artist'] = df_all['artist']

In [150]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')  
cluster.fit_predict(df_with_sentiment[dfcols + ['positive', 'negative', 'neutral', 'compound']])

df_with_sentiment.label=cluster.labels_

  


In [157]:
df[df.label==0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,song,artist,label
6,0.741645,0.524752,0.6,0.6,1.0,1.0,0.064857,0.316806,1.0,0.103943,...,1.0,0.6,0.6,0.70063,1.0,1.0,0.397019,Dogs,Pink Floyd,0
8,0.442159,0.199434,0.7,0.7,1.0,1.0,0.031143,0.01439,0.00266,0.154122,...,1.0,0.7,0.7,1.0,1.0,1.0,1.0,Creep,Radiohead,0
9,0.546272,0.260255,0.6,0.6,1.0,1.0,0.037143,0.007926,2.4e-05,0.0,...,1.0,0.6,0.6,0.305672,1.0,1.0,0.296748,I Knew You Were Trouble,Taylor Swift,0
10,0.642674,0.463932,1.0,1.0,1.0,1.0,0.086,0.567169,0.0,0.136201,...,0.995859,1.0,1.0,0.679622,1.0,1.0,0.661247,Love Story,Taylor Swift,0
13,0.412596,1.0,0.6,0.6,1.0,1.0,0.454286,8.5e-05,0.0,0.709677,...,0.0,0.6,0.6,0.681723,1.0,1.0,0.514905,Valhalla,Black Sabbath,0
14,0.605398,0.0,0.9,0.9,1.0,1.0,0.030571,0.94625,0.0,0.24552,...,1.0,0.9,0.9,0.682773,1.0,1.0,0.754743,A Team,Ed Sheeran,0
15,0.861183,0.620934,0.7,0.7,1.0,1.0,0.064,0.119769,0.0,0.189964,...,0.857143,0.7,0.7,0.067227,1.0,1.0,0.254743,Sing,Ed Sheeran,0
17,0.242931,0.305516,0.7,0.7,1.0,1.0,0.0,0.088509,0.00174,0.231183,...,0.635611,0.7,0.7,0.597689,1.0,1.0,0.864499,Karma Police,Radiohead,0
