In [3]:
# Initial Imports
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy

In [108]:
# Read in lyrics data
lyrics = pd.read_csv("data/songdata.csv")

In [1]:
# Required columns
cols=['song', 'artist', 'trackid', 'danceability', 'energy', 'key', 'mode', 
      'speechiness', 'acousticness', 'instrumentalness', 'liveness', 
      'valence', 'tempo', 'duration_ms', 'time_signature',
     'duration', 'analysis_sample_rate', 'analysis_channels', 
      'end_of_fade_in', 'start_of_fade_out', 'loudness', 'tempo', 'tempo_confidence', 
      'time_signature', 'time_signature_confidence', 'key', 'key_confidence', 'mode', 'mode_confidence']

In [4]:
cols = list(np.unique(cols))

In [5]:
# Creating dataframe for audio features
features=pd.DataFrame(columns=cols)

In [111]:
artists = ['Eminem', 'Passenger', 'Bruno Mars', 'Coldplay', 'Ozzy Osbourne', 'Slayer','Metallica','Beatles','Dire Straits' ,'Radiohead', 'Taylor Swift', 'Aerosmith','Ed Sheeran', 'Drake', 'Pink Floyd', 'Queen', 'Billy Joel', 'Iron Maiden', 'Black Sabbath','Green Day' ]

In [112]:
songs = ['Let It Be', 'Let Her Go','Hey Jude', '', 'Marry You','Yellow', 'The Lazy Song','Norwegian Wood', 'Wonderful Tonight',  'Imagine', 'I Knew You Were Trouble', 'Love Story', 'A Team', 'Sing', 'Bohemian Rhapsody', 'Electric Funeral', 'Valhalla', 'American Idiot', 'Angel Of Death', 'War Pigs', 'The Trooper']

In [114]:
# Filtering on artist
lyrics = lyrics[lyrics.artist.isin(artists)]
# Select songs based on the list of songs or a list of artists or both
songs_selected = lyrics[lyrics.song.isin(songs)]

In [115]:
songs_selected.shape

(16, 4)

In [116]:
# sample dataframe is used for all analysis here on
sample = songs_selected
sample.reset_index(inplace=True)

In [166]:
list(sample['song'])

['Electric Funeral',
 'Marry You',
 'The Lazy Song',
 'Yellow',
 'Wonderful Tonight',
 'American Idiot',
 'War Pigs',
 'Let Her Go',
 'Angel Of Death',
 'I Knew You Were Trouble',
 'Love Story',
 'Valhalla',
 'A Team',
 'Sing',
 'The Trooper',
 'Bohemian Rhapsody']

In [6]:
# Get audio features for the songs in the sample dataframe
# Client Credentials
client_id = "5fc0da4106eb48e1913868f4950828fb"
client_secret = "5a549bc4674e47dfaf3b06da4500a44d"

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

for index,row in sample.iterrows():
    song = row['song']
    
    temp = pd.DataFrame(columns = cols)
    temp['song'] = [song]
    temp['artist'] = [row['artist']]
    
    try:
        result = sp.search(song, type='track', limit=1)

    except:
        print('SOme error')
        continue
        
    if len(result['tracks']['items']) == 0:
        continue
    
    trackid=result['tracks']['items'][0]['id']
    temp['trackid'] = [trackid]
    
    try:
        feat=sp.audio_features(trackid)[0]
        analysis=sp.audio_analysis(trackid)['track']
    except error as e:
        print(e)
        continue
    
    combined = {**feat, **analysis}
    
    for col in cols:
        if col in ['song', 'artist', 'trackid']:
            continue
        temp[col]= combined[col]
        
    features = features.append(temp, ignore_index=True)

NameError: name 'sample' is not defined

In [119]:
features = features.T.drop_duplicates().T

In [120]:
# Getting word vectors
import nltk
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer

# Getting all text for training own word vectors
all_text = ' '.join(lyrics.text)
all_text = all_text.replace('\n', '')
sent_text = nltk.sent_tokenize(all_text)

[nltk_data] Downloading package punkt to /home/aman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [121]:
# Training and saving the word vec model
wv_dimension = 20

tokenizer = RegexpTokenizer(r'\w+').tokenize

sent_tokenized = []
for sent in sent_text:
    sent_tokenized.append(tokenizer(sent))
    
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile

path = get_tmpfile("word2vec.model")
model = Word2Vec(sent_tokenized, size=wv_dimension, window=5, min_count=3, workers=4)

model.save("word2vec.model")

In [122]:
# Finding wv representation for the lyrics in the dataframe sample
# Creating a dataframe wv with the word vectors
wv = []
for lyr in sample.text:
    vec = [0] * wv_dimension
    c = 0
    words = tokenizer(lyr)
    for word in words:
        try:
            vec = vec + model[word]
            c = c + 1
        except:
            vec = vec + [0] * wv_dimension
    vec = np.array(vec) / c
    wv.append(vec)

  # Remove the CWD from sys.path while we load stuff.
  


In [123]:
# Naimg the columns in wv to 1-wv_dimension
req_cols = list(range(wv_dimension))
wv_df = pd.DataFrame(wv)[req_cols]
wv_df['song'] = sample['song']

In [124]:
# wv_df.to_csv('word_vectors_songs.csv', index=None)

In [125]:
# Making a copy for preservation
features_copy = features.copy()

In [126]:
# Merging features and wv to get dataframe with audio features and the word vectors
df_all = pd.merge(features, wv_df, how='left', on='song')
# Removing empty rows
df_all = df_all[df_all['trackid'] != 'NA']

In [127]:
# Drop rows with NA values
df_all = df_all.dropna()

In [128]:
df_all.reset_index(inplace=True, drop=True)

In [129]:
# Get a list of column in the consolidated dataframe
cols = list(df_all.columns)

In [130]:
# Remove columns not required for clustering
cols.remove('song')
cols.remove('artist')
cols.remove('trackid')

In [131]:
df_all.shape

(14, 45)

In [132]:
# Scaling the dataframe 
from sklearn import preprocessing

x = df_all[cols].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)

# Adding song and artist to the scaled dataframe
df.columns = cols
df['song'] = df_all['song']
df['artist'] = df_all['artist']



In [133]:
# Removing unnecessary columns from the list of columns for clustering
dfcols = list(df.columns)
dfcols.remove('song')
dfcols.remove('artist')

In [157]:
# Perform Kmeans on the consolidated scaled dataset
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(df[dfcols])
df['label'] = kmeans.labels_

In [158]:
df.label.unique()

array([0, 2, 1])

In [159]:
df[df.label==1]

Unnamed: 0,danceability,energy,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,13,14,15,16,17,18,19,song,artist,label
4,0.144201,0.98977,0.0,1.0,0.222222,0.0,0.000609,0.65261,0.793412,1.0,...,0.147781,0.750268,0.0,0.891171,0.693891,0.27898,0.424434,American Idiot,Green Day,1
5,0.068966,0.523018,0.8,0.0,0.327553,0.331812,0.006845,0.471888,0.38827,0.968683,...,0.269291,0.730286,0.321858,0.0,0.801659,0.373867,0.20775,War Pigs,Ozzy Osbourne,1
9,0.319749,1.0,0.5,1.0,1.0,9e-05,0.0,0.795181,0.0,0.450484,...,0.445135,1.0,0.01683,0.508237,1.0,0.807716,0.0,Valhalla,Black Sabbath,1
10,0.554859,0.095908,0.8,1.0,0.047527,1.0,0.0,0.2751,0.377941,0.090805,...,0.422126,0.917711,0.0761,0.68676,0.68327,0.650661,0.368161,A Team,Ed Sheeran,1
12,0.0,0.831202,0.3,0.0,0.244059,0.025672,0.439535,1.0,0.623551,0.785283,...,0.551,0.850327,0.267092,0.824507,0.866925,0.763042,0.339123,The Trooper,Iron Maiden,1


In [147]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')  
cluster.fit_predict(df[dfcols])

df.label=cluster.labels_

In [150]:
df[df.label==1]

Unnamed: 0,danceability,energy,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,13,14,15,16,17,18,19,song,artist,label
4,0.144201,0.98977,0.0,1.0,0.222222,0.0,0.000609,0.65261,0.793412,1.0,...,0.147781,0.750268,0.0,0.891171,0.693891,0.27898,0.424434,American Idiot,Green Day,1
5,0.068966,0.523018,0.8,0.0,0.327553,0.331812,0.006845,0.471888,0.38827,0.968683,...,0.269291,0.730286,0.321858,0.0,0.801659,0.373867,0.20775,War Pigs,Ozzy Osbourne,1
9,0.319749,1.0,0.5,1.0,1.0,9e-05,0.0,0.795181,0.0,0.450484,...,0.445135,1.0,0.01683,0.508237,1.0,0.807716,0.0,Valhalla,Black Sabbath,1
12,0.0,0.831202,0.3,0.0,0.244059,0.025672,0.439535,1.0,0.623551,0.785283,...,0.551,0.850327,0.267092,0.824507,0.866925,0.763042,0.339123,The Trooper,Iron Maiden,1


In [139]:
# Getting sentiment for the sample
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentiment_data = pd.DataFrame(columns=['positive', 'negative', 'neutral', 'compound', 'song'])
for index, row in sample.iterrows():
    pos = []
    neg = []
    neu = []
    compound = []
    sentences=nltk.sent_tokenize(row.text)
    analyzer = SentimentIntensityAnalyzer()
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        pos.append(vs['pos'])
        neg.append(vs['neg'])
        neu.append(vs['neu'])
        compound.append(vs['compound'])
    tmp = pd.DataFrame([[np.average(pos), np.average(neg), np.average(neu), np.average(compound), row['song']]], columns=['positive', 'negative', 'neutral', 'compound', 'song'])
    sentiment_data = pd.concat([sentiment_data, tmp], ignore_index=True)

In [140]:
# Merge the dataset to add sentiment calues
df_with_sentiment = pd.merge(df_all, sentiment_data, how='left', on='song')

In [141]:
# Scaling the dataframe
x = df_with_sentiment[dfcols + ['positive', 'negative', 'neutral', 'compound']].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_with_sentiment = pd.DataFrame(x_scaled, columns=dfcols + ['positive', 'negative', 'neutral', 'compound'])
df_with_sentiment['song'] = df_all['song']
df_with_sentiment['artist'] = df_all['artist']



In [142]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')  
cluster.fit_predict(df_with_sentiment[dfcols + ['positive', 'negative', 'neutral', 'compound']])

df_with_sentiment.label=cluster.labels_

  


In [175]:
list(df_with_sentiment[df_with_sentiment.label==2]['song'])

['Wonderful Tonight', 'Let Her Go']

In [160]:
# Perform Kmeans on the consolidated scaled dataset
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(df_with_sentiment[dfcols + ['positive', 'negative', 'neutral', 'compound']])
df_with_sentiment['label'] = kmeans.labels_

In [164]:
df_with_sentiment[df_with_sentiment.label==0]

Unnamed: 0,danceability,energy,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,17,18,19,positive,negative,neutral,compound,song,artist,label
4,0.144201,0.98977,0.0,1.0,0.222222,0.0,0.000609,0.65261,0.793412,1.0,...,0.693891,0.27898,0.424434,0.328872,0.534462,0.24972,0.460676,American Idiot,Green Day,1
5,0.068966,0.523018,0.8,0.0,0.327553,0.331812,0.006845,0.471888,0.38827,0.968683,...,0.801659,0.373867,0.20775,0.055762,0.593846,0.464995,0.008112,War Pigs,Ozzy Osbourne,1
9,0.319749,1.0,0.5,1.0,1.0,9e-05,0.0,0.795181,0.0,0.450484,...,1.0,0.807716,0.0,0.579926,0.436923,0.107307,0.776927,Valhalla,Black Sabbath,1
10,0.554859,0.095908,0.8,1.0,0.047527,1.0,0.0,0.2751,0.377941,0.090805,...,0.68327,0.650661,0.368161,0.449814,0.572308,0.071538,0.038055,A Team,Ed Sheeran,1
12,0.0,0.831202,0.3,0.0,0.244059,0.025672,0.439535,1.0,0.623551,0.785283,...,0.866925,0.763042,0.339123,0.204461,0.612308,0.282177,0.008963,The Trooper,Iron Maiden,1


In [170]:
import random

def some(x, n):
    return x.ix[random.sample(x.index, n)]

In [171]:
some(sample, 7)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


TypeError: Population must be a sequence or set.  For dicts, use list(d).

In [176]:
list(sample.sample(n=2)['song'])

['Electric Funeral', 'Sing']

In [None]:
'Wonderful Tonight', 'Electric Funeral', 'I Knew You Were Trouble', 'Love Story', 'American Idiot', 'Marry You', 'The Lazy Song'