### Subproject 1: creating a radar graph to compare playlists based on audio features

In [1]:
# STEP 1: get playlist features

import spotipy as sp
import numpy
from spotipy.oauth2 import SpotifyClientCredentials

# the setup
# –––––––––––––––––––––––––––––––––––––––––––––––––––––
# this part is for you to fill in with your information
username= 'YOUR-SPOTIFY-USERNAME'
client_id = 'YOUR-CLIENT-ID-HERE'
client_secret = 'YOUR-CLIENT-SECRET-HERE'
#––––––––––––––––––––––––––––––––––––––––––––––––––––––
redirecturi='http://localhost:5000'
thescope='playlist-read-private'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
plists = {}

def get_features_for_playlist(uri):
    playlist_id = uri.split(':')[2]
    results = sp.user_playlist(username, playlist_id)
    
    #print(json.dumps(results, indent=4))               # uncomment this if you want to visualize the JSON structure
    
    # initialize the dictionary
    playlist_name = results['name']
    plists[playlist_name] = {}
    plists[playlist_name]['name'] = []
    plists[playlist_name]['track uri'] = []
    plists[playlist_name]['acousticness'] = []
    plists[playlist_name]['danceability'] = []
    plists[playlist_name]['energy'] = []
    plists[playlist_name]['instrumentalness'] = []
    plists[playlist_name]['liveness'] = []
    plists[playlist_name]['loudness'] = []
    plists[playlist_name]['speechiness'] = []
    plists[playlist_name]['tempo'] = []
    plists[playlist_name]['valence'] = []
    plists[playlist_name]['popularity'] = []

    for track in results['tracks']['items']:
        # print(json.dumps(track, indent=4))              # DEBUG STATEMENT
        
        # save metadata stuff
        name = track['track']['name']
        print(name)
        track_uri = track['track']['uri']
        plists[playlist_name]['name'].append(name)
        plists[playlist_name]['track uri'].append(track_uri)

        # extract features
        features = sp.audio_features(track_uri)
        plists[playlist_name]['acousticness'].append(features[0]['acousticness'])
        plists[playlist_name]['danceability'].append(features[0]['danceability'])
        plists[playlist_name]['energy'].append(features[0]['energy'])
        plists[playlist_name]['instrumentalness'].append(features[0]['instrumentalness'])
        plists[playlist_name]['liveness'].append(features[0]['liveness'])
        plists[playlist_name]['loudness'].append(features[0]['loudness'])
        plists[playlist_name]['speechiness'].append(features[0]['speechiness'])
        plists[playlist_name]['tempo'].append(features[0]['tempo'])
        plists[playlist_name]['valence'].append(features[0]['valence'])
    
# example call to the function (the 3 cohesive playlists are rap, sad, and instrumental)
uris = ['spotify:playlist:2XF4xx2KLOCRqB8GE4S48E', 'spotify:playlist:601xuhoIObcc1GWqe3dgtN', 'spotify:playlist:0J27PjFNHSi4XpbY8TEChh']
for uri in uris:
        get_features_for_playlist(uri)

print(plists)

NameError: name 'spotipy' is not defined

In [None]:
# STEP 2: make radar graph

import matplotlib.pyplot as plt

# manually inspect all of the values to determine whether the median or mean is a better metric to plot
for playlist in plists:
    print("––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––")
    print(playlist)
    for feature in plists[playlist]:
        if feature != 'name' and feature != 'track uri':
            print(feature.upper(), "| median:", np.median(plists[playlist][feature]), "| mean:", np.mean(plists[playlist][feature]))
    

labels = ['acousticness', 'danceability', 'energy', 'valence', 'instrumentalness', 'tempo', 'speechiness']
num_vars = len(labels)

# Split the circle into even parts and save the angles so we know where to put each axis.
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]

# ax = plt.subplot(polar=True)
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

# Helper function to plot each playlist on the radar chart.
def add_to_radar(playlist, color):
    values = [np.median(plists[playlist]['acousticness']), np.median(plists[playlist]['danceability']), np.median(plists[playlist]['energy']), 
              np.median(plists[playlist]['valence']), np.mean(plists[playlist]['instrumentalness']), np.median(plists[playlist]['tempo']), 
              np.median(plists[playlist]['speechiness'])]
    # tempo values typically range from 50-220, so I divided by 220 to get a number between 0 and 1
    values[-2] = values[-2]/220
    # speechiness values values are highly concentrated between 0 and 0.25-ish, so I multiplied by 4. Adjust this if needed
    values[-1] = values[-1]*4
    values += values[:1]
    ax.plot(angles, values, color=color, linewidth=1, label=playlist)
    ax.fill(angles, values, color=color, alpha=0.25)

# # Add each additional playlist to the chart.
add_to_radar('Hype Stuff', 'red')
add_to_radar('sad', 'green')
add_to_radar('academia', 'blue')

# polar coordinates math stuff
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)

# Draw axis lines for each angle and label.
ax.set_thetagrids(np.degrees(angles), labels)

# Go through labels and adjust alignment based on where it is in the circle.
for label, angle in zip(ax.get_xticklabels(), angles):
  if angle in (0, np.pi):
    label.set_horizontalalignment('center')
  elif 0 < angle < np.pi:
    label.set_horizontalalignment('left')
  else:
    label.set_horizontalalignment('right')
    
# Set position of y-labels (0-100) to be in the middle of the first two axes.
ax.set_ylim(0, 1)
ax.set_rlabel_position(180 / num_vars)

# Add some custom styling.
ax.tick_params(colors='#222222')         # color of tick labels
ax.tick_params(axis='y', labelsize=8)    # y-axis labels
ax.grid(color='#AAAAAA')                 # color of circular gridlines
ax.spines['polar'].set_color('#222222')  # color of outermost gridline (spine)
ax.set_facecolor('#FAFAFA')              # background color inside the circle itself

#Lastly, give the chart a title and a legend
ax.set_title('Playlist Comparison', y=1.08)
ax.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))

fig.savefig('playlist_comp.png')

### Subproject 2: using Tableau to study the lyrical content of different albums by the same artist

In [None]:
# IMPORTS
import spotipy
import numpy
from PyLyrics import *
import re
import nltk
import os
import wordcloud
import matplotlib.pylab as plt
from collections import Counter

In [None]:
# the setup
# –––––––––––––––––––––––––––––––––––––––––––––––––––––
# this part is for you to fill in with your information
username='YOUR-SPOTIFY-USERNAME'
client_id = 'YOUR-CLIENT-ID-HERE'
client_secret = 'YOUR-CLIENT-SECRET-HERE'
#––––––––––––––––––––––––––––––––––––––––––––––––––––––
redirecturi='http://localhost:5000'
thescope='playlist-read-private'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
name = "Mumford And Sons" #chosen artist
result = sp.search(name) #search query
result['tracks']['items'][0]['artists']

In [None]:
# STEP 1: Extract Artist's uri

artist_uri = result['tracks']['items'][0]['artists'][0]['uri']
# Pull all of the artist's albums
sp_albums = sp.artist_albums(artist_uri, album_type='album')
# Store artist's albums' names' and uris in separate lists
album_names = []
album_uris = []
for i in range(len(sp_albums['items'])):
    if sp_albums['items'][i]['name'] not in album_names:
        album_names.append(sp_albums['items'][i]['name'])
        album_uris.append(sp_albums['items'][i]['uri'])   
album_names
album_uris

In [None]:
# STEP 2: get all albums

def albumSongs(uri):
    album = uri #assign album uri to a_name
    spotify_albums[album] = {} #Creates dictionary for that specific album
    #Create keys-values of empty lists inside nested dictionary for album
    spotify_albums[album]['album'] = [] #create empty list
    spotify_albums[album]['track_number'] = []
    spotify_albums[album]['id'] = []
    spotify_albums[album]['name'] = []
    spotify_albums[album]['uri'] = []
    tracks = sp.album_tracks(album) #pull data on album tracks
    for n in range(len(tracks['items'])): #for each song track
        spotify_albums[album]['album'].append(album_names[album_count]) #append album name tracked via album_count
        spotify_albums[album]['track_number'].append(tracks['items'][n]['track_number'])
        spotify_albums[album]['id'].append(tracks['items'][n]['id'])
        spotify_albums[album]['name'].append(tracks['items'][n]['name'])
        spotify_albums[album]['uri'].append(tracks['items'][n]['uri'])
        
spotify_albums = {}
album_count = 0
for i in album_uris: #each album
    albumSongs(i)
    print("Album " + str(album_names[album_count]) + " songs has been added to spotify_albums dictionary")
    album_count+=1 #Updates album count once all tracks have been added
    

In [None]:
# STEP 3: go through each album and get the song lyrics

songs = []
tags = [] # how we will keep track of which album each song comes from
albums = []
i = 0
for album in spotify_albums:
    name = spotify_albums[album]['name']
    flag = False
    print(name, end='\n\n')
    for n in name:
        if n not in songs:
            flag = True
            songs.append(n)
            tags.append(i)
        else:
            print('duplicate')
    if flag == True:
        albums.append(spotify_albums[album]['album'][0])
        i+=1
        
# extract the song lyrics and compile them for each album
print(songs)
print(albums)
artist = name
song_words = []
new_tags = []
new_albums = []
i = 0
print(list(zip(songs, tags)))
for s, t in list(zip(songs, tags)):
    try:
        lyrics = PyLyrics.getLyrics(artist,s)
        words = lyrics.split()
        song_words.append(words)
        print(s, 'finished')
        new_tags.append(t)
        new_albums.append(albums[i])
        i += 1
        print(t)
    except:
        # sometimes this may not work (e.g. songs recorded live do not have lyrics stored)
        print(s, 'exception')
        continue
tags = new_tags
albums = new_albums
print(tags)
albums = [a[0:a.find("(")] if a.find("(") != -1 else a for a in albums]
print(albums)

In [None]:
# STEP 4: clean the lyrics and make a corpus for each album

# Get a list of stopwords from nltk
stopwords = nltk.corpus.stopwords.words("english")

def get_clean_words(words):
    def _isnum(w):
        try:
            int(w)
            return True
        except ValueError:
            return False
        
    # Set words to lowercase and remove them if they are stop words
    words = [w.lower() for w in words if w.lower() not in stopwords]
    
    # Remove punctuation
    words = [w.replace('(', '') for w in words]
    words = [w.replace(')', '') for w in words]
    words = [w.replace('?', '') for w in words]
    words = [w.replace(',', '') for w in words]
    words = [w.replace('.', '') for w in words]
    words = [w.replace('"', '') for w in words]
    words = [w.replace('!', '') for w in words]

    # Remove numbers
    words = [w for w in words if not _isnum(w)]
    
    # Only keep words with more than one character
    words = [w for w in words if len(w) > 1]
    
    return words
    
    
def word_count(text):
    return Counter(text.split())

corpus = {}
word_counts = []
for i in range(len(tags)):
    if tags[i] == 5 or tags[i] == 8:
        tags[i] = 4
for t in tags:
    corpus[t] = []
for s, t in list(zip(song_words, tags)):
    clean_words = get_clean_words(s)
    corpus[t].append(' '.join(clean_words))
for t in corpus:
    corpus[t] = ' '.join(corpus[t])

In [None]:
# STEP 5: write all relevant data to a CSV file --> Tableau

with open('mumford.csv', 'w') as f:
    for key in corpus.keys():
        my_dict = word_count(corpus[key])
        print(key, my_dict["love"])
        for k in my_dict.keys():
            f.write("%s,%s,%s\n"%(albums[key],k,my_dict[k]))

In [None]:
# STEP 5.5 (optional): print stuff to see what we are working with / debug

print(tags)
print(albums)
print([albums[t] for t in tags])
print(corpus)


In [None]:
# STEP 6 (optional): make wordclouds using Python library because why not?

i = 0
for t in corpus:
    print(t)
    fig = plt.figure(figsize=(12, 18))
    plt.title(albums[t])
    wc = wordcloud.WordCloud(max_font_size=40, collocations=False).generate(corpus[t])
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    fig.savefig(albums[t] + '.png')
    i += 1

### Subproject 3: using Tableau to compare genres based on sentiment scores

In [None]:
# IMPORTS
import spotipy
import numpy
import re
import nltk
import os
import wordcloud
import matplotlib.pylab as plt
from collections import Counter
from afinn import Afinn

In [None]:
# the setup
# –––––––––––––––––––––––––––––––––––––––––––––––––––––
# this part is for you to fill in with your information
username='YOUR-SPOTIFY-USERNAME'
client_id = 'YOUR-CLIENT-ID-HERE'
client_secret = 'YOUR-CLIENT-SECRET-HERE'
#––––––––––––––––––––––––––––––––––––––––––––––––––––––
redirecturi='http://localhost:5000'
thescope='playlist-read-private'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)

In [None]:
# STEPS 1 and 2: get lyrics for each playlist and clean the words

# Get a list of stopwords from nltk
stopwords = nltk.corpus.stopwords.words("english")

def get_clean_words(words):
    def _isnum(w):
        try:
            int(w)
            return True
        except ValueError:
            return False
        
    # Set words to lowercase and remove them if they are stop words
    words = [w.lower() for w in words if w.lower() not in stopwords]
    
    # Remove punctuation
    words = [w.replace('(', '') for w in words]
    words = [w.replace(')', '') for w in words]
    words = [w.replace('?', '') for w in words]
    words = [w.replace(',', '') for w in words]

    # Remove numbers
    words = [w for w in words if not _isnum(w)]
    
    # Only keep words with more than one character
    words = [w for w in words if len(w) > 1]
    
    return words
    
    
def word_count(text):
    return Counter(text.split())

def get_lyrics_for_playlist(uri):
    playlist_id = uri.split(':')[2]
    results = sp.user_playlist(username, playlist_id)
    lyrics = ""
    count = 0
    for track in results['tracks']['items']:
        #print(json.dumps(track, indent=4))
        song_name = track['track']['name']
        artist = track['track']['album']['artists'][0]['name']
        print(song_name, "|", artist)
        try:
            lyrics += PyLyrics.getLyrics(artist,song_name)
            print("finished")
            count += 1
        except:
            print("exception")
            continue
    
    words = get_clean_words(lyrics.split()) 
    print(count, "successfully scraped and cleaned")
    return words

#Punk Rock Classics
uri = 'spotify:playlist:3zl5C2k16lCsno1i2XcSCM'
words = get_lyrics_for_playlist(uri)

#Pop Classics
uri2 = 'spotify:playlist:55OXRZi5CH4pRD6YA8vrY2'
words2 = get_lyrics_for_playlist(uri2)

In [None]:
# STEPS 3 and 4: extract sentiment scores and write data to CSV file --> Tableau

afinn = Afinn()

words_list = [words, words2]
playlists = ['Punk Rock', 'Mood Booster']
print(words_list)
def word_count(text):
    return Counter(text.split())

with open('genre_comparison.csv', 'w') as f:
    i = 0
    for w in words_list:
        my_dict = word_count(' '.join(w))
        for key in my_dict.keys():
            f.write("%s,%s,%s,%s\n"%(playlists[i],key,my_dict[key],str(afinn.score(key))))
        i += 1