In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw song table; the first CSV column is used as the index.
# NOTE(review): this cell reads from './data/' while every later section reads
# from '../data/' — confirm which working directory the notebook expects.
data = pd.read_csv('./data/songs.csv', index_col=0)

In [None]:
# Lower-case the text columns used to build Spotify search queries.
data['Title'] = data['Title'].str.lower()
data['Artist'] = data['Artist'].str.lower()

In [None]:
# Preview the first rows of the frame.
data.head()

# Get Spotify URI

In [None]:
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm_notebook
import spotipy
import spotipy.util as util
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import time

In [None]:
# Spotify API credentials are read from the environment instead of being
# embedded in the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the key pair that used to be hard-coded in this cell is exposed
# in the file's history and should be revoked in the Spotify developer dashboard.
import os

token = spotipy.oauth2.SpotifyClientCredentials(
    client_id=os.environ['SPOTIPY_CLIENT_ID'],
    client_secret=os.environ['SPOTIPY_CLIENT_SECRET'],
)

# Kept so that any cell still reading `cache_token` continues to work.
cache_token = token.get_access_token()

# The clients receive the credentials manager rather than a raw token, so the
# manager supplies the access token for each request instead of relying on the
# single token fetched above.
spotify = spotipy.Spotify(client_credentials_manager=token)
sp = spotify

In [None]:
# Plain Python lists of the query inputs, in row order.
titles = data['Title'].tolist()
artists = data['Artist'].tolist()

In [None]:
# Accumulators filled by the lookup loop: one Spotify ID (or 0) per row, plus
# the row positions for which no match was found.
spotify_uri = []
errors = []

In [None]:
def get_spotify_uri(title, artist):
    """Search Spotify for a track and return the ID of the first close match.

    Parameters
    ----------
    title : str
        Song title to look up.
    artist : str
        Artist name to look up.

    Returns
    -------
    str or int
        The ``id`` field of the first search result whose title and first
        listed artist pass the fuzzy-match thresholds, or ``0`` when no
        result qualifies.

    Notes
    -----
    Uses the notebook-level ``sp`` client and the ``fuzz`` module. Errors
    raised by the search request are not caught here.
    """
    punctuation = r"[,.;@#?!&$%()]+"

    title_clean = re.sub(punctuation, ' ', title)
    title_clean = re.sub(r'\s+', ' ', title_clean).strip()
    artist_clean = re.sub(r'\s+', ' ', artist).strip()

    query = title_clean + " " + artist_clean

    search = sp.search(q=query, limit=50, offset=0, type='track', market='US')

    for item in search['tracks']['items']:
        # Normalise the candidate's own title the same way as the query title.
        # The whitespace pass must run on the candidate text: running it on
        # `title_clean` would compare the query with itself and make the
        # title score pass for every search result.
        spotify_title_clean = re.sub(punctuation, ' ', item['name'])
        spotify_title_clean = re.sub(r'\s+', ' ', spotify_title_clean).strip().lower()
        spotify_artist_clean = item['artists'][0]['name'].strip().lower()

        fuzzy_title_match = fuzz.token_set_ratio(title_clean, spotify_title_clean)
        fuzzy_artist_match = fuzz.token_set_ratio(artist_clean, spotify_artist_clean)
        fuzzy_match = (fuzzy_title_match + fuzzy_artist_match) / 2

        # The title must match closely; the artist may match loosely because
        # only the first credited artist of the candidate is compared.
        if fuzzy_title_match >= 90 and fuzzy_artist_match >= 50 and fuzzy_match >= 75:
            return item['id']

    return 0

In [None]:
# Look up a Spotify ID for every row that has not been processed yet.
# Starting at len(spotify_uri) makes the cell safe to re-run: after an
# interruption it resumes where it stopped, and once every row is done it is a
# no-op instead of appending duplicates. Covering len(titles) rows (rather than
# a fixed batch of 1000) keeps `spotify_uri` the same length as `data`.
for i in tqdm_notebook(range(len(spotify_uri), len(titles))):
    uri = get_spotify_uri(titles[i], artists[i])

    # 0 is the "no match" sentinel; it is stored as well so that the list
    # stays aligned with the rows of `data`.
    spotify_uri.append(uri)
    if uri == 0:
        errors.append(i)

In [None]:
# Attach the lookup results, then keep only rows with a real ID
# (0 marks rows for which no match was found).
data['URI'] = spotify_uri
data = data.loc[data['URI'] != 0]

In [None]:
# Persist the rows that have a Spotify ID.
# NOTE(review): this writes into the current working directory, whereas the
# next section reloads '../data/songs_w_uri.csv' — confirm the paths agree.
data.to_csv('songs_w_uri.csv')

# Get Spotify Features

In [None]:
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm_notebook
import spotipy
import spotipy.util as util
import re

In [None]:
# Spotify API credentials are read from the environment instead of being
# embedded in the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the key pair that used to be hard-coded in this cell is exposed
# in the file's history and should be revoked in the Spotify developer dashboard.
import os

token = spotipy.oauth2.SpotifyClientCredentials(
    client_id=os.environ['SPOTIPY_CLIENT_ID'],
    client_secret=os.environ['SPOTIPY_CLIENT_SECRET'],
)

# Kept so that any cell still reading `cache_token` continues to work.
cache_token = token.get_access_token()

# The clients receive the credentials manager rather than a raw token, so the
# manager supplies the access token for each request instead of relying on the
# single token fetched above.
spotify = spotipy.Spotify(client_credentials_manager=token)
sp = spotify

In [None]:
# Reload the URI-annotated songs; the first CSV column is used as the index.
# NOTE(review): the previous section wrote 'songs_w_uri.csv' to the current
# working directory, not to '../data/' — confirm the file is where this expects.
data = pd.read_csv('../data/songs_w_uri.csv', index_col=0)

In [None]:
# Track IDs to query, in row order.
uris = data['URI'].tolist()

# One accumulator per audio feature; get_audio_features() appends exactly one
# value to each of them per call.
danceability_list = []
energy_list = []
key_list = []
loudness_list = []
mode_list = []
speechiness_list = []
acousticness_list = []
instrumentalness_list = []
liveness_list = []
valence_list = []
tempo_list = []
duration_list = []
time_signature_list = []

In [None]:
def get_audio_features(uri):
    """Fetch one track's audio features and append them to the feature lists.

    Exactly one value is appended to each notebook-level ``*_list``
    accumulator per call, so the lists stay aligned with the input order.

    Parameters
    ----------
    uri : str
        Spotify track identifier passed to ``sp.audio_features``.

    Returns
    -------
    str or None
        ``'Error on: <uri>'`` when the API returned no features for the track
        (``np.nan`` is appended to every list in that case), otherwise
        ``None``.
    """
    # (accumulator, key in the API response) pairs, in column order.
    feature_targets = (
        (danceability_list, 'danceability'),
        (energy_list, 'energy'),
        (key_list, 'key'),
        (loudness_list, 'loudness'),
        (mode_list, 'mode'),
        (speechiness_list, 'speechiness'),
        (acousticness_list, 'acousticness'),
        (instrumentalness_list, 'instrumentalness'),
        (liveness_list, 'liveness'),
        (valence_list, 'valence'),
        (tempo_list, 'tempo'),
        (duration_list, 'duration_ms'),
        (time_signature_list, 'time_signature'),
    )

    search = sp.audio_features(uri)
    # Treat an empty or missing response the same as a null entry.
    features = search[0] if search else None

    if features is None:
        for target, _ in feature_targets:
            target.append(np.nan)
        return ('Error on: ' + str(uri))

    # Read every value before appending anything, so a missing key cannot
    # leave the accumulators with different lengths.
    values = [features[key] for _, key in feature_targets]
    for (target, _), value in zip(feature_targets, values):
        target.append(value)

In [None]:
# Fetch audio features for every track; results accumulate in the *_list
# variables as a side effect of each call.
for track_uri in tqdm_notebook(uris):
    get_audio_features(track_uri)

In [None]:
# Attach each collected feature list to `data` as a new column, in this order.
for _column, _values in (
    ('Danceability', danceability_list),
    ('Energy', energy_list),
    ('Key', key_list),
    ('Loudness', loudness_list),
    ('Mode', mode_list),
    ('Speechiness', speechiness_list),
    ('Acousticness', acousticness_list),
    ('Instrumentalness', instrumentalness_list),
    ('Liveness', liveness_list),
    ('Valence', valence_list),
    ('Tempo', tempo_list),
    ('Duration', duration_list),
    ('Time_Signature', time_signature_list),
):
    data[_column] = _values

In [None]:
# Drop every row that has a missing value in any column; this removes the
# tracks for which get_audio_features() stored NaN.
data = data.dropna()

In [None]:
# Persist the songs together with their audio features.
# NOTE(review): this writes into the current working directory, whereas the
# next section reloads '../data/songs_w_spotifyapi.csv' — confirm the paths agree.
data.to_csv('songs_w_spotifyapi.csv')

# Get Year

In [None]:
# Reload the feature-annotated songs; the first CSV column is used as the index.
# NOTE(review): the previous section wrote 'songs_w_spotifyapi.csv' to the
# current working directory, not to '../data/' — confirm the file location.
data = pd.read_csv("../data/songs_w_spotifyapi.csv", index_col=0)

In [None]:
# Query inputs in row order, plus fresh accumulators for the year lookup.
songs = data['Title'].tolist()
artists = data['Artist'].tolist()
years = []
errors = []

In [None]:
def get_song_year(title, artist):
    """Return the album release date of the top Spotify hit for a song.

    Parameters
    ----------
    title : str
        Song title to look up.
    artist : str
        Artist name to look up.

    Returns
    -------
    str or int
        The ``release_date`` string of the first search result's album, or
        ``0`` when the search fails, returns no results, or the result carries
        no release date.

    Notes
    -----
    Uses the notebook-level ``sp`` client. Unlike ``get_spotify_uri`` the
    first hit is taken without checking that it matches the query.
    """
    title_clean = re.sub(r"[,.;@#?!&$%()]+", ' ', title)
    title_clean = re.sub(r'\s+', ' ', title_clean).strip()
    artist_clean = re.sub(r'\s+', ' ', artist).strip()

    query = title_clean + " " + artist_clean

    try:
        search = sp.search(q=query, limit=50, offset=0, type='track')
        release_date = search['tracks']['items'][0]['album']['release_date']
    except Exception:
        # Deliberately best-effort: any failure (request error, empty result
        # list, unexpected payload) is reported to the caller as 0.
        return 0

    # A missing or empty date is also reported as 0, so that callers never
    # receive a value that cannot be split into a year.
    return release_date or 0

In [None]:
# Look up a release date for every song. The result is always stored so that
# `years` stays aligned with the rows; 0 marks a failed lookup, which is also
# printed and recorded in `errors`.
for i in tqdm_notebook(range(len(songs))):
    year = get_song_year(songs[i], artists[i])
    years.append(year)

    if year == 0:
        print("Errored on " + str(i))
        errors.append(i)

In [None]:
new_years = []

def clean_year(date):
    """Return the text before the first '-' in ``date`` converted to ``int``."""
    year_text, _, _ = date.partition('-')
    return int(year_text)

In [None]:
# Convert each release date to an integer year; the 0 sentinel for failed
# lookups is carried over unchanged.
for i in years:
    new_years.append(i if i == 0 else clean_year(i))

In [None]:
# Attach the years, then discard rows whose lookup failed (marked with 0).
data['Release_Year'] = new_years
data = data.loc[data['Release_Year'] != 0]

In [None]:
# Persist the songs together with features and release year.
# NOTE(review): this writes into the current working directory, whereas the
# next section reloads '../data/songs_w_features_year.csv' — confirm the paths agree.
data.to_csv('songs_w_features_year.csv')

# Get Genre

In [None]:
# Reload the year-annotated songs; the first CSV column is used as the index.
# NOTE(review): the previous section wrote 'songs_w_features_year.csv' to the
# current working directory, not to '../data/' — confirm the file location.
data = pd.read_csv('../data/songs_w_features_year.csv', index_col=0)

In [None]:
# Artist credit of every remaining row, in row order.
artists = data['Artist'].tolist()

In [None]:
def get_first_artist(artist):
    """Return the first name of a multi-artist credit.

    The credit is cut at the first ``,``, ``&`` or stand-alone word ``and``,
    and surrounding whitespace is removed. ``and`` only counts as a separator
    when it is a whole word, so names that merely contain those letters
    (for example "brandy" or "ariana grande") are returned intact.

    Parameters
    ----------
    artist : str
        Artist credit, possibly listing several artists.

    Returns
    -------
    str
        The leading artist name, or ``artist`` unchanged when cutting would
        leave nothing (for example when the credit starts with a separator).
    """
    lead = re.split(r'\s*(?:,|&|\band\b)\s*', artist, maxsplit=1)[0].strip()
    return lead if lead else artist

#Helper that maps an artist's subgenres onto the single most frequent common genre keyword
def get_common_genres(test_subgenres, common_genre_keywords):
    """Return the keyword that occurs in the largest number of subgenres.

    A keyword scores one point for every subgenre string that contains it as
    a substring. Ties go to the keyword listed first in
    ``common_genre_keywords``. The string ``"None"`` is returned when no
    keyword occurs in any subgenre.
    """
    hits_per_keyword = {}

    for keyword in common_genre_keywords:
        hit_count = sum(1 for subgenre in test_subgenres if keyword in subgenre)
        if hit_count:
            hits_per_keyword[keyword] = hits_per_keyword.get(keyword, 0) + hit_count

    #Nothing matched: report the "None" sentinel.
    if not hits_per_keyword:
        return "None"

    #max() yields the first key with the highest count, i.e. the earliest keyword wins a tie.
    return max(hits_per_keyword, key=hits_per_keyword.get)

In [None]:
def get_genres_for_each_artist(artists):
    """Map every artist credit to one common genre label.

    For each entry the first credited artist is searched on Spotify and the
    subgenres of the top hit are reduced to a single keyword with
    ``get_common_genres``.

    Parameters
    ----------
    artists : list of str
        Artist credits, one per song.

    Returns
    -------
    list of str
        One label per input entry, in order. The string ``'None'`` is used
        when the search request fails, when no artist is found, or when none
        of the genre keywords occurs in the artist's subgenres.

    Notes
    -----
    Uses the notebook-level ``sp`` client, ``tqdm_notebook``,
    ``get_first_artist`` and ``get_common_genres``.
    """
    #Most popular American Music Genres. Every keyword needs its own comma:
    #adjacent string literals are concatenated into a single keyword.
    common_genre_keywords = ["rap", "pop", "rock", "country", "alternative", "r&b", "latin", "edm", "seasonal", "jazz", "classical", "metal", "reggae"]

    final_genre_list = []

    for i in tqdm_notebook(range(len(artists))):
        #Only the first credited artist is searched.
        artist_name = get_first_artist(artists[i])

        #Using the spotify search functionality and extracting the list of subgenres.
        try:
            search = sp.search(q=artist_name, limit=1, offset=0, type='artist', market='US')
        except Exception:
            print('Spotify error for ' + str(i))
            final_genre_list.append('None')
            continue

        matches = search['artists']['items']
        if not matches:
            print('No Genre for ' + str(i))
            final_genre_list.append('None')
            continue

        list_of_subgenres = matches[0]['genres']
        final_genre_list.append(get_common_genres(list_of_subgenres, common_genre_keywords))

    return final_genre_list

In [None]:
# Resolve one genre label per row; this issues one Spotify artist search per entry.
genres_list = get_genres_for_each_artist(artists)

In [None]:
# Attach the labels, then drop rows for which no genre could be determined
# (marked with the string 'None').
data['Genre'] = genres_list
data = data.loc[data['Genre'] != 'None']

In [None]:
# Persist the final table (features, release year and genre) to the current
# working directory.
data.to_csv('songs_complete_data.csv')