In [1]:
import collections
import json
import pickle
import re
from collections import defaultdict
from random import choice

import lyricsgenius
import pandas as pd
import spotipy
from essential_generators import DocumentGenerator
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
def get_keys():
    '''
    Read the API keys from ``api_keys.txt`` (one key per line) into a list.

    The path is relative to the current working directory.

    Parameters
    ----------
    None

    Returns
    ----------
    API_KEYS : (list)
        One string per line of the file, in file order, with newline
        characters removed.
    '''
    # The context manager closes the file handle even if reading fails;
    # the previous version left it open for the life of the kernel.
    with open('api_keys.txt', 'r') as f:
        API_KEYS = [key.replace('\n', '') for key in f.readlines()]

    return API_KEYS

In [3]:
def get_album_ids_from_query(album_search):
    '''
    Collect the ID of every album listed in one album-search response.

    Only the items present in the given response are read; no further
    pages are requested here.

    Parameters
    ----------
    album_search: (dict)
        Search response; albums are read from album_search['albums']['items'].

    Returns
    ----------
    album_ids : (list)
        The 'id' of each item, in response order.
    '''
    return [item['id'] for item in album_search['albums']['items']]

In [4]:
# Random generators for the API query (search offset and search word)
def get_rand_offset():
    '''Pick a random search offset: an integer from 0 through 9.'''
    candidate_offsets = range(10)
    return choice(candidate_offsets)
def get_rand_word(gen):
    '''Return whatever the generator's ``word()`` method produces.'''
    word = gen.word()
    return word

In [5]:
def album_builder_test(album):
    '''
    Flatten one album object into a single-row record plus its track lists.

    Parameters
    ----------
    album: (dict)
        Album object; the keys read are 'artists', 'name', 'album_type',
        'id', 'label', 'external_ids', 'popularity', 'release_date',
        'release_date_precision', 'genres' and 'tracks'.

    Returns
    ----------
    album_dict : (defaultdict)
        Each field maps to a one-element list. Only the first listed
        artist is recorded.
    tracks : (list)
        Track names, in album order.
    track_ids : (list)
        Track IDs, in the same order as ``tracks``.
    '''
    lead_artist = album['artists'][0]

    # (column name, value) pairs in the column order of the record.
    fields = (
        ('artist', lead_artist['name']),
        ('artist_id', lead_artist['id']),
        ('album', album['name']),
        ('album_type', album['album_type']),
        ('album_id', album['id']),
        ('album_label', album['label']),
        ('upc_code', album['external_ids']['upc']),
        ('album_popularity', album['popularity']),
        ('release_date', album['release_date']),
        ('release_prec', album['release_date_precision']),
        ('genres', album['genres']),
        ('album_release', album['name'] + ' by ' + lead_artist['name']),
    )

    album_dict = defaultdict(list)
    for column, value in fields:
        album_dict[column].append(value)

    # Single pass over the tracks, then split into the two parallel lists.
    name_id_pairs = [(item['name'], item['id']) for item in album['tracks']['items']]
    tracks = [name for name, _ in name_id_pairs]
    track_ids = [track_id for _, track_id in name_id_pairs]

    return album_dict, tracks, track_ids

In [6]:
def decode_lyrics(s):
    '''
    Normalise a lyrics string to single-line ASCII text.

    Steps, in order: drop every non-ASCII character, turn newlines and
    hyphens into spaces, then replace each span that starts at '[' and
    runs to the nearest following ')' or ']' with a single space.

    Parameters
    ----------
    s: (str)
        Raw lyrics text.

    Returns
    ----------
    s : (str)
        The cleaned text.
    '''
    ascii_only = s.encode('ascii', 'ignore').decode()
    single_line = ascii_only.replace('\n', ' ').replace('-', ' ')
    return re.sub(r'[\[].*?[\)\]]', ' ', single_line)

In [7]:
def lyrics_builder(song_names, artist, album_id, genius):
    '''
    Fetch the lyrics of every song on an album and merge them into one
    cleaned string.

    Parameters
    ----------
    song_names: (list)
        Song titles to look up.
    artist: (str)
        Artist name passed along with each title.
    album_id: (str)
        Stored under 'album_id' in the result.
    genius: (api obj)
        Client exposing ``search_song(title, artist)``; each hit must
        have a ``lyrics`` attribute.

    Returns
    ----------
    lyrics_dict : (defaultdict)
        'album_id' and 'lyrics' (all found lyrics concatenated, then
        passed through ``decode_lyrics``).
    '''
    lyrics_dict = defaultdict(list)
    lyrics_dict['album_id'] = album_id

    found_lyrics = []
    for song in song_names:
        song_search = genius.search_song(song, artist)
        # A search with no match yields None; skip it rather than letting
        # one missing song abort the whole album with an AttributeError.
        if song_search is None:
            continue
        found_lyrics.append(song_search.lyrics)

    # Joined without a separator, matching how the lyrics were previously
    # concatenated.
    lyrics_dict['lyrics'] = decode_lyrics(''.join(found_lyrics))

    return lyrics_dict

In [8]:
def genres_builder(album_id, genres):
    '''
    Pair an album ID with a genre list in one record.

    Parameters
    ----------
    album_id: (str)
        Stored under 'album_id'.
    genres: (list)
        Stored, as given, under 'genres'.

    Returns
    ----------
    genre_dict : (defaultdict)
        Record with the keys 'album_id' and 'genres', in that order.
    '''
    genre_dict = defaultdict(list)
    genre_dict.update(album_id=album_id, genres=genres)
    return genre_dict

In [9]:
def audio_features_builder(album_id, audio_features):
    '''
    Average the per-track audio features of an album.

    Parameters
    ----------
    album_id: (str)
        Stored under 'album_id' in the result.
    audio_features: (list)
        One dict of features per track. The dicts are read, never
        modified, so the same list can be passed again. Entries that are
        None (e.g. a track with no features available) are skipped.

    Returns
    ----------
    audio_feat_dict : (defaultdict)
        'album_id' plus, for every key outside the descriptive ones
        ('type', 'id', 'uri', 'track_href', 'analysis_url'), the sum of
        that key over the tracks divided by the number of usable tracks.
    '''
    audio_feat_dict = defaultdict(list)
    # Descriptive string fields that cannot be averaged.
    keys_to_skip = {'type', 'id', 'uri', 'track_href', 'analysis_url'}

    audio_feat_dict['album_id'] = album_id

    usable_tracks = [track_info for track_info in audio_features
                     if track_info is not None]

    # Accumulate into a fresh dict instead of popping keys off the
    # caller's dicts: mutating the input made a second call (or a cell
    # re-run) fail with KeyError.
    totals = {}
    for track_info in usable_tracks:
        for key, value in track_info.items():
            if key not in keys_to_skip:
                totals[key] = totals.get(key, 0) + value

    # totals is empty when there are no usable tracks, so no division by zero.
    for key, total in totals.items():
        audio_feat_dict[key] = total / len(usable_tracks)

    return audio_feat_dict

# Test and build df

In [10]:
# CONST and API instantiation

# get_keys() returns the lines of api_keys.txt; exactly three are expected,
# in this order, or the unpacking raises ValueError.
client_id, client_secret, client_access_token = get_keys()
# Source of the random search words (see get_rand_word).
gen = DocumentGenerator()
# Search settings passed to sp.search in the collection loops.
q_type='album'
limit = 50
# Spotify client authenticated with the client-credentials pair.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id,
                                                          client_secret=client_secret))
# Genius client used by lyrics_builder for lyric lookups.
genius = lyricsgenius.Genius(client_access_token=client_access_token)

## --- Test single album ---

In [11]:
# Album ID used for the single-album test run.
test_id = '4yP0hdKOZPNshxUOjY0cZj'
# album_tracks = sp.album_tracks(test_id)
# Fetch the album object for that ID.
album = sp.album(test_id)

## --- Test complete ---

In [12]:
# Split the album object into its one-row record and the parallel
# track-name / track-ID lists.
album_dict, tracks, track_ids = album_builder_test(album)
#to mongo, to genius + album_dict['artist'][0], to get features

In [13]:
# Every album_dict value is a one-element list, hence the [0].
artist = album_dict['artist'][0]
artist_id = album_dict['artist_id'][0]
album_id = album_dict['album_id'][0]

In [14]:
# Genres are taken from the artist object; the album record's own 'genres'
# entry is empty in the album_dict output shown in this notebook.
genres = sp.artist(artist_id)['genres']

In [15]:
# Per-track audio features for all tracks of the album, in one request.
audio_features = sp.audio_features(track_ids)

In [16]:
# Record linking the album ID to the artist's genres.
genres_dict = genres_builder(album_id, genres)

In [17]:
# One Genius search per track; the client prints a progress line for each.
lyrics_dict = lyrics_builder(tracks, artist, album_id, genius)
# to mongo

Searching for "Alone Again" by The Weeknd...
Done.
Searching for "Too Late" by The Weeknd...
Done.
Searching for "Hardest To Love" by The Weeknd...
Done.
Searching for "Scared To Live" by The Weeknd...
Done.
Searching for "Snowchild" by The Weeknd...
Done.
Searching for "Escape From LA" by The Weeknd...
Done.
Searching for "Heartless" by The Weeknd...
Done.
Searching for "Faith" by The Weeknd...
Done.
Searching for "Blinding Lights" by The Weeknd...
Done.
Searching for "In Your Eyes" by The Weeknd...
Done.
Searching for "Save Your Tears" by The Weeknd...
Done.
Searching for "Repeat After Me (Interlude)" by The Weeknd...
Done.
Searching for "After Hours" by The Weeknd...
Done.
Searching for "Until I Bleed Out" by The Weeknd...
Done.


In [18]:
# Album-level record: each audio feature averaged over the tracks.
features_dict = audio_features_builder(album_id, audio_features)

### Testing dictionaries for mongo db

In [19]:
# Inspect the album record (every value is a one-element list).
album_dict

defaultdict(list,
            {'artist': ['The Weeknd'],
             'artist_id': ['1Xyo4u8uXC1ZmMpatF05PJ'],
             'album': ['After Hours'],
             'album_type': ['album'],
             'album_id': ['4yP0hdKOZPNshxUOjY0cZj'],
             'album_label': ['Republic Records'],
             'upc_code': ['00602508924224'],
             'album_popularity': [96],
             'release_date': ['2020-03-20'],
             'release_prec': ['day'],
             'genres': [[]],
             'album_release': ['After Hours by The Weeknd']})

In [20]:
# Inspect the lyrics record; 'lyrics' holds the whole album as one string.
lyrics_dict

defaultdict(list,
            {'album_id': '4yP0hdKOZPNshxUOjY0cZj',
             'lyrics': "  Take off my disguise I'm living someone else's life Suppressing who I was inside So I throw twothousandones in thesky Together we're alone (Together we're alone) InVegas, I feel so at home (In Vegas, I feelsoathome) I'm falling onlyfor the night SoI throw two thousand ones in the sky (The sky) Oh, oh, oh, how much to light up my star again And rewire all my thoughts? Oh, baby, won't you remind me what I am And break, break my little cold heart?    Oh, oh, oh, oh Oh oh, oh oh oh, oh, oh oh Oh oh, oh, oh oh oh, oh Oh, oh oh oh, oh    Call me up and I'll send for you Take me down to your altitude I don't know if I can be alone again (Alone again) I don't know if I can sleep alone again (Alone again) Check my pulse for a second time (A second time) I took too much, I don't wanna die I don't know if I can be alone again (Alone again) I don't know if I can sleep alone again    Woah oh oh oh oh Woah

In [23]:
# Inspect the genres record.
genres_dict

defaultdict(list,
            {'album_id': '4yP0hdKOZPNshxUOjY0cZj',
             'genres': ['canadian contemporary r&b', 'canadian pop', 'pop']})

In [24]:
# Inspect the averaged audio features.
features_dict

defaultdict(list,
            {'album_id': '4yP0hdKOZPNshxUOjY0cZj',
             'danceability': 0.5012857142857142,
             'energy': 0.6405000000000001,
             'key': 3.2857142857142856,
             'loudness': -7.085928571428572,
             'mode': 0.42857142857142855,
             'speechiness': 0.06860714285714285,
             'acousticness': 0.12228642857142857,
             'instrumentalness': 0.011832464999999999,
             'liveness': 0.1935642857142857,
             'valence': 0.25056428571428574,
             'tempo': 121.55292857142857,
             'duration_ms': 241255.2857142857,
             'time_signature': 3.9285714285714284})

In [25]:
# The one-element lists in album_dict become a single-row frame.
album_df = pd.DataFrame(album_dict)
album_df

Unnamed: 0,artist,artist_id,album,album_type,album_id,album_label,upc_code,album_popularity,release_date,release_prec,genres,album_release
0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,After Hours,album,4yP0hdKOZPNshxUOjY0cZj,Republic Records,602508924224,96,2020-03-20,day,[],After Hours by The Weeknd


In [26]:
# Both values are plain scalars, so an explicit one-row index is supplied.
lyric_df = pd.DataFrame(lyrics_dict, index=[0])
lyric_df

Unnamed: 0,album_id,lyrics
0,4yP0hdKOZPNshxUOjY0cZj,Take off my disguise I'm living someone else...


In [27]:
# 'genres' is a list and 'album_id' a scalar, so the frame gets one row per
# genre with the album ID repeated (see the output).
genres_df = pd.DataFrame(genres_dict)
genres_df

Unnamed: 0,album_id,genres
0,4yP0hdKOZPNshxUOjY0cZj,canadian contemporary r&b
1,4yP0hdKOZPNshxUOjY0cZj,canadian pop
2,4yP0hdKOZPNshxUOjY0cZj,pop


In [28]:
# All values are scalars, so an explicit one-row index is supplied.
feat_df = pd.DataFrame(features_dict, index=[0])
feat_df

Unnamed: 0,album_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,4yP0hdKOZPNshxUOjY0cZj,0.501286,0.6405,3.285714,-7.085929,0.428571,0.068607,0.122286,0.011832,0.193564,0.250564,121.552929,241255.285714,3.928571


# Create the list of album IDs (the existing 511 plus newly plucked ones); target: 100k

In [None]:
from pymongo import MongoClient

# NOTE(review): the database/collection names and the JSON path below do not
# match the album data handled elsewhere in this notebook — confirm this
# cell is still needed.
path_to_json = '../data/data.json'

# Read the documents first so a bad path or malformed file fails before a
# connection is opened. `json` comes from the notebook's import cell.
with open(path_to_json) as f:
    file_data = json.load(f)

client = MongoClient('localhost', 27017)
try:
    db = client['fraud_analysis']
    coll = db['data_raw']
    coll.insert_many(file_data)
finally:
    # Release the connection even when the insert raises.
    client.close()

In [None]:
# Connection and database for the album records.
client = MongoClient('localhost', 27017)
db = client['album_info']

# One collection handle per record type built above.
# NOTE(review): a bare lookup that used to sit in this cell spelled the
# master collection 'album_id_master'; the handle uses 'album_master_id'.
# Confirm which name is intended before writing to it.
master_coll = db['album_master_id']
album_coll = db['albums']
lyric_coll = db['lyrics']
genre_coll = db['genres']
feat_coll = db['feat_df']



In [None]:
# Store the single-album record in the 'albums' collection.
'''dicts for mongo'''
album_coll.insert_one(album_dict)

In [None]:
# Store the lyrics record in the 'lyrics' collection.
lyric_coll.insert_one(lyrics_dict)

In [None]:
# Store the genres record in the 'genres' collection.
genre_coll.insert_one(genres_dict)

In [None]:
# Store the averaged audio features in the 'feat_df' collection.
feat_coll.insert_one(features_dict)

# automate process

In [39]:
# Accumulators for the collection run.
plucked_album_ids = []
MASTER_IDS = []

# NOTE: unpickling can execute arbitrary code; only load pickle files from
# a trusted source.
# The context manager closes the file once the frames are loaded.
with open('dfs_w_lyrics.pkl', 'rb') as pkl_file:
    dfs = pickle.load(pkl_file)

# Combine the pickled frames and keep the distinct album IDs they contain.
df = pd.concat(dfs)
album_ids_baseline = df['album_id'].unique()

In [None]:
# 100 album searches, each on a random word at a random offset (0-9).
# Every query appends its own list of IDs, so plucked_album_ids is a list
# of lists that has to be flattened afterwards.
for _ in range(100):
    album_query = sp.search(q=get_rand_word(gen), limit=limit, offset=get_rand_offset(), type=q_type)
    plucked_album_ids.append(get_album_ids_from_query(album_query))

## test unique album pulls

In [29]:
# Accumulators for the unique-pull test.
plucked_album_ids = []
BASELINE_ID_LIST = []

# NOTE: unpickling can execute arbitrary code; only load pickle files from
# a trusted source.
# The context manager closes the file once the frames are loaded.
with open('dfs_w_lyrics.pkl', 'rb') as pkl_file:
    dfs = pickle.load(pkl_file)

# Combine the pickled frames and keep the distinct album IDs they contain.
df = pd.concat(dfs)
album_ids_baseline = df['album_id'].unique()

In [None]:
# Copy the baseline IDs into a plain list, one ID per iteration.
# Re-running this cell appends the same IDs again.
for album_ids in album_ids_baseline:
    BASELINE_ID_LIST.append(album_ids)

In [30]:
# 10,000 album searches, each on a random word at a random offset (0-9)
# with `limit` results per query. Each query appends one list of IDs, so
# plucked_album_ids is a list of lists; the same album can be returned by
# several queries (see the plucked-vs-unique counts printed further down).
for _ in range(10000):
    album_query = sp.search(q=get_rand_word(gen), limit=limit, offset=get_rand_offset(), type=q_type)
    plucked_album_ids.append(get_album_ids_from_query(album_query))
# flattened_album_ids = [id_ for sublist in plucked_album_ids for id_ in sublist]

In [31]:
# Flatten the per-query lists into one list of IDs (duplicates kept).
flattened_album_ids = [id_ for sublist in plucked_album_ids for id_ in sublist]
# Despite the name, this is the COUNT of distinct IDs, not the IDs themselves.
unique_album_ids = pd.Series(flattened_album_ids).nunique()

In [32]:
# Total IDs collected versus how many of them are distinct.
print(f'{len(flattened_album_ids)} plucked, {unique_album_ids} are unique')

435215 plucked, 156285 are unique


In [46]:
# Drop the large ID lists from the kernel namespace.
# NOTE(review): this cell's execution count (46) is later than that of the
# cells below, which still read flattened_album_ids; run top to bottom,
# those cells would raise NameError. Move this to the end of the section.
del flattened_album_ids
del plucked_album_ids

In [33]:
# Add every master ID that is not already among the plucked IDs.
# The lookup set is built once and kept in sync with the appends; building
# set(flattened_album_ids) inside the loop rescanned the whole list for
# every master ID.
seen_ids = set(flattened_album_ids)
for id_ in MASTER_IDS:
    if id_ not in seen_ids:
        flattened_album_ids.append(id_)
        seen_ids.add(id_)

In [34]:
# Recount after merging in the master IDs (again a count, not the IDs).
unique_album_ids = pd.Series(flattened_album_ids).nunique()
print(f'{len(flattened_album_ids)} plucked, {unique_album_ids} are unique')

435215 plucked, 156285 are unique


In [38]:
# The distinct album IDs themselves (an array, as the preview shows).
unique_ids = pd.Series(flattened_album_ids).unique()

In [39]:
# Preview the first ten distinct IDs.
unique_ids[:10]

array(['3RoG4xBbhThunES2bojEm7', '1awdBA8DLv6G5eBvdGxk8T',
       '0XHpO9qTpqJJQwa2zFxAAE', '5bcobCRR5ovNoZWAp3iJgG',
       '1coFYbyXU2dxO3lrUsbdIy', '6Zho4ar8UMxJLMDpWcLGto',
       '07OjWJVHVzsbZ5ytXhj7ou', '6xBdoRdtUb2Wl23WkyJGQj',
       '7nWW2h7SFTPQJgJX0h1IMA', '4hxJ8XzBPYGHeHRFhEd42y'], dtype=object)

In [42]:
# Long listing of the working directory, hidden files included, sizes
# human-readable.
!ls -ahl

total 24M
drwxrwxr-x 5 ubuntu ubuntu 4.0K Sep 20 17:58 .
drwxrwxr-x 3 ubuntu ubuntu 4.0K Sep 20 15:46 ..
drwxrwxr-x 2 ubuntu ubuntu 4.0K Sep 20 16:00 .ipynb_checkpoints
-rw-rw-r-- 1 ubuntu ubuntu 4.8M Sep 20 17:59 MASTER_ALBUM_IDS.pkl
-rw-r--r-- 1 ubuntu ubuntu 4.2K Sep 20 14:05 album_scraper.py
-rw-r--r-- 1 ubuntu ubuntu  131 Sep 20 14:05 api_keys.txt
drwxrwxr-x 4 ubuntu ubuntu 4.0K Sep 12 02:28 bin
-rw-r--r-- 1 ubuntu ubuntu  18M Sep 20 14:05 dfs_w_lyrics.pkl
-rw-rw-r-- 1 ubuntu ubuntu  83K Sep 20 14:05 final_albums_df.pkl
drwxrwxr-x 2 ubuntu ubuntu 4.0K Sep 11 21:51 logs
-rw-rw-r-- 1 ubuntu ubuntu  16K Sep 20 14:05 my_personal_albums.ipynb
-rw-r--r-- 1 ubuntu ubuntu 661K Sep 20 14:05 scripting_test.ipynb
-rw-rw-r-- 1 ubuntu ubuntu  51K Sep 20 17:58 server_script_test.ipynb


In [40]:
# Persist the distinct album IDs. The context manager flushes and closes
# the file; the bare open() left that to garbage collection.
with open('MASTER_ALBUM_IDS.pkl', 'wb') as master_ids_file:
    pickle.dump(unique_ids, master_ids_file)

## test stop

In [10]:
# Work on a copy so appends to GROWING_IDs leave MASTER_IDS unchanged.
GROWING_IDs = MASTER_IDS.copy()

In [11]:
# Number of IDs the run starts from.
len(GROWING_IDs)

511

In [10]:
# move out of for loop and test single track
# After Hours:
# album_id = '4yP0hdKOZPNshxUOjY0cZj'
# Alone Again:
# track_id = '6b5P51m8xx2XA6U7sdNZ5E'

# NOTE(review): best-guess repair of a work-in-progress cell. It previously
# looped with an unused variable while testing a stale global `album_id`,
# called the `album` dict as a function, hid the fetch code behind
# `continue`, and called an undefined `album_builder` — confirm the flow.

# IDs handled so far: the master list, plus each new ID as it is processed
# so duplicates in flattened_album_ids are fetched only once.
known_ids = set(MASTER_IDS)

for album_id in flattened_album_ids:
    if album_id in known_ids:
        continue
    known_ids.add(album_id)
    GROWING_IDs.append(album_id)

    # Same steps as the single-album test above.
    album = sp.album(album_id)
    album_dict, tracks, track_ids = album_builder_test(album)
    album_df = pd.DataFrame(album_dict)
    # TODO: load to mongo as dict

    #TODO: test album
    # lyrics_builder expects the list of track names, the artist name, the
    # album ID and the Genius client (not the access token).
    lyric_instance = lyrics_builder(tracks,
                                    album_dict['artist'][0],
                                    album_id,
                                    genius)
    # TODO: load to mongodb as dict