In [1]:
import pandas as pd
import requests
import json
import datetime
from sqlalchemy import create_engine
import time
from IPython.core.display import clear_output
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
from dotenv import load_dotenv
import os
load_dotenv()

True

In [3]:
### Last.fm Connection ###
# Credentials are read from the process environment.
LASTFM_KEY = os.getenv('LASTFM_KEY')
LASTFM_SECRET = os.getenv('LASTFM_SECRET')
LASTFM_USERNAME = 'sabbouma10'
# Key looked for on each track entry when parsing recent-tracks responses;
# entries carrying it are skipped (presumably the "now playing" marker).
LASTFM_NOW_PLAYING = '@attr'
LASTFM_URL = 'https://ws.audioscrobbler.com/2.0/'


### SQL Connection ###
SQL_PASSWORD = os.getenv('SQL_PASSWORD')
SQL_DB = os.getenv('SQL_DATABASE')
# Local MySQL server, root user; password and schema come from the environment.
SQL_URL_TEMPLATE = 'mysql+mysqlconnector://root:{password}@localhost:3306/{database}'
engine = create_engine(SQL_URL_TEMPLATE.format(password=SQL_PASSWORD, database=SQL_DB))
connection = engine.connect()

### Spotify Connection ###
client_id = os.getenv('client_id')
client_secret = os.getenv('client_secret')
username = os.getenv('username')
playlist_id = os.getenv('playlist_id')

# OAuth scopes requested for the user token, sent as one space-separated string.
SPOTIFY_SCOPES = (
    "playlist-read-private",
    "playlist-modify-public",
    "playlist-modify-private",
    "user-read-playback-state",
    "user-library-read",
    "user-read-recently-played",
)
scope = " ".join(SPOTIFY_SCOPES)

SPOTIFY_REDIRECT_URI = 'https://developer.spotify.com/dashboard/applications/09d24c54f26846ffa2d5a8558bec67ec'
token = util.prompt_for_user_token(
    username,
    scope,
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=SPOTIFY_REDIRECT_URI,
)
sp = spotipy.Spotify(auth=token)

In [3]:
def lastfm_get(payload):
    """Send a GET request to the Last.fm API and return the raw response.

    Parameters
    ----------
    payload : dict
        Query parameters for the call (e.g. ``method``, ``user``, ``page``).
        It is not modified; the API key and ``format=json`` are added to a
        copy so the caller's dict never ends up holding the key.

    Returns
    -------
    requests.Response
        The unparsed response; the caller checks ``status_code`` and calls
        ``.json()``.
    """
    headers = {'user-agent': LASTFM_USERNAME}

    # Build a fresh dict instead of writing into the caller's payload.
    params = dict(payload)
    params['api_key'] = LASTFM_KEY
    params['format'] = 'json'

    return requests.get(LASTFM_URL, headers=headers, params=params)

In [11]:
# Count the rows currently stored in scrobbles_raw.
scrobble_count_sql = """
    SELECT
        COUNT(*)
    FROM
        scrobbles_raw
    """
count_rows = connection.execute(scrobble_count_sql).fetchall()
# Single row, single column: the count itself.
page = count_rows[0][0]

In [12]:
# Number of full 200-row pages already stored. The count query in the previous
# cell binds its result to `page`; no variable `c` is defined in this notebook,
# so the old expression raised NameError on a fresh kernel.
page//200

504

In [156]:
# Download recent-track pages from Last.fm into `responses`, using a page
# number stored in current_page_lastfm.txt to limit how many pages are fetched.
page = 1
total_pages = 99999  # placeholder; replaced after the first successful response
responses = []
old_page = -1
page_state = 0
# Read the checkpoint: the file must hold a single integer. `old_page` becomes
# that value minus one and `page_state` flips to 1.
# NOTE(review): a missing file raises FileNotFoundError here; there is no
# first-run path that creates it.
with open("current_page_lastfm.txt", "r") as f:
    try:
        old_page = int(f.read()) - 1
        page_state = 1
    except ValueError as e:
        raise ValueError('File does not contain a valid page number')

while page <= total_pages:

    payload = {
        'method': 'user.getrecenttracks',
        'user':LASTFM_USERNAME,
        'limit':200,
        'page':page
    }

    # print some output so we can see the status
    print("Requesting page {}/{}".format(page, total_pages))
    # clear the output to make things neater
    clear_output(wait = True)

    # make the API call
    response = lastfm_get(payload)

    # if we get an error, print the response and halt the loop
    if response.status_code != 200:
        print(response.text)
        break
    # On the first successful response only, shrink total_pages to the API's
    # totalPages minus the checkpointed old_page, plus one; later iterations
    # leave total_pages alone because page_state is then 2.
    if page_state == 1:
        total_pages = int(response.json()['recenttracks']['@attr']['totalPages'])
        total_pages = total_pages - old_page + 1
        page_state = 2
    # NOTE(review): page_state is always 1 when the loop starts (the read above
    # either sets it or raises), so this branch looks unreachable — confirm.
    elif page_state == 0:
        total_pages = int(response.json()['recenttracks']['@attr']['totalPages'])
        
    
    # extract pagination info
    # (page is re-synchronised with the page number reported by the API)
    page = int(response.json()['recenttracks']['@attr']['page'])
    

    # append response
    responses.append(response)

    # if it's not a cached result, sleep
    # (responses without a truthy `from_cache` attribute trigger a 0.25 s pause)
    if not getattr(response, 'from_cache', False):
        time.sleep(0.25)

    # increment the page number
    page += 1

# Persist the new checkpoint: the next page number offset by old_page.
# NOTE(review): this also runs after the error `break` above, so the checkpoint
# is rewritten even when the download stopped early — confirm that is intended.
with open("current_page_lastfm.txt", "w") as f:
    f.write("{}".format(page+old_page))

Requesting page 1/99999


In [157]:
# Flatten every downloaded page into one list of track entries, dropping any
# entry that carries the LASTFM_NOW_PLAYING key.
scrobbles = [
    song
    for response in responses
    for song in response.json()['recenttracks']["track"]
    if LASTFM_NOW_PLAYING not in song.keys()
]

# Pull out only the fields that are stored.
song_names = [song["name"] for song in scrobbles]
artist_names = [song["artist"]["#text"] for song in scrobbles]
album_names = [song["album"]["#text"] for song in scrobbles]
timestamps = [
    datetime.datetime.fromtimestamp(int(song['date']['uts']))
    for song in scrobbles
]

# The frame is built in the reverse of the order the entries were collected.
song_columns = ["song_name", "artist_name", "album_name", "timestamp"]
song_dict = {
    "song_name": list(reversed(song_names)),
    "artist_name": list(reversed(artist_names)),
    "album_name": list(reversed(album_names)),
    "timestamp": list(reversed(timestamps)),
}

song_df = pd.DataFrame(song_dict, columns=song_columns)

# Nudge repeated timestamps forward one second at a time until every
# timestamp in the frame is unique.
duplicate_mask = song_df.duplicated(subset=['timestamp'])
while duplicate_mask.any():
    song_df.loc[duplicate_mask, 'timestamp'] += datetime.timedelta(seconds=1)
    duplicate_mask = song_df.duplicated(subset=['timestamp'])

In [159]:
# Find the newest scrobble already stored so only newer rows are appended.
req = connection.execute(
    """
        SELECT
        MAX(timestamp)
    FROM
        scrobbles_raw
    """
)
latest_stored = req.fetchall()[0][0]
# MAX() over an empty table returns NULL (None). Comparing a datetime column
# with None raises TypeError, so in that case every fetched row is kept.
if latest_stored is not None:
    song_df = song_df.loc[song_df['timestamp'] > latest_stored]
song_df.to_sql('scrobbles_raw', con=connection, index=False, if_exists='append')

In [188]:
# Hand-maintained metadata for albums that are looked up by name in
# `local_files`. The six lists below are parallel: position i in each list
# describes the same album.
NAMES = [
    'Exmilitary',
    'The Perfect Prescription',
    'Microphones in 2020',
    '2049 (DELUXE EDITION)',
    'Guruh Gipsy',
]
LABELS = [
    'Third Worlds',
    'Glass',
    'P.W. Elverum & Sun',
    'Toasty Digital',
    'Pramaqua',
]
ARTS = [
    'https://e.snmc.io/i/fullres/w/261b220ac09be6686d078e5f7e1a27b1/7024368',
    'https://e.snmc.io/i/fullres/w/21af1a0b9cce1179443bd4559ead6385/1448700',
    'https://e.snmc.io/i/fullres/w/2a91f1444ba652361e2d58d57c2b33d9/8449827',
    'https://e.snmc.io/i/fullres/w/67e3581a08d9378d52383cec624e0011/8438288',
    'https://cdn2.albumoftheyear.org/345x/album/78896-guruh-gipsy.jpg',
]
YEARS = [2011, 1987, 2020, 2020, 1977]
RELEASE_DATES = ["2011-05-27", "1987-09-01", "2020-08-07", "2020-09-07", "1977-12-01"]
N_TRACKS = [13, 8, 1, 18, 7]

# Album name -> metadata record. Popularity and Spotify ID are fixed at -1
# for these entries.
local_files = {
    album_name: {
        'Label': label,
        'Year': year,
        'Release Date': datetime.datetime.strptime(release_date, "%Y-%m-%d"),
        'Album Art': art_url,
        'No. Tracks': track_count,
        'Popularity': -1,
        'Spotify ID': -1,
    }
    for album_name, label, year, release_date, art_url, track_count
    in zip(NAMES, LABELS, YEARS, RELEASE_DATES, ARTS, N_TRACKS)
}

# Album name -> year that replaces the year derived from the release date.
FIX_KEYS = [
    'Green',
    'Zombie',
    'New Mexico',
    'Mambo Nassau Remastered',
    'Souvlaki',
    'Wait Long By the River and the Bodies of Your Enemies Will Float By',
    'Paebiru',
]
FIX_YEARS = [1986, 1977, 1982, 1981, 1993, 2005, 1975]
non_local_files = dict(zip(FIX_KEYS, FIX_YEARS))

In [189]:
def add_albums(album_subset:list, all_albums:list):
    """Fetch the pending batch of album ids and append the results.

    When ``album_subset`` is empty nothing is requested and both arguments
    are returned unchanged. Otherwise the ids are passed to ``sp.albums``,
    every album in the response is appended to ``all_albums`` (in place), a
    progress line is printed, and a fresh empty list is returned as the new
    batch.

    Reads the module-level names ``sp``, ``count`` and ``max_iter``; the
    latter two only feed the progress message.

    Returns
    -------
    tuple
        ``(album_subset, all_albums)`` — the (possibly reset) batch and the
        accumulated album list.
    """
    if not len(album_subset) > 0:
        return album_subset, all_albums

    fetched = sp.albums(album_subset)
    all_albums.extend(fetched['albums'])
    print("Completed requesting  {}/{} albums ".format(count, max_iter))
    clear_output(wait = True)
    return [], all_albums

In [190]:
# --- Download every page of the playlist -----------------------------------
page = 1
total_pages = 9999  # placeholder; replaced from the first response's 'total'
responses = []
limit_playlist = 100

while page <= total_pages:
    print("Requesting playlist page {}/{}".format(page, total_pages))
    clear_output(wait = True)
    response = sp.playlist_tracks(playlist_id, fields=None, limit=limit_playlist, offset=(page-1)*limit_playlist, market=None)
    if len(response['items']) == 0:
        print("Failed:\n{}".format(response))
        break
    # Ceiling division by the page size actually requested. The previous
    # int(total/100)+1 hard-coded the page size and asked for one page too
    # many whenever the total was an exact multiple of it, which then hit the
    # "Failed" branch above on an empty page.
    total_pages = -(-int(response['total']) // limit_playlist)
    responses.append(response)
    page += 1

playlist_items = [item for response in responses for item in response['items']]

# One entry per playlist item: the album id string when there is one,
# otherwise the whole track object (handled as a special case further down).
album_ids = [
    item['track']['album']['id']
    if item['track']['album']['id'] is not None
    else item['track']
    for item in playlist_items
]

added_ats = [
    datetime.datetime.strptime(item['added_at'], "%Y-%m-%dT%H:%M:%SZ")
    for item in playlist_items
]

# --- Resolve album ids to full album objects, in playlist order -------------
max_iter = len(album_ids)
count = 0   # add_albums() reads this module-level name for its progress line
lim=20      # ids per add_albums() batch (presumably the endpoint's per-call cap)
all_albums = []
album_subset = []

for count, album_id in enumerate(album_ids):
    if not isinstance(album_id, str):
        # Not an id but a track object: flush the pending ids first so the
        # order of all_albums keeps matching added_ats, then store it as-is.
        album_subset, all_albums = add_albums(album_subset, all_albums)
        all_albums.append(album_id)
        continue
    if len(album_subset) >= lim:
        # Batch is full: request it before queueing this id.
        album_subset, all_albums = add_albums(album_subset, all_albums)
    album_subset.append(album_id)

# Final flush for whatever is still pending (a no-op when nothing is).
count = max_iter
album_subset, all_albums = add_albums(album_subset, all_albums)

Completed requesting  497/497 albums 


In [191]:
def _parse_release_date(date):
    """Parse a release date given as 'YYYY-MM-DD', 'YYYY' or 'YYYY-MM'.

    Returns a ``datetime.date``; raises ValueError when none of the three
    formats matches.
    """
    for fmt in ("%Y-%m-%d", "%Y", "%Y-%m"):
        try:
            return datetime.datetime.strptime(date, fmt).date()
        except ValueError:
            continue
    raise ValueError("Unrecognised release date: {!r}".format(date))


# One list per output column; every album must append to all of them so the
# lists stay the same length as added_ats.
album_names = []
artist_names = []
release_dates = []
years = []
n_tracks = []
labels = []
popularities = []
spotify_ids = []
album_arts = []

for album in all_albums:
    if album['type']=='track':
        # A track object rather than an album: its metadata comes from the
        # hand-maintained local_files table, keyed by album name.
        name = album['album']['name']
        if name not in local_files:
            # Previously only the name and artist were appended here, leaving
            # the column lists ragged and failing later inside pd.DataFrame
            # with an unrelated-looking length error.
            raise KeyError(
                "Album {!r} has no entry in local_files; add its metadata "
                "before building the album table".format(name)
            )
        local_album = local_files[name]
        album_names.append(name)
        artist_names.append(album['artists'][0]['name'])
        release_dates.append(local_album['Release Date'])
        years.append(local_album['Year'])
        n_tracks.append(local_album['No. Tracks'])
        labels.append(local_album['Label'])
        popularities.append(local_album['Popularity'])
        spotify_ids.append(local_album['Spotify ID'])
        album_arts.append(local_album['Album Art'])

    else:
        name = album['name']
        album_names.append(name)
        artist_names.append(album['artists'][0]['name'])
        release_date = _parse_release_date(album['release_date'])
        release_dates.append(release_date)
        # non_local_files overrides the year for specific album names.
        if name in non_local_files:
            years.append(non_local_files[name])
        else:
            years.append(release_date.year)
        n_tracks.append(int(album['total_tracks']))
        labels.append(album['label'])
        popularities.append(int(album['popularity']))
        spotify_ids.append(album['id'])
        album_arts.append(album['images'][0]['url'])

album_dict = {
    "Album":album_names,
    "Album_Artist":artist_names,
    "Release_Date":release_dates,
    "Year":years,
    "No._Tracks":n_tracks,
    "Label":labels,
    "Popularity":popularities,
    "Spotify_ID":spotify_ids,
    "Album_Art":album_arts,
    "Date_Added":added_ats
}

spotify_album_df = pd.DataFrame(album_dict)

In [192]:
# Fetch the Spotify ids that are already stored in spotify_albums_raw.
req = connection.execute(
    """
    SELECT
        Spotify_ID
    FROM
        spotify_albums_raw;
    """
)
ids = [row[0] for row in req.fetchall()]

# Keep only the albums whose id (compared as a string) is not stored yet.
already_stored = spotify_album_df['Spotify_ID'].astype(str).isin(ids)
spotify_album_df = spotify_album_df.loc[~already_stored]

In [193]:
# Continue the album index where the stored table leaves off.
req = connection.execute(
    """
    SELECT
        MAX(spotify_albums_raw.index)
    FROM
        spotify_albums_raw
    """
)
max_index = req.fetchall()[0][0]
# MAX() over an empty table returns NULL (None), and None + 1 raises
# TypeError; start numbering at 0 in that case.
current_index = 0 if max_index is None else max_index + 1
spotify_album_df.index = range(current_index, current_index +len(spotify_album_df))

In [None]:
# `datetime` is the module in this notebook (imported with `import datetime`),
# so the class has to be reached as datetime.datetime; calling the module
# itself raises TypeError.
datetime.datetime(2021,11,15)

In [194]:
# Map every album's artist name to an id in artists_raw, creating ids for
# artists that are not stored yet, then persist the new artists and albums.
try:
    artists_df = pd.read_sql('artists_raw', con=connection)
except Exception as exc:
    # NOTE(review): presumably this only happens on the very first run, before
    # artists_raw exists — confirm. Every artist is then treated as new.
    print("Could not read artists_raw ({}); treating every artist as new".format(exc))
    artists_df = pd.DataFrame(columns=['index', 'Name'])

# Name -> stored id. The first stored row wins when a name is stored twice.
artist_ids = {}
for stored_id, stored_name in zip(artists_df['index'], artists_df['Name']):
    artist_ids.setdefault(stored_name, stored_id)

# New ids continue after the highest stored one (or start at 0).
next_index = int(artists_df['index'].max()) + 1 if len(artists_df) > 0 else 0

indexes = []
new_indexes = []
new_artists = []
for artist_name in spotify_album_df['Album_Artist']:
    if artist_name not in artist_ids:
        # Register the new artist straight away so later albums by the same
        # artist reuse this id. Previously each such album received its own
        # id and a duplicate artists_raw row.
        artist_ids[artist_name] = next_index
        new_indexes.append(next_index)
        new_artists.append(artist_name)
        next_index += 1
    indexes.append(artist_ids[artist_name])

new_artists_df = pd.DataFrame(new_artists, index=new_indexes, columns=['Name'])

# Build a new frame rather than writing a column into a filtered slice.
spotify_album_df = (
    spotify_album_df
    .assign(Album_Artist_ID=indexes)
    .drop(columns='Album_Artist')
)

new_artists_df.to_sql('artists_raw', con=connection, if_exists='append')
spotify_album_df.to_sql('spotify_albums_raw', con=connection, if_exists='append')

# Best-effort schema tightening for the two tables written above. Each
# statement is attempted on its own so a failure of the first no longer skips
# the second, and the reason is reported instead of being silently discarded.
# NOTE(review): presumably these fail on every run after the first because the
# primary key already exists — confirm.
ALTER_STATEMENTS = [
    """
        ALTER TABLE `spotify`.`artists_raw` 
        CHANGE COLUMN `index` `index` BIGINT NOT NULL ,
        CHANGE COLUMN `Name` `Name` TEXT NOT NULL ,
        ADD PRIMARY KEY (`index`),
        ADD UNIQUE INDEX `index_UNIQUE` (`index` ASC) VISIBLE;
        ;
        """,
    """
        ALTER TABLE `spotify`.`spotify_albums_raw` 
        CHANGE COLUMN `index` `index` BIGINT NOT NULL ,
        CHANGE COLUMN `Album` `Album` TEXT NOT NULL ,
        CHANGE COLUMN `Album_Artist_ID` `Album_Artist_ID` BIGINT NOT NULL ,
        CHANGE COLUMN `Release_Date` `Release_Date` DATETIME NOT NULL ,
        CHANGE COLUMN `Year` `Year` BIGINT NOT NULL ,
        CHANGE COLUMN `No._Tracks` `No._Tracks` BIGINT NOT NULL ,
        CHANGE COLUMN `Label` `Label` TEXT NOT NULL ,
        CHANGE COLUMN `Popularity` `Popularity` BIGINT NOT NULL ,
        CHANGE COLUMN `Spotify_ID` `Spotify_ID` TEXT NOT NULL ,
        CHANGE COLUMN `Album_Art` `Album_Art` TEXT NOT NULL ,
        CHANGE COLUMN `Date_Added` `Date_Added` DATETIME NOT NULL ,
        ADD PRIMARY KEY (`index`),
        ADD UNIQUE INDEX `index_UNIQUE` (`index` ASC) VISIBLE;
        ;
        """,
]

for alter_statement in ALTER_STATEMENTS:
    try:
        connection.execute(alter_statement)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt still stops the notebook.
        print("Schema update skipped: {}".format(exc))


In [195]:
# Load the iTunes track export and normalise its numeric columns.
itunes_songs = pd.read_csv("all_itunes_tracks_spot")
# Rows without a disk number are given disk 1.
itunes_songs['Disk Number'] = itunes_songs['Disk Number'].fillna(1)
for int_column in ('Disk Number', 'Track Number', 'Time'):
    itunes_songs[int_column] = itunes_songs[int_column].astype(int)

In [196]:
# Reload both tables with the stored `index` column as the DataFrame index.
# The cells below take `.index[0]` of a matching row and store it as a foreign
# key (Album_ID / Artist_ID). Without index_col that value is the row's
# position in the query result, which matches the stored key only while rows
# come back in key order with no gaps.
spotify_album_df = pd.read_sql('spotify_albums_raw', con=connection, index_col='index')
artists_df = pd.read_sql('artists_raw', con=connection, index_col='index')

In [197]:
# One list per output column of the track table.
track_names = []
spotify_song_ids = []
track_album_ids = []
artist_names = []
times = []
explicits = []
track_nos = []
disc_nos = []

for album in all_albums:
    if album['type']=='track':
        # Track object: its tracks come from the iTunes export, matched on
        # album name. Song id and explicit flag are filled with -1.
        name = album['album']['name']
        artist = album['artists'][0]['name']
        local_tracks = itunes_songs.loc[itunes_songs['Album']==name]
        n_local = len(local_tracks)
        track_names.extend(local_tracks['Name'])
        spotify_song_ids.extend([-1]*n_local)
        album_id = spotify_album_df.loc[(spotify_album_df['Album']==name)].index[0]
        track_album_ids.extend([album_id]*n_local)
        artist_names.extend([artist]*n_local)
        times.extend(local_tracks['Time'])
        explicits.extend([-1]*n_local)
        track_nos.extend(local_tracks['Track Number'])
        disc_nos.extend(local_tracks['Disk Number'])
        continue

    # Album object: its album row is matched on Spotify id and the tracks are
    # read from the album payload itself.
    album_id = spotify_album_df.loc[spotify_album_df['Spotify_ID']==album['id']].index[0]
    album_tracks = album['tracks']['items']
    track_album_ids.extend([album_id]*len(album_tracks))
    for track in album_tracks:
        track_names.append(track['name'])
        spotify_song_ids.append(track['id'])
        artist_names.append(track['artists'][0]['name'] )
        # duration_ms is converted to whole seconds (truncated).
        times.append(int(track['duration_ms']/1000))
        explicits.append(track['explicit'])
        track_nos.append(track['track_number'])
        disc_nos.append(track['disc_number'])


spotify_songs_dict = {
    "Name":track_names,
    "Artist":artist_names,
    "Time":times,
    "Explicit":explicits,
    "Track_No.":track_nos,
    "Disc_No.":disc_nos,
    "Spotify_Song_ID":spotify_song_ids,
    "Album_ID":track_album_ids
}
spotify_songs_df = pd.DataFrame(spotify_songs_dict)


In [199]:
# Drop tracks whose Spotify id (compared as a string) is already stored.
req = connection.execute(
    """
    SELECT
        Spotify_Song_ID
    FROM
        spotify_tracks_raw;
    """
)
ids = [row[0] for row in req.fetchall()]
spotify_songs_df = spotify_songs_df.loc[~spotify_songs_df['Spotify_Song_ID'].astype(str).isin(ids)]

# Continue the track index where the stored table leaves off.
req = connection.execute(
    """
    SELECT
        MAX(spotify_tracks_raw.index)
    FROM
        spotify_tracks_raw
    """
)
max_index = req.fetchall()[0][0]
# MAX() over an empty table returns NULL (None), and None + 1 raises
# TypeError; start numbering at 0 in that case.
current_index = 0 if max_index is None else max_index + 1
spotify_songs_df.index = range(current_index, current_index +len(spotify_songs_df))

# Artist name -> index label of its first row in artists_df, built once
# instead of scanning artists_df for every track. Unknown artists map to -1.
artist_label_by_name = {}
for row_label, artist_name in zip(artists_df.index, artists_df['Name']):
    artist_label_by_name.setdefault(artist_name, row_label)

# Build a new frame rather than writing a column into a filtered slice.
spotify_songs_df = (
    spotify_songs_df
    .assign(Artist_ID=[artist_label_by_name.get(name, -1) for name in spotify_songs_df['Artist']])
    .drop(columns='Artist')
)

spotify_songs_df.to_sql('spotify_tracks_raw', con=connection, if_exists='append')