#### Installing Packages

In [1]:
!pip3 install spotipy --upgrade
!pip3 install pillow
!pip3 install tensorflow

#### Importing Packages

In [2]:
import matplotlib.pyplot as plt
from matplotlib import image as mpimg
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
import pandas as pd
import requests
import collections
import pickle as pkl
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split

## Using credentials to pull album art and popularity scores (the target variable) from Spotify's API via Spotipy

In [3]:
# Each user supplies their own API keys in a header-less CSV file:
# column 1 of row 0 is read as the client ID and column 1 of row 1 as the client secret.
creds = pd.read_csv('data/spotipy_credentials.csv', header=None)

cid, secret = creds.loc[0, 1], creds.loc[1, 1]

In [4]:
# Create the Spotify API client from the client ID / secret loaded above
client_credentials_manager = SpotifyClientCredentials(
    client_id=cid,
    client_secret=secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [5]:
# Playlists featuring independent artists whose album art will be collected.
# Every link shares the same prefix, so only the playlist IDs are spelled out.
fresh_finds_playlists = [
    'https://open.spotify.com/playlist/' + playlist_id
    for playlist_id in (
        '37i9dQZF1DWT0upuUFtT7o',
        '37i9dQZF1DWUFAJPVM3HTX',
        '37i9dQZF1DX3u9TSHqpdJC',
        '37i9dQZF1DWW4igXXl2Qkp',
        '37i9dQZF1DX78toxP7mOaJ',
        '37i9dQZF1DXdS3lvGe1GrT',
        '37i9dQZF1DWYUfsq4hxHWP',
        '37i9dQZF1DX6bBjHfdRnza',
        '37i9dQZF1DX8C585qnMYHP',
        '37i9dQZF1DXcWL5K0oNHcG',
        '37i9dQZF1DWWjGdmeTyeJ6',
        '37i9dQZF1DX7AqyNZFu97s',
        '37i9dQZF1DXagUeYbNSnOA',
        '37i9dQZF1DX5C8ObEZ48JQ',
        '37i9dQZF1DX4Xz5lDbaehp',
        '37i9dQZF1DX2ddCYH6QIK5',
        '37i9dQZF1DX0KBgD4Jf5tY',
        '37i9dQZF1DX5R53BjnKBjk',
        '37i9dQZF1DX7vZYLzFGQXc',
        '37i9dQZF1DXbDSHGzTpRHX',
        '37i9dQZF1DWVhn3qoy98w6',
        '37i9dQZF1DX34s4fg4Zx3Z',
        '37i9dQZF1DX8pdK1PVpBQz',
        '5CweKpXcP6I3p95u8zgIyb',  # EDM
    )
]

In [6]:
# Reduce every playlist link to its bare ID: take the last path segment and
# drop anything from the first "?" onwards.
playlist_uris = [
    url.rsplit("/", 1)[-1].split("?", 1)[0]
    for url in fresh_finds_playlists
]

In [7]:
# Map each playlist ID to a short genre label so that a song appearing in
# several playlists can be told apart by genre later on.
playlist_uri_to_name_dict = {}

for playlist_uri in playlist_uris:
    # `sp.playlist` is the replacement for the deprecated
    # `sp.user_playlist(user=None, ...)`; only the "name" field is requested.
    playlist_name = sp.playlist(playlist_uri, fields="name")['name']

    # A playlist titled exactly "Fresh Finds" is labelled 'All Genres';
    # any other playlist is labelled with the last space-separated word of its title.
    if playlist_name == "Fresh Finds":
        playlist_name = 'All Genres'
    else:
        playlist_name = playlist_name.split(' ')[-1]

    playlist_uri_to_name_dict[playlist_uri] = playlist_name

In [8]:
# Column-oriented store: one list per field, one entry per collected playlist track
data = {
    'track_name': [],
    'popularity': [],
    'album_url': [],
    'playlist_uri': [],
}

for playlist in playlist_uris:
    # `playlist_tracks` returns one page of results at a time, so keep following
    # the paging link with `sp.next` until it returns None; otherwise tracks
    # beyond the first page would be silently left out.
    page = sp.playlist_tracks(playlist)
    while page:
        for item in page["items"]:
            track = item.get("track")
            # Skip playlist entries that carry no track data
            if not track:
                continue

            # Skip tracks without any cover image; there would be nothing to download
            images = (track.get('album') or {}).get('images')
            if not images:
                continue

            # track name
            data['track_name'].append(track["name"])
            # popularity of the track (the target variable)
            data['popularity'].append(track["popularity"])
            # URL of the first listed album image
            data['album_url'].append(images[0]['url'])
            # playlist ID, used later to attach the genre label
            data['playlist_uri'].append(playlist)

        page = sp.next(page)

In [9]:
# Turn the dictionary of equal-length lists into a table, one column per key
df = pd.DataFrame.from_dict(data)

In [10]:
# Derive the genre label column by substituting each playlist ID with its mapped name
df['playlist_name'] = df['playlist_uri'].replace(to_replace=playlist_uri_to_name_dict)

In [11]:
# Remove fully identical rows and renumber the index from zero
df = df.drop_duplicates(ignore_index=True)

In [12]:
# Clean track names so they import and export cleanly (they later become part of a file name).
# `regex=` is stated on every call because the default of `Series.str.replace`
# changed to literal matching in pandas 2.0: without `regex=True` the
# character-class pattern would be searched for as plain text and remove nothing.
df['track_name_cleaned'] = (
    df['track_name']
    # keep only digits, letters (case-insensitive) and the listed punctuation
    .str.replace("(?i)[^0-9a-z!?.;,@' -]", '', regex=True)
    # '.' and '*' are meant as literal characters, not regex metacharacters
    .str.replace('.', '', regex=False)
    .str.replace('*', '', regex=False)
    .str.strip()
)

# Drop tracks whose name is empty after cleaning; `.copy()` yields an independent
# frame so that later column assignments do not act on a slice of the old one.
df = df.loc[df['track_name_cleaned'] != ''].copy()

In [13]:
# Build the "<cleaned track name>_<playlist name>" key that ties each saved
# album cover back to its target value ('popularity')
df['key'] = df['track_name_cleaned'].str.cat(df['playlist_name'], sep="_")

In [14]:
# Order rows by key and renumber the index, so album art can be matched to popularity scores
df = df.sort_values(by='key', ignore_index=True)

In [15]:
# Download every cover image into data/album_art as "<key>.jpg".
# Keys whose download or file write fails are collected in `error_list`.
os.makedirs('data/album_art', exist_ok=True)  # a missing folder would make every write fail

error_list = []
for key, url in zip(df['key'], df['album_url']):
    try:
        # The timeout keeps one stalled connection from hanging the whole loop
        response = requests.get(url, timeout=30)
        # Treat HTTP error statuses as failures instead of saving the error body as an image
        response.raise_for_status()
        with open(f'data/album_art/{key}.jpg', 'wb') as handler:
            handler.write(response.content)
    except (requests.RequestException, OSError):
        # Network/HTTP problems and file-system errors are recorded, not raised
        error_list.append(key)

In [16]:
# Persist the table, including its row index, for use alongside the saved images
df.to_csv('data/popularity_index.csv', index=True)

# DATA PULL PT. 2 - CHANGING PIXEL SIZE

Our first model required too much computational power and could not get past one epoch. Reducing each image from more than 600 pixels per side to 60×60 pixels should allow future models to train without complication.

In [18]:
# Resize every file in data/album_art to 60x60 pixels and save it under the same
# name in data/album_art_resized, to reduce computational cost and training time.
os.makedirs('data/album_art_resized', exist_ok=True)  # saving fails if the folder is missing

for name in os.listdir('data/album_art'):
    try:
        # The context manager closes the source file once the resized copy is written
        with Image.open('data/album_art/' + name) as img:
            img.resize((60, 60)).save('data/album_art_resized/' + name)
    except (OSError, ValueError):
        # Unreadable or non-image files are reported and skipped
        print(name+' did not import')