In [11]:
import pandas as pd

# Load the raw playlist CSV. Forward slashes keep the relative path portable
# across operating systems and avoid the invalid "\s" escape sequence that the
# backslash-separated literal contained.
# on_bad_lines='skip' tells pandas to skip malformed lines instead of raising.
df_playlist_data = pd.read_csv('Spotify-User-Only-Playlist-Data/spotify_dataset.csv', on_bad_lines='skip')

# Preview the first rows of the loaded frame.
df_playlist_data.head()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Report how many rows the frame currently holds.

print(df_playlist_data.shape[0])

12891680


In [None]:
# Normalise the column headers in a single pass so they can be used as plain
# keys (the quoted headers otherwise lead to "KeyError: 'artistname'"):
#   1. remove the double quotes,
#   2. turn spaces into underscores and lower-case everything,
#   3. remove every underscore,
#   4. remove the "name" fragment from each header.

df_playlist_data.columns = (
    df_playlist_data.columns
    .str.replace('"', '')
    .str.replace(' ', '_')
    .str.lower()
    .str.replace('_', '')
    .str.replace('name', '')
)

In [None]:
# Show the current column labels of the frame.
print(df_playlist_data.columns)

Index(['userid', 'artist', 'track', 'playlist'], dtype='object')


In [None]:
# Keep only the first row encountered for each distinct artist
# (drop_duplicates keeps the first occurrence by default).
df_playlist_data = df_playlist_data.drop_duplicates(subset=['artist'])

# Report how many rows remain.
print(df_playlist_data.shape[0])

289821


In [None]:
# Display the frame as it stands after de-duplicating on 'artist'.
df_playlist_data

Unnamed: 0,userid,artist,track,playlist
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
5,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010
6,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Band On The Run,HARD ROCK 2010
...,...,...,...,...
12890906,488ead471813ae982119153f792e4a3e,So So Desi & PathiK,Aaja Ve,Starred
12890907,488ead471813ae982119153f792e4a3e,Sanjoy Deb & Sunidhi Chauhan,Ab Laut Aa,Starred
12890986,488ead471813ae982119153f792e4a3e,Pascal & Pearce feat. Juliet Harding,Disco Sun,Starred
12891098,488ead471813ae982119153f792e4a3e,Yinon Yahel Feat. Emmi,Jump - BrainDead Dubstep Remix,Starred


In [None]:
# Keep only the "artist" column.
# Selecting the column (instead of dropping 'userid', 'track' and 'playlist'
# by name) keeps this cell re-runnable: because the result is assigned back to
# the same variable, a second run of drop() would raise KeyError for columns
# that are already gone.

df_playlist_data = df_playlist_data[['artist']]

df_playlist_data

Unnamed: 0,artist
0,Elvis Costello
1,Elvis Costello & The Attractions
2,Tiffany Page
5,Lissie
6,Paul McCartney
...,...
12890906,So So Desi & PathiK
12890907,Sanjoy Deb & Sunidhi Chauhan
12890986,Pascal & Pearce feat. Juliet Harding
12891098,Yinon Yahel Feat. Emmi


In [None]:
# Trial subset: just the first 10 rows, used to check that things work.
df_playlist_data_trial = df_playlist_data.iloc[:10]

df_playlist_data_trial

Unnamed: 0,artist
0,Elvis Costello
1,Elvis Costello & The Attractions
2,Tiffany Page
5,Lissie
6,Paul McCartney
7,Joe Echo
10,The Breakers
12,The Coronas
13,Crowded House
17,Joshua Radin


## Running the request for real (full artist list)

In [13]:
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from requests.exceptions import SSLError, RequestException, HTTPError

class SpotifyAPI:
    """Minimal Spotify Web API client used to look up artist ids and genres.

    Authenticates with the client-credentials grant when constructed and keeps
    the bearer token on the instance.

    Args:
        client_id: Spotify application client id.
        client_secret: Spotify application client secret.
        timeout: Seconds to wait on each HTTP call before giving up, so a
            stalled connection cannot block a worker forever.
        max_retries: How many times a single search may be retried after a
            401 (token refreshed first) or a 429 (waits as instructed first).

    Raises:
        Exception: If the token endpoint answers with a non-200 status.
        requests.exceptions.RequestException: If the token request itself fails.
    """

    AUTH_URL = 'https://accounts.spotify.com/api/token'
    SEARCH_URL = 'https://api.spotify.com/v1/search'

    def __init__(self, client_id, client_secret, timeout=10, max_retries=3):
        self.client_id = client_id
        self.client_secret = client_secret
        self.timeout = timeout
        self.max_retries = max_retries
        self.access_token = self.get_spotify_token()

    def get_spotify_token(self):
        """Request a fresh access token and return it as a string.

        Raises:
            Exception: If the token endpoint does not answer with HTTP 200.
        """
        print("Retrieving new token...")
        response = requests.post(
            self.AUTH_URL,
            data={
                'grant_type': 'client_credentials',
                'client_id': self.client_id,
                'client_secret': self.client_secret,
            },
            timeout=self.timeout,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to get token: {response.text}")
        return response.json()['access_token']

    @staticmethod
    def _retry_after_seconds(response, default=1.0):
        """Return the wait (in seconds) requested by a Retry-After header."""
        try:
            return max(float(response.headers.get('Retry-After', default)), 0.0)
        except (TypeError, ValueError):
            # Header missing a usable number: fall back to a short pause.
            return default

    def search_artist(self, artist_name):
        """Search for one artist and return ``(artist_name, artist_id, genres)``.

        Returns ``(artist_name, None, None)`` when the name is blank or not a
        string (no request is sent) or when the search has no match, and
        ``(artist_name, "Error", "Error")`` when the request fails.
        """
        # Missing values (e.g. NaN from a DataFrame column) and blank strings
        # cannot be searched meaningfully, so skip the network call entirely.
        if not isinstance(artist_name, str) or not artist_name.strip():
            return artist_name, None, None

        print(f"Starting search for artist: {artist_name}")
        params = {'q': artist_name, 'type': 'artist', 'limit': 1}

        try:
            retries_left = self.max_retries
            while True:
                # Headers are rebuilt on every attempt so a refreshed token is used.
                headers = {'Authorization': f'Bearer {self.access_token}', 'Content-Type': 'application/json'}
                response = requests.get(self.SEARCH_URL, headers=headers, params=params, timeout=self.timeout)

                if retries_left > 0 and response.status_code == 401:
                    # Token rejected (typically expired): fetch a new one and retry.
                    # Several workers may refresh at once; that only costs an
                    # extra token request.
                    retries_left -= 1
                    self.access_token = self.get_spotify_token()
                    continue
                if retries_left > 0 and response.status_code == 429:
                    # Rate limited: wait for the period the server asked for.
                    retries_left -= 1
                    time.sleep(self._retry_after_seconds(response))
                    continue
                break

            response.raise_for_status()
            data = response.json()

            items = data['artists']['items']
            if not items:
                return artist_name, None, None

            artist_id = items[0]['id']
            artist_genre = items[0]['genres']
            return artist_name, artist_id, artist_genre
        except HTTPError as e:
            print(f"HTTP error for {artist_name}: {e}")
            return artist_name, "Error", "Error"
        except (SSLError, RequestException) as e:
            print(f"Request failed for {artist_name}: {e}")
            return artist_name, "Error", "Error"

    def fetch_artists_data(self, artists_names, max_workers=5):
        """Look up every name concurrently and return a list of result tuples.

        Results are appended in completion order, not input order. Every input
        name yields exactly one tuple: if a lookup raises unexpectedly, the
        artist is recorded as ``(name, "Error", "Error")`` rather than being
        dropped from the results.

        Args:
            artists_names: Iterable of artist names to look up.
            max_workers: Size of the thread pool.
        """
        print("Processing artists data...")
        results = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(self.search_artist, name): name for name in artists_names}
            for future in as_completed(futures):
                artist_name = futures[future]
                try:
                    results.append(future.result())
                except Exception as e:
                    print(f"Failed to process {artist_name}: {e}")
                    results.append((artist_name, "Error", "Error"))
        return results

# Main execution block
# Credentials are read from the environment so that no secret is stored in the
# notebook itself.
# NOTE(review): the client id/secret that used to be hard-coded here should be
# treated as compromised and rotated in the Spotify developer dashboard.
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')
if not client_id or not client_secret:
    raise RuntimeError(
        "Set the SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET environment variables before running this cell."
    )

spotify_api = SpotifyAPI(client_id, client_secret)
spotify_data = df_playlist_data

if 'artist' in spotify_data.columns:
    artist_names = spotify_data['artist'].unique()  # Process unique artist names only
    artist_data = spotify_api.fetch_artists_data(artist_names)
    artist_df = pd.DataFrame(artist_data, columns=['artist', 'Artist ID', 'Genre'])
    # Left merge keeps every row of the playlist frame, matched or not.
    spotify_data = spotify_data.merge(artist_df, on='artist', how='left')
    # Genre lists become a comma-separated string; anything that is not a list
    # (no match, failed lookup, missing value) is labelled "Not Found".
    spotify_data['Genre'] = spotify_data['Genre'].apply(lambda x: ', '.join(x) if isinstance(x, list) else "Not Found")
else:
    print("Expected 'artist' column not found in the dataset.")

spotify_data


Retrieving new token...


SSLError: HTTPSConnectionPool(host='accounts.spotify.com', port=443): Max retries exceeded with url: /api/token (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))