# Second attempt at downloading all scrobbled songs from LastFM and formatting them

Aim of attempt 1 is to get a formatted pandas dataframe of information to be refactored later
Initial Requirements are:
Song name
Artist name
Date and time of song playing

Got working on single page and tested on many - now refactoring across many pages.

Later requirements will include login, error check and be parameterised so it works for other users.

## To-do list

- (done) Get cleaner artist and timestamp scrape 
- (done) Loop over all last fm songs
- (done) Remove song titles after "-" as it is normally something like "remastered" 
- (done) Export data
- (done) Create field that is current timestamp (needed to refactor time later
- (done) Refactor
- Do I need to login for this to work?
- Save to G drive at the end of the day!

### To-do list moved to another notebook, part of data preparation
- Get better time stamp format and transform "hours since" timings (see last block of commented code)
- Get time between songs and flag skipped songs
- Make some generic time features (time of day, weekend etc.)
- Bring in metadata for songs

## For future iterations

- Login to lastfm (if required)
- Make part of a managed folder (must learn how first...)
- Append new data don't keep recreating whole data set

## Install Packages

In [1]:
import re           # regular expressions
import requests     # request web pages
import bs4          # 'beautiful soup 4' - find elements within HTML
import lxml         # HTML parser
import pandas as pd # allows making dataframes of results
import numpy as np
from datetime import datetime, date

# Parameters

In [2]:
## user input parameters
lastfm_username_list = ['COKUNUBI','rosiedempsey93']

## last fm determined parameters

# start and end of URL
lastfm_starturl = 'https://www.last.fm/user/'
lastfm_endurl = '/library?date_preset=ALL&page='

# headers for dataframe
column_names = ['song', 'artist', 'play_time']

# where data exports to
export_path_version = "/Users/rosiedempsey/Desktop/MusicProject/finely_tuned/DataExports/RawScrobbles_"+str(date.today())+".csv"
export_path_master = "/Users/rosiedempsey/Desktop/MusicProject/finely_tuned/DataExports/RawScrobbles_master.csv"


# Functions

In [3]:
# Find links for all pages containing songs
# First find the number of pages
# function find maximum page
def find_max_page(lastfm_page, lastfm_username):
    """
    Open any lastfm music page and  req then soup to get max page
    """
    lastfm_example_page = lastfm_starturl+lastfm_username+lastfm_endurl+'2'
    example_req = requests.get(lastfm_example_page)
    example_soup = bs4.BeautifulSoup(example_req.text, "lxml")
    pages_list = []
    for item in example_soup.find_all("li",{"class":"pagination-page"}):
        try:
            pages_list.append(int(item.text.strip('\n')))
        except ValueError:
            pass
    print("max page: "+str(max(pages_list)))
    return max(pages_list)

# Make list of all pages
# Function for making list
def all_pages_list(max_page, lastfm_username):
    lastfm_base = lastfm_starturl + lastfm_username + lastfm_endurl
    page_list = []
    for i in range(1,max_page+1):
        page_list.append(lastfm_base+str(i))
    return page_list

# Make list of requests and soups for all pages
# make function for a list of webapges
def cooking_many_soup(page_list):
    """
    First request all the pages and store requests in list
    The iterate of list of requests to make list of soups
    Output is a list
    """
    many_requests = []
    for page in page_list:
        many_requests.append(requests.get(page))
    
    many_soup = []
    for req in many_requests:
        many_soup.append(bs4.BeautifulSoup(req.text, "lxml"))
    
    return many_soup


# return a list for each bit of information we want (artist, song, time)

# songs - remove new line characters and anything after a "-" as it is normally something like "remasterd 1999"
# function
def get_songs_list(many_soup):
    song_list= []
    for soup in many_soup:
        for item in soup.find_all("td",{"class":"chartlist-name"}):
            song_list.append(item.text.replace("\n", "").partition("-")[0])
    return song_list

# artists
# function
def get_artist_list(many_soup):
    artist_list= []
    for soup in many_soup:
        for item in soup.find_all("td",{"class":"chartlist-artist"}):
            artist_list.append(item.text.replace("\n", ""))
    return artist_list

# timestamps
# function
def get_timestamp_list(many_soup):
    timestamp_list= []
    for soup in many_soup:
        for item in soup.find_all("td",{"class":"chartlist-timestamp"}):
            timestamp_list.append(item.text.replace("\n", "").replace(" ", "").replace("\xa0", ""))
    return timestamp_list

In [4]:
# Pipeline functions
def scrape_listening_history(lastfm_username):
    """
    Make soup for one page with chosen username and check
    Find max_page for that user
    Make list of all urls
    Apply soupy function
    Get required data: songs, artists, timestamp and save
    """
    lastfm_example_page = lastfm_starturl+lastfm_username+lastfm_endurl+'2'
    example_req = requests.get(lastfm_example_page)
    print("Error status: "+str(example_req.raise_for_status()))    # returns error if page not found
    max_page = find_max_page(lastfm_example_page,lastfm_username)
    all_pages = all_pages_list(max_page,lastfm_username)
    my_soups = cooking_many_soup(all_pages)
    
    my_songs = get_songs_list(my_soups)
    my_artists = get_artist_list(my_soups)
    my_times = get_timestamp_list(my_soups)
    my_list = [my_songs,my_artists,my_times]
    return my_list

def make_history_dataframe(scraped_data, data_headers, lastfm_username):
    """
    Make pandas dataframe scraped data
    Add a download time field
    """
    my_scrobbles = pd.DataFrame(np.column_stack(scraped_data), 
                                   columns=data_headers)
    my_scrobbles['download_time'] = datetime.now()
    my_scrobbles['username'] = lastfm_username
    return my_scrobbles

def export_history(scrobbles_df, export_path_version, export_path_master):
    """
    Export a versioned and master of the data
    Contains user name
    """
    scrobbles_df.to_csv(export_path_version, index=False)
    scrobbles_df.to_csv(export_path_master, index=False)


def export_multiple_histories_pipeline(lastfm_username_list,column_names,export_path_version,export_path_master):
    """
    Takes any usernames listed
    Scrapes their data into a pandas dataframe and exports
    """
    scrobbles_dfs = []
    for last_fm_username in lastfm_username_list:
        scraped_data = scrape_listening_history(last_fm_username)
        scraped_df = make_history_dataframe(scraped_data,column_names,last_fm_username)
        scrobbles_dfs.append(scraped_df)
        # see pd.concat documentation for more info
    all_scrobbles = pd.concat(scrobbles_dfs)
    export_history(all_scrobbles, export_path_version, export_path_master)

# Run code

In [None]:
export_multiple_histories_pipeline(lastfm_username_list,column_names,export_path_version,export_path_master)

Error status: None
max page: 419


In [None]:
read_test = pd.read_csv(export_path_version)
read_test.head()

In [None]:
read_test.tail()