In [1]:
# 1. import libraries
from datetime import date, timedelta
from random import randint
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tnrange
from tqdm.auto import tqdm

In [2]:
# 2. Entry point: page 1 of last.fm's "electronic" tag track chart
url = (
    "https://www.last.fm/tag/electronic/tracks"
    "?page=1"
)

In [3]:
# 3. download html with a get request
# The timeout (in seconds) makes the call raise instead of hanging forever
# when the server never answers.
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [4]:
# 4.1. build the 'soup' by parsing the raw bytes of the response
soup = BeautifulSoup(response.content, features="html.parser")
# 4.2. display the parsed document to eyeball that the html looks right
soup


<!DOCTYPE html>

<html class="no-js playbar-masthead-release-shim youtube-provider-not-ready" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/><script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={xpid:"UwYPV15QGwYFXFlXDgU=",licenseKey:"0ed0ce50b0",applicationID:"5588594"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var i=e[n]={exports:{}};t[n][0].call(i.exports,function(e){var i=t[n][1][e];return r(i||e)},i,i.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<n.length;i++)r(n[i]);return r}({1:[function(t,e,n){function r(t){try{s.console&&console.log(t)}catch(e){}}var i,o=t("ee"),a=t(25),s={};try{i=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof console.log&&(s.console=!0,i.indexOf("dev")!==-1&&(s.dev=!0),i.indexOf("nr_dev")!==-1&&(s.nrDev=!0))}catch(c){}s.nrDev&&o.on("internal-error",function(t){r(t.stack)}),s.dev

In [54]:
# Build and print the URL of each of the 20 chart pages (pages 1 through 20)
iterations = range(1, 21)

for page_number in iterations:
    url = f"https://www.last.fm/tag/electronic/tracks?page={page_number}&ref_=adv_nxt"
    print(url)


https://www.last.fm/tag/electronic/tracks?page=1&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=2&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=3&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=4&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=5&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=6&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=7&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=8&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=9&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=10&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=11&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=12&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=13&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=14&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=15&ref_=adv_nxt
https://www.last.fm/tag/electronic/tracks?page=16&ref_=adv_nxt
h

In [36]:
# Demo: pause for a random 1-4 seconds on each step so the timing looks
# less mechanical than a fixed delay
from time import sleep
from random import randint

for step in range(20):
    print(step)
    wait_time = randint(1, 4)
    print(f"I will sleep for {wait_time} seconds.")
    sleep(wait_time)


0
I will sleep for 4 seconds.
1
I will sleep for 1 seconds.
2
I will sleep for 2 seconds.
3
I will sleep for 1 seconds.
4
I will sleep for 3 seconds.
5
I will sleep for 4 seconds.
6
I will sleep for 2 seconds.
7
I will sleep for 4 seconds.
8
I will sleep for 2 seconds.
9
I will sleep for 4 seconds.
10
I will sleep for 2 seconds.
11
I will sleep for 1 seconds.
12
I will sleep for 2 seconds.
13
I will sleep for 2 seconds.
14
I will sleep for 4 seconds.
15
I will sleep for 1 seconds.
16
I will sleep for 2 seconds.
17
I will sleep for 1 seconds.
18
I will sleep for 4 seconds.
19
I will sleep for 4 seconds.


In [55]:
# Download every chart page and collect the successful responses in a list.

REQUEST_TIMEOUT = 10  # seconds to wait for the server before giving up on a page

pages = []

for i in iterations:

    # assemble the url:
    start_at = str(i)
    url = "https://www.last.fm/tag/electronic/tracks?page=" + start_at + "&ref_=adv_nxt"

    # download html with a get request; without a timeout a stalled
    # connection would block the loop forever
    response = requests.get(url, timeout=REQUEST_TIMEOUT)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # keep only successful responses (status < 400): an error page presumably
    # has no chart table, so storing it would only break the parsing step later
    if response.ok:
        pages.append(response)
    else:
        print("Skipping page " + start_at + " (request failed).")

    # respectful nap:
    wait_time = randint(1,4)
    print("I will sleep for " + str(wait_time) + " second/s.")
    sleep(wait_time)


Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 2 second/s.


In [56]:
# quick test: turn the first downloaded page into a soup
first_page = pages[0]
soup = BeautifulSoup(first_page.content, features="html.parser")

In [57]:
# Locate the track title on the page: this CSS path selects the link inside
# the "chartlist-name" cell of the first row of the table body under
# #top-tracks-section. The result is a list of matching <a> tags.
soup.select("#top-tracks-section > table > tbody > tr:nth-child(1) > td.chartlist-name > a")

[<a class="" href="/music/MGMT/_/Kids" title="Kids">Kids</a>]

In [107]:
# Same CSS path, but for the third row: take the first match and read its
# text to get just the track name.
soup.select("#top-tracks-section > table > tbody > tr:nth-child(3) > td.chartlist-name > a")[0].text

'Enjoy the Silence'

In [104]:
# sanity check: number of page responses collected by the download loop
len(pages)

20

In [110]:
# Parse every downloaded page and gather all track titles in one list.

pages_parsed = []  # kept from the original cell; nothing is appended to it here
titles = []

# CSS path to the rows of the chart table and, within a row, to the title link
ROW_SELECTOR = "#top-tracks-section > table > tbody > tr"
TITLE_SELECTOR = "td.chartlist-name > a"

for p in tqdm(pages):
    # create the soup for this page
    soup = BeautifulSoup(p.content, "html.parser")
    # Walk over the rows that are actually present instead of assuming exactly
    # 50 per page: a shorter page no longer raises IndexError, and the table
    # is queried once per page rather than once per row.
    for row in soup.select(ROW_SELECTOR):
        link = row.select_one(TITLE_SELECTOR)
        # rows without a title link are skipped
        if link is not None:
            titles.append(link.text)

In [115]:
# sanity check: total number of titles collected across all parsed pages
len(titles)

1000

### The remaining work, such as putting the titles into a DataFrame and applying the user-input function to it, comes later.