# Introduction

In [None]:
# Purpose: brief notebook that helped me quickly web scrape hundreds of hours of audio from NASA (well, technically University of Iowa) 
# Main topics covered: Web scraping (BeautifulSoup) 
# Date: 2/7/2019
# Author: Sami Ahmed

# Imports

In [2]:
# standard library
import os
import pickle
import re
import warnings
from urllib.parse import urljoin

# pandas
import pandas as pd

# webscraping
import requests
from bs4 import BeautifulSoup

# suppress long warnings
warnings.filterwarnings('ignore')

# Web Scraping 

In [3]:
# Check that the burst-events endpoint is reachable (a 200 status code means OK).
# A timeout is set so an unresponsive server cannot hang the kernel indefinitely.
url = 'https://emfisis.physics.uiowa.edu/Events/rbsp-a/burst/'
response = requests.get(url, timeout=30)
response.status_code


200

In [4]:
# Keep the raw HTML text of the response in `page`, then parse it with
# BeautifulSoup (lxml parser) so individual tags can be searched below.
page = response.text
soup = BeautifulSoup(page, "lxml")

### *WARNING* Not necessary for my investigation; however, below are a few good ways to isolate parts of a website

In [24]:
# NOTE(review): `soup_audio` was never defined in this notebook (it only existed
# as leftover kernel state), so this cell failed on a fresh kernel.
# The URL below is inferred from the "Parent Directory" link in the recorded
# output and from the L2A URLs used further down -- confirm it is the page that
# was originally scraped.
audio_index_url = 'https://space.physics.uiowa.edu/rbsp/audio/mp3/L2A/'
# verify=False mirrors how `deeper` requests this host; it skips TLS certificate checks.
soup_audio = BeautifulSoup(
    requests.get(audio_index_url, verify=False, timeout=60).content, "lxml"
)

# Print every anchor tag found on the directory-listing page.
for link in soup_audio.find_all('a'):
    print(link)

<a href="?C=N;O=D">Name</a>
<a href="?C=M;O=A">Last modified</a>
<a href="?C=S;O=A">Size</a>
<a href="?C=D;O=A">Description</a>
<a href="/rbsp/audio/mp3/">Parent Directory</a>
<a href="2012/">2012/</a>
<a href="2013/">2013/</a>
<a href="2014/">2014/</a>
<a href="2015/">2015/</a>
<a href="2016/">2016/</a>
<a href="2017/">2017/</a>
<a href="2018/">2018/</a>
<a href="2019/">2019/</a>


In [25]:
# Keep only the anchors whose markup mentions the burst-times files.
audio_detail = list(
    filter(lambda anchor: 'rbsp-a_burst_times' in str(anchor), soup.find_all('a'))
)

In [26]:
# Collect the href of every anchor on the page, in page order.
# href=True skips anchors that have no href attribute, which would otherwise
# raise a KeyError on anchor['href'].
all_links_audio = [anchor['href'] for anchor in soup_audio.find_all('a', href=True)]

In [27]:
# Drop the unneeded entries at the beginning of the list: the column-sort links
# (hrefs starting with '?') and the parent-directory link (href starting with '/').
# Filtering by content instead of `del all_links_audio[0:5]` keeps this cell
# idempotent: re-running it no longer deletes five more entries each time.
all_links_audio = [href for href in all_links_audio if not href.startswith(('?', '/'))]

In [28]:
# Display the remaining links (the recorded output shows only the year folders).
all_links_audio

['2012/', '2013/', '2014/', '2015/', '2016/', '2017/', '2018/', '2019/']

### *WARNING* Investigation resuming 

# ADDITIONAL WARNING
### If you run the code below, it will take a very long time, and you might not have enough memory to store all the data on your computer. If you run into this issue, refer to my guide on how to run your code on an AWS EC2 instance (virtual machine).

In [5]:
# Example folder URL to experiment with; the path ends in 2017/01/
# (presumably year/month -- confirm against the site layout).
# A folder one level deeper can be targeted instead, e.g.:
#file_url = 'https://space.physics.uiowa.edu/rbsp/audio/mp3/L2A/2017/01/01/'
file_url = 'https://space.physics.uiowa.edu/rbsp/audio/mp3/L2A/2017/01/'


In [6]:
# navigates to the nested folder structure in order to download each individual .mp3 (at scale) 
# I use this function in "crawler" function below
def deeper(file_url, verify=False, timeout=60):
    """Return the sub-folder and .mp3 links found on one directory-listing page.

    Parameters
    ----------
    file_url : str
        URL of the page to download and parse.
    verify : bool or str, optional
        Forwarded to ``requests.get``. The default ``False`` keeps the
        original behaviour of skipping TLS certificate verification.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` (seconds). Without a timeout a stalled
        connection would block the whole crawl indefinitely.

    Returns
    -------
    list of str
        The ``href`` values, in page order, that are either digits followed
        by ``/`` (e.g. ``"01/"``) or end in ``.mp3``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    # NOTE(review): verify=False disables TLS certificate verification; pass
    # verify=True if the host's certificate is valid.
    page_bytes = requests.get(file_url, verify=verify, timeout=timeout).content
    soup_page = BeautifulSoup(page_bytes, "lxml")

    # href=True skips anchors without an href attribute (anchor['href'] would
    # raise KeyError on those).
    hrefs = [anchor['href'] for anchor in soup_page.find_all('a', href=True)]

    # fullmatch + escaped dot: accept only "<digits>/" folders and real ".mp3"
    # names, not anything that merely starts with digits or ends in "mp3".
    wanted = re.compile(r'[0-9]+/|.*\.mp3')
    return [href for href in hrefs if wanted.fullmatch(href)]

In [7]:
# filters through all the files to only retrieve files of extension .mp3 
def crawler(links):
    """Expand folder URLs level by level until only .mp3 URLs remain.

    Parameters
    ----------
    links : list of str
        Seed URLs. Each one is either an .mp3 URL (kept as is) or a folder
        URL that is expanded by calling ``deeper`` on it.

    Returns
    -------
    list of str
        URLs matching ``.*mp3$``. Each folder is replaced, in place, by the
        entries found underneath it, so the order follows the folder tree.
        Folders for which ``deeper`` returns nothing are dropped.
    """
    mp3_pattern = re.compile(r'.*mp3$')

    pending = links
    # Loop (instead of recursing) until every pending link is an mp3.
    while not all(mp3_pattern.match(link) for link in pending):
        expanded = []
        for link in pending:
            if mp3_pattern.match(link):
                expanded.append(link)
                continue
            # Resolve each href against the folder URL. The trailing slash is
            # required so urljoin appends to the folder instead of replacing
            # its last path segment; urljoin also handles absolute hrefs,
            # which plain string concatenation would mangle.
            base = link if link.endswith('/') else link + '/'
            expanded.extend(urljoin(base, step) for step in deeper(link))
        pending = expanded
    return pending

In [8]:
# Crawl the 2013 folder and collect the links crawler() returns for it.
# This is the slow step the warning above refers to.
mp3s_2013 = crawler(['https://space.physics.uiowa.edu/rbsp/audio/mp3/L2A/2013/'])

In [None]:
# Save the list of links in mp3s_2013 to a pickle file for use in a separate notebook.
# Requires `pickle` to be imported in the imports cell.
# NOTE(review): the file name says "all_mp3s_L2A" but only the 2013 crawl result is
# written here -- confirm the name is intended.
with open('all_mp3s_L2A.pkl', 'wb') as f:
    pickle.dump(mp3s_2013, f)