# Web Scraping 

https://www.npr.org/series/423302056/hidden-brain/archive

Using https://regex101.com/

## Libraries

In [1]:
import requests
import re
# import numpy as np
# import pandas as pd

## Helper function

In [2]:
def getMatchesByRegex(regex, info):
    resultList = []
    matches = re.finditer(regex, info, re.MULTILINE)
    print(matches)
    for matchNum, match in enumerate(matches, start=1):
        resultList.append(str(match.group()))
        
    return resultList

## Request

In [3]:
baseUrl = 'https://www.npr.org/series/423302056/hidden-brain/archive'

In [4]:
payload = {}
headers = { "Content-Type": "text/html" }

response = requests.request("GET", baseUrl, headers=headers, data = payload)

## Movie names

In [5]:
regex = r"(?<=\<h2\ class\=\"title\"\>\<a\ href\=)(.*)(?=\<\/a\>\<\/h2\>)"
rawMovieList = getMatchesByRegex(regex, str(response.text))

print('There are {0} movies in this list'.format(len(rawMovieList)))

<callable_iterator object at 0x7f513b370ad0>
There are 15 movies in this list


In [6]:
rawMovieList[6]

'"https://www.npr.org/2020/08/21/904715184/you-2-0-fresh-starts"  data-metrics=\'{"action":"Click Story Title","category":"Story List"}\' >You 2.0: Fresh Starts'

In [7]:
# str('You 2.0: Empathy Gym').replace('You 2.0: ', '').strip()

In [8]:
# regex = r"(?<=\>)(.*)(?=$)"

# movieNamesList = []
# for rawMovieName in rawMovieList:
#     movie = getMatchesByRegex(regex, str(rawMovieName).replace('You 2.0: ', '').strip())
#     movieNamesList.append(movie[0])

### List comprehensions and Lambda functions

https://www.programiz.com/python-programming/list-comprehension

In [9]:
regex = r"(?<=\>)(.*)(?=$)"

movieNamesList = list(map(lambda x: getMatchesByRegex(
                            regex, 
                            str(x).replace('You 2.0: ', '').strip()
                        )[0],
                    rawMovieList))

<callable_iterator object at 0x7f5138371450>
<callable_iterator object at 0x7f51380e2450>
<callable_iterator object at 0x7f5138371450>
<callable_iterator object at 0x7f51380e2450>
<callable_iterator object at 0x7f5138371450>
<callable_iterator object at 0x7f51380e2450>
<callable_iterator object at 0x7f5138371450>
<callable_iterator object at 0x7f51380e2450>
<callable_iterator object at 0x7f5138371450>
<callable_iterator object at 0x7f51380e2450>
<callable_iterator object at 0x7f5138371450>
<callable_iterator object at 0x7f51380e2450>
<callable_iterator object at 0x7f5138371450>
<callable_iterator object at 0x7f51380e2450>
<callable_iterator object at 0x7f5138371450>


In [10]:
movieNamesList

['Laughter: The Best Medicine',
 'The Halo Effect',
 'Why Nobody Feels Rich',
 'The Fee-for-Service Monster',
 'Empathy Gym',
 'WOOP, WOOP!',
 'Fresh Starts',
 'Loss And Renewal',
 "The Mind's Eye",
 'Our Pursuit of Happiness',
 'Edge Effect',
 'The Untold Story Of Lyndie B. Hawkins',
 'Romeo &amp; Juliet In Rwanda',
 'The Night That Lasted A Lifetime',
 'The Founding Contradiction']

## Movie release date

In [11]:
regex = r"(?<=\<time\ datetime\=)(.*)(?=\>\<span\ class\=)"
movieReleaseDateList = getMatchesByRegex(regex, str(response.text).replace('"', ''))

print('There are {0} movies in this list'.format(len(movieReleaseDateList)))

<callable_iterator object at 0x7f51380e2450>
There are 15 movies in this list


In [12]:
movieReleaseDateList

['2020-09-28',
 '2020-09-21',
 '2020-09-14',
 '2020-09-07',
 '2020-08-31',
 '2020-08-24',
 '2020-08-21',
 '2020-08-17',
 '2020-08-10',
 '2020-08-03',
 '2020-07-27',
 '2020-07-20',
 '2020-07-13',
 '2020-07-06',
 '2020-06-29']

## Putting it all together

In [13]:
movieList = list(zip(movieNamesList, movieReleaseDateList))

In [14]:
movieList

[('Laughter: The Best Medicine', '2020-09-28'),
 ('The Halo Effect', '2020-09-21'),
 ('Why Nobody Feels Rich', '2020-09-14'),
 ('The Fee-for-Service Monster', '2020-09-07'),
 ('Empathy Gym', '2020-08-31'),
 ('WOOP, WOOP!', '2020-08-24'),
 ('Fresh Starts', '2020-08-21'),
 ('Loss And Renewal', '2020-08-17'),
 ("The Mind's Eye", '2020-08-10'),
 ('Our Pursuit of Happiness', '2020-08-03'),
 ('Edge Effect', '2020-07-27'),
 ('The Untold Story Of Lyndie B. Hawkins', '2020-07-20'),
 ('Romeo &amp; Juliet In Rwanda', '2020-07-13'),
 ('The Night That Lasted A Lifetime', '2020-07-06'),
 ('The Founding Contradiction', '2020-06-29')]