###### Extract paginated webpages

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
root = 'https://subslikescript.com'
links = []

In [3]:
## Get the first page of the movie
website = f'{root}/movies_letter-A'
result = requests.get(website)
content = result.text
soup = BeautifulSoup(content,'html.parser')

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-120598793-1">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'UA-120598793-1');
  </script>
  <meta charset="utf-8"/>
  <title>
   Movies Transcripts Starting with "A"
	    | Subs like Script
  </title>
  <meta "="" a"="" content="List of movie scripts starting with the letter " name="description"/>
  <meta content="transcript, the first letter, movie, subtitles, scripts, film, video, media, subs, srt " name="keywords"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="index, follow" name="robots"/>
  <link href="/favicon.ico" rel="shortcut icon"/>
  <meta content="xgFl6lQzeh9VBpoN6IIGFfnO9dtwINF9vM29NK9C" name="csrf-token"/>
  <link as="style" href="https://sub

###### Get Pagination

In [5]:
pagination = soup.find('ul',{'class':'pagination'})
pagination

<ul class="pagination">
<li aria-disabled="true" aria-label="« Previous" class="page-item disabled">
<span aria-hidden="true" class="page-link">‹</span>
</li>
<li aria-current="page" class="page-item active"><span class="page-link">1</span></li>
<li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=2">2</a></li>
<li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=3">3</a></li>
<li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=4">4</a></li>
<li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=5">5</a></li>
<li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=6">6</a></li>
<li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=7">7</a></li>
<li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_lette

In [6]:
pages = pagination.find_all('li',{'class':'page-item'}) if pagination else []
pages

[<li aria-disabled="true" aria-label="« Previous" class="page-item disabled">
 <span aria-hidden="true" class="page-link">‹</span>
 </li>,
 <li aria-current="page" class="page-item active"><span class="page-link">1</span></li>,
 <li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=2">2</a></li>,
 <li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=3">3</a></li>,
 <li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=4">4</a></li>,
 <li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=5">5</a></li>,
 <li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=6">6</a></li>,
 <li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?page=7">7</a></li>,
 <li class="page-item"><a class="page-link" href="https://subslikescript.com/movies_letter-A?p

In [7]:
last_page = int(pages[-2].text) if pages else 1
last_page

148

###### Loop through first page

In [8]:
for page in range(1,2):
    website = f'{root}/movies_letter-A?page={page}'
    result = requests.get(website)
    content = result.text
    soup = BeautifulSoup(content,'html.parser')
    box = soup.find('article',{'class':'main-article'})
    ## Collect all links
    for link in box.find_all('a',href = True):
        links.append(link['href'])       

In [9]:
links

['/movie/A_1000000000000000_Ransom-4926482',
 '/movie/A_2nd_Chance-2150139',
 '/movie/A_Aa-5684466',
 '/movie/A_Baby_at_any_Cost-15331880',
 '/movie/A_Babysitters_Guide_to_Monster_Hunting-4844150',
 '/movie/A_Bad_Son-81678',
 '/movie/A_Bag_of_Marbles-153414',
 '/movie/A_Balloon_for_Allah-1880111',
 '/movie/A_Banana_At_This_Time_of_Night-9010228',
 '/movie/A_Banquet-11400902',
 '/movie/A_Barefoot_Dream-1583213',
 '/movie/A_Bear_Named_Winnie-437088',
 '/movie/A_Beautiful_Curse-12593190',
 '/movie/A_Beautiful_Day_in_the_Neighborhood-3224458',
 '/movie/A_Beautiful_Life-15282148',
 '/movie/A_Beautiful_Mind-268978',
 '/movie/A_Beautiful_Now-2611160',
 '/movie/A_Beautiful_Place_to_Die_A_Marthas_Vineyard_Mystery-10768536',
 '/movie/A_Beautiful_Planet-2800050',
 '/movie/A_Beautiful_Secret-316562',
 '/movie/A_Bee_in_August-970946',
 '/movie/A_Belfast_Story-2326204',
 '/movie/A_Belle_for_Christmas-3256812',
 '/movie/A_Beloved_Wife-11288650',
 '/movie/A_Bennett_Song_Holiday-8552834',
 '/movie/A_Be

###### Sanitize

In [10]:
def sanitize(title):
    return re.sub(r'[<>:"/\\|?*]','',title)

In [11]:
links

['/movie/A_1000000000000000_Ransom-4926482',
 '/movie/A_2nd_Chance-2150139',
 '/movie/A_Aa-5684466',
 '/movie/A_Baby_at_any_Cost-15331880',
 '/movie/A_Babysitters_Guide_to_Monster_Hunting-4844150',
 '/movie/A_Bad_Son-81678',
 '/movie/A_Bag_of_Marbles-153414',
 '/movie/A_Balloon_for_Allah-1880111',
 '/movie/A_Banana_At_This_Time_of_Night-9010228',
 '/movie/A_Banquet-11400902',
 '/movie/A_Barefoot_Dream-1583213',
 '/movie/A_Bear_Named_Winnie-437088',
 '/movie/A_Beautiful_Curse-12593190',
 '/movie/A_Beautiful_Day_in_the_Neighborhood-3224458',
 '/movie/A_Beautiful_Life-15282148',
 '/movie/A_Beautiful_Mind-268978',
 '/movie/A_Beautiful_Now-2611160',
 '/movie/A_Beautiful_Place_to_Die_A_Marthas_Vineyard_Mystery-10768536',
 '/movie/A_Beautiful_Planet-2800050',
 '/movie/A_Beautiful_Secret-316562',
 '/movie/A_Bee_in_August-970946',
 '/movie/A_Belfast_Story-2326204',
 '/movie/A_Belle_for_Christmas-3256812',
 '/movie/A_Beloved_Wife-11288650',
 '/movie/A_Bennett_Song_Holiday-8552834',
 '/movie/A_Be

###### Scratch each page

In [13]:
for link in links:
    try:
        root = 'https://subslikescript.com'
        website = f'{root}/{link}'
        result = requests.get(website)
        content = result.text
        soup = BeautifulSoup(content,'html.parser')
        box = soup.find('article',{'class':'main-article'})
        title = box.find('h1').get_text()
        transcript = box.find('div',{'class':'full-script'}).get_text(strip = True , seprator = '')
        filename = sanitize(title) + '.txt'
        with open(filename , 'w' , encoding='utf-8') as file:
            file.write(transcript)
        print(f'Saved - {filename}')
    except Exception as e:
        print('link not working',link)
        print('error',e)

link not working /movie/A_1000000000000000_Ransom-4926482
error 'NoneType' object has no attribute 'find'
link not working /movie/A_2nd_Chance-2150139
error 'NoneType' object has no attribute 'get_text'
link not working /movie/A_Aa-5684466
error 'NoneType' object has no attribute 'find'
link not working /movie/A_Baby_at_any_Cost-15331880
error 'NoneType' object has no attribute 'find'
link not working /movie/A_Babysitters_Guide_to_Monster_Hunting-4844150
error 'NoneType' object has no attribute 'find'
link not working /movie/A_Bad_Son-81678
error 'NoneType' object has no attribute 'find'
link not working /movie/A_Bag_of_Marbles-153414
error 'NoneType' object has no attribute 'get_text'
link not working /movie/A_Balloon_for_Allah-1880111
error 'NoneType' object has no attribute 'get_text'
link not working /movie/A_Banana_At_This_Time_of_Night-9010228
error 'NoneType' object has no attribute 'get_text'
link not working /movie/A_Banquet-11400902
error 'NoneType' object has no attribute 'g