In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pdb
import time
from random import randint


In [2]:
years = range(1971, 2021)
months = ['04', '10']
home_page = 'https://www.churchofjesuschrist.org'

var_names = ['year', 'month', 'speaker', 'title', 'text']

# Maximum number of attempts at loading a single talk page.
MAX_ITER = 3
# Seconds to wait on the server before a request is abandoned; without a
# timeout `requests.get` may block forever on a stalled connection.
REQUEST_TIMEOUT = 60
# Matches footnote identifiers such as "note12" in `id` and `href` attributes.
NOTE_PATTERN = re.compile('note[0-9]+')


def retry_until(attempt, is_ok, max_attempts, on_failure=None):
    """Call `attempt()` until `is_ok(result)` is true and return that result.

    `on_failure(result)`, when given, is called after every unsuccessful
    attempt (including the last one).

    Raises:
        RuntimeError: if none of the `max_attempts` attempts succeeded.
    """
    for _ in range(max_attempts):
        result = attempt()
        if is_ok(result):
            return result
        if on_failure is not None:
            on_failure(result)
    raise RuntimeError(f'Talk failed loading {max_attempts} times\nAborting')


def merge_footnotes(fragments):
    """Join the text of consecutive fragments that belong to the same note.

    `fragments` is an iterable of `(note_id, text)` pairs in document order.
    Adjacent pairs sharing a `note_id` are combined with a single space;
    the result is the list of combined texts, one entry per run.
    """
    merged = []
    current_note = None
    for note_id, text in fragments:
        if merged and note_id == current_note:
            merged[-1] = ' '.join([merged[-1], text])
        else:
            merged.append(text)
        current_note = note_id
    return merged


def load_talk(subdir, year, month, speaker_name):
    """Download one talk page, retrying up to MAX_ITER times.

    An attempt counts as failed when the request raises a `requests`
    exception or the page title is 'Service Not Available'.

    Returns:
        `(talk_soup, talk_title)`; `talk_title` is None if the page has no
        usable `<title>`.

    Raises:
        RuntimeError: if every attempt failed.
    """
    def fetch():
        try:
            talk_page = requests.get(f'{home_page}{subdir}', timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # Report the error in place of the response and let the retry
            # logic decide whether to try again.
            return err, None, None
        talk_soup = BeautifulSoup(talk_page.content, 'html.parser')
        title_tag = talk_soup.title
        talk_title = title_tag.string if title_tag is not None else None
        return talk_page, talk_soup, talk_title

    def loaded(result):
        _, talk_soup, talk_title = result
        return talk_soup is not None and talk_title != 'Service Not Available'

    def warn_and_wait(result):
        print(f'WARNING: {month} {year} conference talk by {speaker_name} did not load--{result[0]}')
        # Random pause before the next attempt.
        time.sleep(randint(2, 5))

    _, talk_soup, talk_title = retry_until(fetch, loaded, MAX_ITER, warn_and_wait)
    return talk_soup, talk_title


def talk_text_with_references(talk_soup, year, month, speaker_name, talk_title):
    """Return the talk body text with footnote markers replaced by their content.

    The i-th element whose `href` matches NOTE_PATTERN is overwritten with the
    text of the i-th merged footnote; `talk_soup` is modified in place.
    Any failure is reported with the talk's identifying details and re-raised.
    """
    references = merge_footnotes(
        (NOTE_PATTERN.search(tag.get('id')).group(0), ' '.join(tag.stripped_strings))
        for tag in talk_soup.find_all(id=NOTE_PATTERN)
    )

    # Replace number in talk body text with formatted citation, be it scripture or other reference
    talk_refs = talk_soup.find_all(href=NOTE_PATTERN)
    for i, new_string in enumerate(references):
        try:
            talk_refs[i].string.replace_with(f' ({new_string}) ')
        except Exception:
            print('Exception: Problem replacing reference string')
            print(f'{month} {year} Conference')
            print(f'Speaker: {speaker_name}, Talk: {talk_title}')
            print(f'Talk total references: {len(references)}, Currently on: {i}')
            raise

    # Get talk body text after adding references
    try:
        return talk_soup.find(class_='body-block').get_text(separator=' ', strip=True)
    except Exception:
        print('Exception: Problem obtaining body text')
        print(f'{month} {year} Conference')
        print(f'Speaker: {speaker_name}, Talk: {talk_title}')
        raise


def scrape_conference(year, month):
    """Yield one `(year, month, speaker, title, text)` row per talk tile.

    Tiles whose link contains 'media' have no HTML page with the talk text;
    they are reported and yielded with `None` for title and text.
    """
    # Download conference html file for given date
    conf_page = requests.get(
        f'{home_page}/general-conference/{year}/{month}?lang=eng',
        timeout=REQUEST_TIMEOUT,
    )
    conf_soup = BeautifulSoup(conf_page.content, 'html.parser')

    # Information for each talk in given conference
    for link_contents in conf_soup.find_all(class_="lumen-tile__link"):
        try:
            speaker_name = link_contents.find('div', class_='lumen-tile__content').string
            subdir = link_contents.get('href')
            has_text_page = 'media' not in subdir
        except Exception:
            print('Exception: Problem getting speaker name')
            print(f'{month} {year} Conference')
            print(link_contents)
            raise

        # Skip talk if there is no html site with talk text (usually general women's meeting)
        if not has_text_page:
            print(f'Missing talk text: {speaker_name}, {month} {year}')
            yield (year, month, speaker_name, None, None)
            continue

        talk_soup, talk_title = load_talk(subdir, year, month, speaker_name)
        talk_text = talk_text_with_references(talk_soup, year, month, speaker_name, talk_title)
        yield (year, month, speaker_name, talk_title, talk_text)


# True when run as a notebook cell or as a script, so the scrape (and the
# `conf_data` / `conf_df` globals it creates) happens exactly as before, while
# the helpers above can be imported without triggering network traffic.
if __name__ == '__main__':
    conf_data = []

    for year in years:
        for month in months:
            # The October 2020 conference is explicitly excluded.
            if year == 2020 and month == '10':
                continue
            start_time_month = time.time()

            # `extend` consumes the generator row by row, so rows gathered
            # before an exception remain in `conf_data`.
            conf_data.extend(scrape_conference(year, month))

            # Compute and report data scraping time for each conference session
            time_min_month = round((time.time() - start_time_month) / 60, 2)
            print(f'Extracted {month} {year} conference talks in {time_min_month} minutes\n')

    conf_df = pd.DataFrame(conf_data, columns=var_names)
    conf_df.to_csv('conference.csv', index=False)

Missing talk text: Elder Joseph Anderson, 04 1971
Missing talk text: Elder Wilford G. Edling, 04 1971
Missing talk text: President N. Eldon Tanner, 04 1971
Extracted 04 1971 conference talks in 0.48 minutes

Missing talk text: President Harold B. Lee, 10 1971
Extracted 10 1971 conference talks in 0.55 minutes

Missing talk text: Elder Francis M. Gibbons, 04 1972
Missing talk text: Elder Wilford G. Edling, 04 1972
Missing talk text: President N. Eldon Tanner, 04 1972
Extracted 04 1972 conference talks in 0.6 minutes

Missing talk text: President N. Eldon Tanner, 10 1972
Extracted 10 1972 conference talks in 0.62 minutes

Missing talk text: Elder Francis M. Gibbons, 04 1973
Missing talk text: Wilford G. Edling, 04 1973
Missing talk text: President N. Eldon Tanner, 04 1973
Extracted 04 1973 conference talks in 0.53 minutes

Missing talk text: President N. Eldon Tanner, 10 1973
Extracted 10 1973 conference talks in 0.55 minutes

Missing talk text: Elder Francis M. Gibbons, 04 1974
Missing 

Extracted 04 2015 conference talks in 0.38 minutes

Missing talk text: The Church of Jesus Christ of Latter-day Saints, 10 2015
Missing talk text: The Church of Jesus Christ of Latter-day Saints, 10 2015
Extracted 10 2015 conference talks in 0.35 minutes

Missing talk text: Video Presentation, 04 2016
Missing talk text: Video Presentation, 04 2016
Extracted 04 2016 conference talks in 0.52 minutes

Extracted 10 2016 conference talks in 0.38 minutes

Extracted 04 2017 conference talks in 0.43 minutes

Extracted 10 2017 conference talks in 0.36 minutes

Extracted 04 2018 conference talks in 0.43 minutes

Extracted 10 2018 conference talks in 0.32 minutes

Extracted 04 2019 conference talks in 0.28 minutes

Extracted 10 2019 conference talks in 0.39 minutes

Extracted 04 2020 conference talks in 0.27 minutes

