In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pdb
import time

# Add line break function from https://gist.github.com/zmwangx/ad0830ba94b1fd98f428
def text_with_newlines(elem):
    """Flatten ``elem`` into text, emitting a newline for every br/p node.

    Iterates over ``elem.descendants`` in order. Each string node contributes
    its text with leading/trailing whitespace stripped; each non-string node
    whose ``name`` is ``'br'`` or ``'p'`` contributes a single newline; every
    other node contributes nothing.

    Returns the concatenation of those pieces as one ``str``.
    """
    pieces = []
    for node in elem.descendants:
        if isinstance(node, str):
            pieces.append(node.strip())
        elif node.name in ('br', 'p'):
            pieces.append('\n')
    return ''.join(pieces)

In [None]:
# Scrape every talk of the requested conferences into `conf_data`, then write
# the result to conference.csv. One row per talk: (year, month, speaker, title, text).
years = range(1971, 1981)
months = ['04', '10']  # month path segments requested for every year
home_page = 'https://www.churchofjesuschrist.org'
# Seconds to wait on each HTTP request. Without a timeout `requests.get` can
# block indefinitely on an unresponsive server.
request_timeout = 30

var_names = ['year', 'month', 'speaker', 'title', 'text']
conf_data = []

for year in years:
    for month in months:
        start_time_month = time.time()
        # NOTE(review): presumably this conference page was not available when
        # the scraper was written -- confirm whether the guard is still needed.
        if year == 2020 and month == '10':
            continue

        # Download conference html file for given date
        conf_page = requests.get(
            f'{home_page}/general-conference/{year}/{month}?lang=eng',
            timeout=request_timeout,
        )
        conf_soup = BeautifulSoup(conf_page.content, 'html.parser')

        # Information for each talk in given conference
        links = conf_soup.find_all(class_="lumen-tile__link")
        for link_contents in links:
            try:
                speaker_name = link_contents.find('div', class_='lumen-tile__content').string
                subdir = link_contents.get('href')
                is_media_only = re.search('media', subdir) is not None
            except Exception as err:
                # Skip this tile: falling through would fetch a stale (or
                # undefined) `subdir` left over from an earlier iteration.
                print('Exception: Problem getting speaker name')
                print(f'{month} {year} Conference')
                print(link_contents)
                print(f'{type(err).__name__}: {err}')
                continue

            # Skip talk if there is no html site with talk text (usually general women's meeting)
            if is_media_only:
                print(f'Missing talk text: {speaker_name}, {month} {year}')
                continue

            # Get talk text
            talk_page = requests.get(f'{home_page}{subdir}', timeout=request_timeout)
            talk_soup = BeautifulSoup(talk_page.content, 'html.parser')
            talk_title = talk_soup.title.string

            # Collect footnote elements (ids containing "note<N>").
            references = talk_soup.find_all(id=re.compile('note[0-9]+'))

            # Combine consecutive elements that share the same "note<N>" prefix
            # into a single footnote string.
            new_references = []
            note = ''
            for ref in references:
                new_string = ' '.join(ref.stripped_strings)
                next_note = re.search('note[0-9]+', ref.get('id')).group(0)
                if note == next_note:
                    # Same footnote as the previous element: extend it
                    new_references[-1] = ' '.join([new_references[-1], new_string])
                else:
                    # First element of a new footnote
                    new_references.append(new_string)
                note = next_note

            references = new_references

            # Replace each footnote marker in the talk with the footnote content,
            # pairing the i-th marker link with the i-th merged footnote.
            talk_refs = talk_soup.find_all(href=re.compile('note[0-9]+'))
            for i, new_string in enumerate(references):
                try:
                    talk_refs[i].string.replace_with(f' ({new_string}) ')
                except Exception as err:
                    # Fewer marker links than footnotes (IndexError) or a marker
                    # without a single string child (AttributeError): report it
                    # and carry on with the remaining footnotes.
                    print('Exception: Problem replacing reference string')
                    print(f'{month} {year} Conference')
                    print(f'Speaker: {speaker_name}, Talk: {talk_title}')
                    print(f'Talk total references: {len(references)}, Currently on: {i}')
                    print(f'{type(err).__name__}: {err}')

            # Get talk body text after adding references
            try:
                talk_text = talk_soup.find(class_='body-block').get_text(separator=' ', strip=True)
            except Exception as err:
                # Skip this talk: appending would otherwise store the previous
                # talk's `talk_text` under this speaker and title.
                print('Exception: Problem obtaining body text')
                print(f'{month} {year} Conference')
                print(f'Speaker: {speaker_name}, Talk: {talk_title}')
                print(f'{type(err).__name__}: {err}')
                continue
            conf_data.append((year, month, speaker_name, talk_title, talk_text))
        time_min_month = round((time.time() - start_time_month) / 60, 2)
        print(f'Extracted {month} {year} conference talks in {time_min_month} minutes\n')

conf_df = pd.DataFrame(conf_data, columns=var_names)
conf_df.to_csv('conference.csv', index=False)

Extracted 04 1970 conference talks in 0.01 minutes

Extracted 10 1970 conference talks in 0.06 minutes

Missing talk text: Elder Joseph Anderson, 04 1971
Missing talk text: Elder Wilford G. Edling, 04 1971
Missing talk text: President N. Eldon Tanner, 04 1971
Extracted 04 1971 conference talks in 0.76 minutes

Missing talk text: President Harold B. Lee, 10 1971


In [15]:
# Print every speaker name, pausing in the debugger after each 50th name so
# the output can be read one page at a time (enter `c` at the prompt to resume).
for i, name in enumerate(conf_df.speaker):
    print(name)
    page_is_full = (i + 1) % 50 == 0
    if not page_is_full:
        continue
    pdb.set_trace()

Spencer W. Kimball
Marvin J. Ashton
Neal A. Maxwell
Bruce R. McConkie
N. Eldon Tanner
Francis M. Gibbons
Wilford G. Edling
N. Eldon Tanner
Thomas S. Monson
David B. Haight
James A. Cullimore
George P. Lee
Eldred G. Smith
Ronald E. Poelman
Ezra Taft Benson
Howard W. Hunter
Robert L. Simpson
Henry D. Taylor
Marion G. Romney
N. Eldon Tanner
Spencer W. Kimball
Marion G. Romney
L. Tom Perry
John H. Vandenberg
O. Leslie Stone
Gordon B. Hinckley
Mark E. Petersen
Gene R. Cook
Sterling W. Sill
Joseph Anderson
Derek A. Cuthbert
Robert L. Backman
Rex C. Reeve
LeGrand Richards
Spencer W. Kimball
Spencer W. Kimball
J. Richard Clarke
Barbara B. Smith
A. Theodore Tuttle
Victor L. Brown
Boyd K. Packer
N. Eldon Tanner
Marion G. Romney
Spencer W. Kimball
Boyd K. Packer
Rex D. Pinegar
Howard W. Hunter
Marion G. Romney
N. Eldon Tanner
Gordon B. Hinckley
> <ipython-input-15-f249905d21d4>(1)<module>()
-> for i, name in enumerate(conf_df.speaker):
(Pdb) c
James E. Faust
F. Burton Howard
Ted E. Brewerton
Jack