In [21]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pdb

# Add line break function from https://gist.github.com/zmwangx/ad0830ba94b1fd98f428
def text_with_newlines(elem):
    """Flatten the descendants of ``elem`` into one string with line breaks.

    Iterates over ``elem.descendants`` in the order they are yielded:

    * string nodes contribute their content with leading/trailing
      whitespace stripped;
    * non-string nodes whose ``name`` is ``'br'`` or ``'p'`` contribute a
      single newline at the point where the node itself is visited;
    * every other node contributes nothing.

    Args:
        elem: object exposing a ``descendants`` iterable (e.g. a
            BeautifulSoup tag).

    Returns:
        The concatenated text as a ``str``.
    """
    line_break_tags = ('br', 'p')
    pieces = []
    for node in elem.descendants:
        if not isinstance(node, str):
            # Tag-like node: only line-breaking tags leave a trace.
            if node.name in line_break_tags:
                pieces.append('\n')
            continue
        pieces.append(node.strip())
    return ''.join(pieces)

In [None]:
# Scrape the talks linked from each conference index page and write them to
# 'conference.csv' with one row per talk (columns listed in `var_names`).
years = range(1971,1981)  # range() excludes the stop value: 1971 through 1980
months = ['04', '10']
home_page = 'https://www.churchofjesuschrist.org'
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever

var_names = ['year', 'month', 'speaker', 'title', 'text']
conf_data = []


def _fetch_soup(url):
    """Download `url` and return it parsed with 'html.parser', or None on failure.

    Connection errors, timeouts and 4xx/5xx responses are reported on stdout
    and turned into a None return so the caller can skip the page instead of
    aborting the whole scrape.
    """
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as err:
        print('Exception: Problem downloading page:', url)
        print(err)
        return None
    return BeautifulSoup(page.content, 'html.parser')


def _plain_str(node_string):
    """Return a plain `str` copy of a tag's `.string` value (None stays None).

    `.string` yields a NavigableString, which keeps a reference to its whole
    parse tree; storing it in `conf_data` would keep every downloaded page in
    memory for the lifetime of the list.
    """
    return None if node_string is None else str(node_string)


for year in years:
    for month in months:
        # No-op for the years configured above; kept so the cell still works
        # if `years` is extended (presumably that conference had no page yet
        # when this was written - TODO confirm).
        if year==2020 and month=='10': continue

        # Download conference html file for given date
        conf_soup = _fetch_soup(f'{home_page}/general-conference/{year}/{month}?lang=eng')
        if conf_soup is None:
            print(f'Skipping {month} {year} conference\n')
            continue

        # Information for each talk in given conference
        links = conf_soup.find_all(class_="lumen-tile__link")
        for link_contents in links:
            try:
                speaker_name = _plain_str(
                    link_contents.find('div', class_='lumen-tile__content').string
                )
                subdir = link_contents.get('href')

                # Skip talk if there is no html site with talk text (usually general women's meeting)
                if 'media' in subdir:
                    print('No text available:', f'{home_page}{subdir}')
                    continue
            except (AttributeError, TypeError):
                # AttributeError: the tile has no 'lumen-tile__content' div.
                # TypeError: the link has no href (subdir is None).
                print('Exception: Problem getting speaker name')
                print(f'{month} {year} Conference')
                print(link_contents)
                # Skip this link: falling through would request a stale or
                # undefined `subdir`.
                continue

            # Get talk text
            talk_soup = _fetch_soup(f'{home_page}{subdir}')
            if talk_soup is None:
                continue
            heading = talk_soup.h1
            talk_title = _plain_str(heading.string) if heading is not None else None

            # Replace lettered references with reference content
            references = talk_soup.find_all(id=re.compile('note[0-9]+'))

            # Combine references that are from same note: consecutive elements
            # whose id contains the same 'noteN' token are merged into one string.
            new_references = []
            note = ''
            for ref in references:
                new_string = ' '.join(ref.stripped_strings)
                next_note = re.search('note[0-9]+', ref.get('id')).group(0)
                if note == next_note:
                    # Same footnote as the previous element: extend it
                    new_references[-1] = ' '.join([new_references[-1], new_string])
                else:
                    # First element of a new footnote
                    new_references.append(new_string)
                note = next_note

            references = new_references

            # In-text markers are paired with footnotes purely by position.
            talk_refs = talk_soup.find_all(href=re.compile('note[0-9]+'))
            for i, new_string in enumerate(references):
                # Replace number in talk body text with formatted citation, be it scripture or other reference
                try:
                    talk_refs[i].string.replace_with(f' ({new_string}) ')
                except (IndexError, AttributeError):
                    # IndexError: fewer in-text markers than footnotes.
                    # AttributeError: the marker has no single string child
                    # (.string is None).
                    print('Exception: Problem replacing reference string')
                    print(f'{month} {year} Conference')
                    print(f'Speaker: {speaker_name}, Talk: {talk_title}')
                    print(f'Talk total references: {len(references)}, Currently on: {i}')

            # Get talk body text after adding references
            try:
                talk_text = talk_soup.find(class_='body-block').get_text(separator=' ', strip=True)
            except AttributeError:
                # No element with class 'body-block' on the page.
                print('Exception: Problem obtaining body text')
                print(f'{month} {year} Conference')
                print(f'Speaker: {speaker_name}, Talk: {talk_title}')
                # Skip the row: appending would reuse the previous talk's text.
                continue
            conf_data.append((year, month, speaker_name, talk_title, talk_text))
        print(f'Finished extracting {month} {year} conference talks\n')

conf_df = pd.DataFrame(conf_data, columns=var_names)
conf_df.to_csv('conference.csv', index=False)

No text available: https://www.churchofjesuschrist.org/general-conference/1971/04/media/1724239030001?lang=eng
No text available: https://www.churchofjesuschrist.org/general-conference/1971/04/media/1724239031001?lang=eng
No text available: https://www.churchofjesuschrist.org/general-conference/1971/04/media/1724239032001?lang=eng
Finished extracting 04 1971 conference talks

No text available: https://www.churchofjesuschrist.org/general-conference/1971/10/media/2338099779001?lang=eng
Finished extracting 10 1971 conference talks

No text available: https://www.churchofjesuschrist.org/general-conference/1972/04/media/1789534103001?lang=eng
No text available: https://www.churchofjesuschrist.org/general-conference/1972/04/media/1789534104001?lang=eng
No text available: https://www.churchofjesuschrist.org/general-conference/1972/04/media/1789534105001?lang=eng
Finished extracting 04 1972 conference talks

No text available: https://www.churchofjesuschrist.org/general-conference/1972/10/med