In [31]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pdb
import time
from random import randint


In [47]:
# Scrape General Conference talks from churchofjesuschrist.org.
#
# For every (year, month) conference the index page is downloaded, each talk
# link on it is followed, and one (year, month, speaker, title, text) record is
# collected per talk. The records are finally written to
# `conference_{lang}.csv` and kept in `conf_df`.

# --- Configuration -----------------------------------------------------------
years = range(2015,2022)
months = ['04', '10']
home_page = 'https://www.churchofjesuschrist.org'
lang = 'rus'

var_names = ['year', 'month', 'speaker', 'title', 'text']
conf_data = []

MAX_ITER = 3          # attempts per talk page before giving up
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell


def _load_talk_soup(url, description):
    """Download and parse one talk page, retrying on 'Service Not Available'.

    Parameters
    ----------
    url : str
        Absolute URL of the talk page.
    description : str
        Human-readable label of the talk, used only in the warning message.

    Returns
    -------
    tuple
        ``(talk_soup, talk_title)`` where ``talk_soup`` is the parsed page and
        ``talk_title`` is the string of its ``<title>`` tag (``None`` when the
        page has no ``<title>`` tag).

    Raises
    ------
    RuntimeError
        If the page still reports 'Service Not Available' after ``MAX_ITER``
        attempts.
    """
    for attempt in range(1, MAX_ITER + 1):
        talk_page = requests.get(url, timeout=REQUEST_TIMEOUT)
        talk_soup = BeautifulSoup(talk_page.content, 'html.parser')
        title_tag = talk_soup.title
        talk_title = title_tag.string if title_tag is not None else None

        if talk_title != 'Service Not Available':
            return talk_soup, talk_title

        print(f'WARNING: {description} did not load--{talk_page}')
        # Back off for a random interval before retrying; no point sleeping
        # after the final attempt.
        if attempt < MAX_ITER:
            time.sleep(randint(2,5))

    # The original give-up check lived inside the retry loop where it could
    # never be reached; fail explicitly here instead of parsing the error page.
    print(f'Talk failed loading {MAX_ITER} times\nAborting')
    raise RuntimeError(f'Talk failed loading {MAX_ITER} times: {url}')


# --- Scraping loop -----------------------------------------------------------
for year in years:
    for month in months:
        start_time_month = time.time()

        # Download conference html file for given date
        conf_page = requests.get(
            f'{home_page}/study/general-conference/{year}/{month}?lang={lang}',
            timeout=REQUEST_TIMEOUT,
        )
        conf_soup = BeautifulSoup(conf_page.content, 'html.parser')

        # Information for each talk in given conference
        # NOTE(review): these hashed class names look build-generated; an empty
        # result presumably means the site markup changed -- confirm.
        links = conf_soup.find_all(class_="item-3cCP7")
        if not links:
            print(f'WARNING: no talk links found for {month} {year} conference '
                  f'(HTTP status {conf_page.status_code})')

        for link_contents in links:
            try:
                speaker_name = link_contents.find(class_='subtitle-MuO4X')
                # Entries without a speaker subtitle are not talks; skip them.
                if speaker_name is None:
                    continue
                speaker_name = speaker_name.string
                subdir = link_contents.get('href')

                # Skip talk if there is no html site with talk text (usually general women's meeting)
                if 'media' in subdir:
                    print(f'Missing talk text: {speaker_name}, {month} {year}')
                    conf_data.append((year, month, speaker_name, None, None))
                    continue
            except Exception:
                print('Exception: Problem getting speaker name')
                print(f'{month} {year} Conference')
                print(link_contents)
                raise

            # Fetch the talk page, refreshing up to MAX_ITER times if the
            # server answers with its 'Service Not Available' page.
            talk_soup, talk_title = _load_talk_soup(
                f'{home_page}{subdir}',
                f'{month} {year} conference talk by {speaker_name}',
            )

            # Disabled experiment, kept for reference: inline footnote contents
            # into the talk body in place of the reference markers.
            # Replace lettered references with reference content
#             references = talk_soup.find_all(id=re.compile('note[0-9]+'))

#             # Combine references that are from same note
#             new_references = []
#             note = ''
#             for ref in references:
#                 # If this is the beginning of a section of a new footnote, append as new footnote
#                 new_string = ' '.join(ref.stripped_strings)
#                 next_note = re.search('note[0-9]+', ref.get('id')).group(0)
#                 # If the next section is part of the same footnote, add to same footnote
#                 if note == next_note:
#                     new_references[-1] = ' '.join([new_references[-1], new_string])
#                 else:
#                     new_references.append(new_string)
#                 note = next_note

#             references = new_references

#             # Replace number in talk body text with formatted citation, be it scripture or other reference
#             talk_refs = talk_soup.find_all(href=re.compile('note[0-9]+'))
#             for i, new_string in enumerate(references):
#                 try:
#                     talk_refs[i].string.replace_with(f' ({new_string}) ')
#                 except:
#                     print('Exception: Problem replacing reference string')
#                     print(f'{month} {year} Conference')
#                     print(f'Speaker: {speaker_name}, Talk: {talk_title}')
#                     print(f'Talk total references: {len(references)}, Currently on: {i}')
#                     raise

            # Get talk body text (footnote markers are left as they appear on the page)
            try:
                talk_text = talk_soup.find(class_='body-block').get_text(separator=' ', strip=True)
            except Exception:
                print('Exception: Problem obtaining body text')
                print(f'{month} {year} Conference')
                print(f'Speaker: {speaker_name}, Talk: {talk_title}')
                raise
            conf_data.append((year, month, speaker_name, talk_title, talk_text))

        # Compute and report data scraping time for each conference session
        time_min_month = round( (time.time()-start_time_month)/60, 2)
        print(f'Extracted {month} {year} conference talks in {time_min_month} minutes\n')

# --- Persist results ---------------------------------------------------------
conf_df = pd.DataFrame(conf_data, columns=var_names)
conf_df.to_csv(f'conference_{lang}.csv', index=False)

Extracted 04 2015 conference talks in 0.29 minutes

Extracted 10 2015 conference talks in 0.24 minutes

Extracted 04 2016 conference talks in 0.31 minutes

Extracted 10 2016 conference talks in 0.34 minutes

Extracted 04 2017 conference talks in 0.2 minutes

Extracted 10 2017 conference talks in 0.19 minutes

Extracted 04 2018 conference talks in 0.24 minutes

Extracted 10 2018 conference talks in 0.24 minutes

Extracted 04 2019 conference talks in 2.22 minutes

Extracted 10 2019 conference talks in 0.33 minutes

Extracted 04 2020 conference talks in 0.52 minutes

Extracted 10 2020 conference talks in 0.31 minutes

Extracted 04 2021 conference talks in 0.34 minutes

Extracted 10 2021 conference talks in 0.01 minutes



In [36]:
# Display the collected talks table.
# NOTE(review): the recorded output for this cell has an extra 'Unnamed: 0'
# column and a lower execution count than the scraping cell, so it was
# presumably produced from a frame loaded elsewhere -- re-run on a fresh
# kernel to confirm.
conf_df

Unnamed: 0,year,month,speaker,title,text
0,2015,04,Шерил А. Эсплин,Наполним наши дома светом и истиной,"Мое сердце наполнил Дух, когда я слушала, как ..."
1,2015,04,Кэрол М. Стивенс,Божья семья,"Что может быть прекраснее и великолепнее, чем ..."
2,2015,04,Бонни Л. Оскарсон,Защитники Воззвания о семье,Как почетно и как радостно быть причастной к э...
3,2015,04,Президент Генри Б. Айринг,Утешитель,"Мои возлюбленные сестры, это огромная радость ..."
4,2015,04,Президент Генри Б. Айринг,"«Вот пост, который Я избрал»","Мои дорогие братья и сестры, я радуюсь возможн..."
...,...,...,...,...,...
472,2021,04,Тимоти Дж. Дайчес,Свет придерживается света,"Мои дорогие братья и сестры, я радуюсь вместе ..."
473,2021,04,Д. Тодд Кристоферсон,Почему мы следуем путем заветов,На протяжении всего своего служения Президент ...
474,2021,04,Алан Р. Уолкер,Людям свет Евангелия дан,Прекрасный гимн Святых последних дней «Голос Н...
475,2021,04,Дэвид A. Беднар,«Принципы Евангелия Моего»,В октябре 1849 года на Генеральной конференции...
