# Ramsey King
# DSC 680 - Applied Data Science
# April 9 - 2022

###  This Jupyter file will pull the talk information down from the websites speeches.byu.edu and www.churchofjesuschrist.org/study/general-conference/speakers

In [40]:
# Retrieve talks by speaker on speeches.byu.edu using BeautifulSoup

from bs4 import BeautifulSoup as bs
import requests
# import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [41]:
# Get list of urls for each of the topics
# For my reference, number of talks by speaker:  Nelson - 22, Maxwell - 30, Holland - 27, Hinckley - 41, Monson - 14, Oaks - 35, Eyring - 25 (Total - 194)
url = "https://speeches.byu.edu/speakers/"

url_list = [
    'russell-m-nelson/', 'neal-a-maxwell/', 'jeffrey-r-holland/', 'gordon-b-hinckley/', 'thomas-s-monson/',
    'dallin-h-oaks', 'henry-b-eyring'
]

links = []
new_links = []

# some of the links are repeated with '?M=V' and '?M=A' and those will be removed so we can have a unique set of links to pull talk information from.

for u in url_list:
    response = requests.get(url+u)
    html = response.content
    video = '?M=V'
    soup = bs(html, 'lxml')
    for link in soup.findAll("a", attrs={'href': re.compile("^https://speeches.byu.edu/talks/[\D+]")}):
        links.append(link.get('href').replace(video, ''))

    audio = '?M=A'
    for link in links:
        new_link = link.replace(audio, '')
        new_links.append(new_link)
    new_links = list(set(new_links))



In [42]:
len(new_links)

195

In [43]:
# We may have one duplicate, so something to be aware when we pull the talk data.

'''Retrieves talk text from each of the urls using beautiful soup.  Information such as the talk/sermon text, title, topic, date, and speaker will be retreived
and put into a dataframe.  A csv file will also be saved due to the time it takes to run or in case we need to work offline.'''
talk_text = []
title_text = []
topics = []
dates = []
speakers = []
speaker_position = []

for link in new_links:
    url = link
    response = requests.get(url)
    html = response.content
    soup = bs(html, 'lxml')
    temp_list = []

    link_splitter = link.split('/')
    speakers.append(link_splitter[4].replace('-', ' '))

    result = soup.find('p', class_='single-speech__speaker-subtext single-speech__speaker-position')
    speaker_position.append(result.text if result else 'Title Not Found')

    for div in soup.find_all('div', class_='single-speech__content'):
        talk_text.append(div.get_text(strip=True))

dict_speeches_byu = {'speaker': speakers,'talks': talk_text}
df = pd.DataFrame(dict_speeches_byu)

df.sample(15)

Unnamed: 0,speaker,talks
149,neal a maxwell,I welcome you to a Christian campus where disc...
153,jeffrey r holland,I really don’t want to apologize for being her...
79,neal a maxwell,I give my sincere appreciation to President Le...
26,gordon b hinckley,It is a wonderful thing we do in honoring the ...
183,henry b eyring,"I am grateful for the music, the prayer, and t..."
101,dallin h oaks,"My dear brothers and sisters—students, teacher..."
145,henry b eyring,"I am grateful for the prayer, for the music, f..."
74,gordon b hinckley,It is an honor and a rare privilege to speak t...
49,thomas s monson,"I drove here today in a snowstorm, and my thou..."
23,dallin h oaks,Sister Oaks and I are glad to be with you this...


In [44]:
# Now to pull information from the church general conference website.

# Get list of urls for each of the topics
gc_url = "https://www.churchofjesuschrist.org/study/general-conference/speakers/"

gc_url_list = [
    'russell-m-nelson/', 'neal-a-maxwell/', 'jeffrey-r-holland/', 'gordon-b-hinckley/', 'thomas-s-monson/',
    'dallin-h-oaks', 'henry-b-eyring'
]

links = []

for u in gc_url_list:
    response = requests.get(gc_url+u)
    html = response.content
    soup = bs(html, 'lxml')
    for link in soup.findAll("a", attrs={'href': re.compile("^/study/general-conference/\w+/(?:\w+/)(?:\w+)")}):
        links.append(link.get('href'))

In [45]:
talk_text = []
speakers = []

for link in links:
    url = 'https://www.churchofjesuschrist.org'+link
    response = requests.get(url)
    html = response.content
    soup = bs(html, 'lxml')

    sermon_name = soup.find_all('div', class_='byline')
    # This is basically a switch statement that will allow for the speaker column to have the same format as the dataframe speeches.byu.edu dataframe created above.
    for x in sermon_name:
        if 'nelson' in x.find('p').text.lower():
            speakers.append('russell m nelson')
        elif 'maxwell' in x.find('p').text.lower():
            speakers.append('neal a maxwell')
        elif 'holland' in x.find('p').text.lower():
            speakers.append('jeffrey r holland')
        elif 'hinckley' in x.find('p').text.lower():
            speakers.append('gordon b hinckley')
        elif 'monson' in x.find('p').text.lower():
            speakers.append('thomas s monson')
        elif 'oaks' in x.find('p').text.lower():
            speakers.append('dallin h oaks')
        else:
            speakers.append('henry b eyring')

    for div in soup.find_all('div', class_='body-block'):
        talk_text.append(div.get_text(separator= ' ', strip=True))

dict_gc = {'speaker': speakers,'talks': talk_text}
df_gc = pd.DataFrame(dict_gc)

df_gc.sample(15)

Unnamed: 0,speaker,talks
25,russell m nelson,"My dear brothers and sisters, we have looked f..."
414,dallin h oaks,"Brothers and sisters, I will now present the G..."
423,dallin h oaks,The restored gospel of Jesus Christ encourages...
4,russell m nelson,"My dear brothers and sisters, I am grateful fo..."
277,gordon b hinckley,"My dearly beloved brethren and sisters, what a..."
220,gordon b hinckley,"My brothers and sisters, as we gather in anoth..."
245,gordon b hinckley,Someone has said: “Be kind to the women. They ...
313,thomas s monson,"My beloved brothers and sisters, both here in ..."
217,gordon b hinckley,"My dear brethren of the priesthood, what an in..."
413,dallin h oaks,In a Saturday evening meeting at a stake confe...


In [46]:
df = df.append(df_gc)
df.to_csv('talk_corpus.csv')
df.sample(50)

Unnamed: 0,speaker,talks
83,thomas s monson,"When we have important decisions to make, the ..."
218,gordon b hinckley,"My beloved brethren and sisters, we have enjoy..."
494,dallin h oaks,Recently our family was viewing what was suppo...
490,dallin h oaks,In April 1830 the Lord commanded the members o...
220,gordon b hinckley,"My brothers and sisters, as we gather in anoth..."
344,thomas s monson,"My dear young sisters, the responsibility to a..."
232,gordon b hinckley,"My brothers and sisters, as we have been remin..."
119,neal a maxwell,"As I raised my arm this morning, I raise my vo..."
68,russell m nelson,Not long after my call to serve as one of the ...
442,dallin h oaks,“What think ye of Christ?” ( Matthew 22:42 ). ...
